1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3  * SPDX-License-Identifier: MIT
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21  * DEALINGS IN THE SOFTWARE.
22  */
23 
24 
25 
26 #include "nvidia-push-init.h"
27 #include "nvidia-push-utils.h"
28 #include "nvidia-push-priv.h"
29 #include "nvidia-push-priv-imports.h"
30 
31 #include "nvos.h"
32 
33 #include "nv_assert.h"
34 
35 #include "alloc/alloc_channel.h"
36 #include "class/cl0002.h" // NV01_CONTEXT_DMA
37 #include "class/cl003e.h" // NV01_MEMORY_SYSTEM
38 #include "class/cl0040.h" // NV01_MEMORY_LOCAL_USER
39 
40 #include "class/cla16f.h" // KEPLER_CHANNEL_GPFIFO_B
41 #include "class/cla26f.h" // KEPLER_CHANNEL_GPFIFO_C
42 #include "class/clb06f.h" // MAXWELL_CHANNEL_GPFIFO_A
43 #include "class/clc06f.h" // PASCAL_CHANNEL_GPFIFO_A
44 #include "class/clc36f.h" // VOLTA_CHANNEL_GPFIFO_A
45 #include "class/clc46f.h" // TURING_CHANNEL_GPFIFO_A
46 #include "class/cl50a0.h" // NV50_MEMORY_VIRTUAL
47 #include "class/clc56f.h" // AMPERE_CHANNEL_GPFIFO_A
48 #include "class/clc86f.h" // HOPPER_CHANNEL_GPFIFO_A
49 #include "class/clc361.h" // VOLTA_USERMODE_A
50 #include "class/clc661.h" // HOPPER_USERMODE_A
51 
52 #include "ctrl/ctrl0080/ctrl0080fifo.h" // NV0080_CTRL_CMD_FIFO_GET_CAPS_V2
53 #include "ctrl/ctrl2080/ctrl2080bus.h" // NV2080_CTRL_CMD_BUS_GET_INFO
54 #include "ctrl/ctrla06f.h" // KEPLER_CHANNEL_GPFIFO_A
55 #include "ctrl/ctrlc36f.h" // VOLTA_CHANNEL_GPFIFO_A
56 
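/*
 * RM object handles are drawn from a small, caller-provided pool (the
 * handlePool array in the channel/device allocation params);
 * *pUsedHandleBitmask tracks which pool entries have already been handed out.
 */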
57 static NvU32 GetHandle(
58     const NvU32 *pHandlePool,
59     NvU8 handlePoolSize,
60     NvU64 *pUsedHandleBitmask)
61 {
62     NvU8 i;
63     const NvU64 usedHandleBitmask = *pUsedHandleBitmask;
64 
65     /*
66      * We assume there are fewer than 64 handles in the pool. If the
67      * pool is larger than that, we'll need a fancier bitmask.
68      */
69     nvAssert(handlePoolSize < (sizeof(NvU64) * 8));
70 
71     for (i = 0; i < handlePoolSize; i++) {
72         if ((usedHandleBitmask & NVBIT64(i)) == 0) {
73             *pUsedHandleBitmask |= NVBIT64(i);
74             return pHandlePool[i];
75         }
76     }
77 
78     nvAssert(!"Exhausted handlePool!");
79 
80     return 0;
81 }
82 
83 static NvU32 GetChannelHandle(
84     const NvPushAllocChannelParams *pParams,
85     NvU64 *pUsedHandleBitmask)
86 {
87     return GetHandle(pParams->handlePool,
88                      ARRAY_LEN(pParams->handlePool),
89                      pUsedHandleBitmask);
90 }
91 
92 static NvU32 GetDeviceHandle(
93     const NvPushAllocDeviceParams *pParams,
94     NvU64 *pUsedHandleBitmask)
95 {
96     return GetHandle(pParams->handlePool,
97                      ARRAY_LEN(pParams->handlePool),
98                      pUsedHandleBitmask);
99 }
100 
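/*
 * Tear down the progress semaphore: unmap the shared GPU virtual address
 * (if mapped) and free each subdevice's backing allocation.  Safe to call
 * on a partially constructed channel.
 */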
101 static void FreeSemaSurface(NvPushChannelPtr p)
102 {
103     NvPushDevicePtr pDevice = p->pDevice;
104     NvU32 *handle = p->progressSemaphore.handle;
105     void **ptr = p->progressSemaphore.ptr;
106     NvU32 status;
107     int sd;
108 
109     if (p->progressSemaphore.gpuVA) {
110         for (sd = pDevice->numSubDevices - 1; sd >= 0; sd--) {
111             const int deviceIndex = __nvPushGetDeviceIndex(pDevice, sd);
112             status = nvPushImportRmApiUnmapMemoryDma(
113                          pDevice,
114                          pDevice->subDevice[sd].handle,
115                          pDevice->subDevice[deviceIndex].gpuVASpaceCtxDma,
116                          handle[sd],
117                          0,
118                          p->progressSemaphore.gpuVA);
119             if (status != NVOS_STATUS_SUCCESS) {
120                 nvAssert(!"Failed to unmap progressSemaphore");
121             }
122         }
123         p->progressSemaphore.gpuVA = 0;
124     }
125 
126     for (sd = pDevice->numSubDevices - 1; sd >= 0; sd--) {
127         const int deviceIndex = __nvPushGetDeviceIndex(pDevice, sd);
128         if (!handle[sd]) {
129             continue;
130         }
131         status = nvPushImportRmApiFree(
132                      pDevice,
133                      pDevice->subDevice[deviceIndex].deviceHandle,
134                      handle[sd]);
135         if (status != NVOS_STATUS_SUCCESS) {
136             nvAssert(!"Failed to free progressSemaphore");
137         }
138         handle[sd] = 0;
139 
140         /* Freeing this memory automatically unmaps it. */
141         ptr[sd] = NULL;
142     }
143 }
144 
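/*
 * Allocate one page of system memory per subdevice for the progress
 * tracking semaphore, and map every surface at the same GPU virtual
 * address on all subdevices.
 */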
145 static NvBool AllocSemaSurface(
146     NvPushChannelPtr p,
147     const NvPushAllocChannelParams *pParams,
148     NvBool coherent,
149     NvU64 *pUsedHandleBitmask)
150 {
151     NvPushDevicePtr pDevice = p->pDevice;
152     NvU32 *handle = p->progressSemaphore.handle;
153     void **ptr = p->progressSemaphore.ptr;
154     NvU32 status;
155     const NvU64 size = 4096;
156     unsigned int sd;
157 
158     /* 1. Allocate sysmem surface(s) to back the semaphore, get CPU mapping */
159     for (sd = 0; sd < pDevice->numSubDevices; sd++) {
160         const int deviceIndex = __nvPushGetDeviceIndex(pDevice, sd);
161         NvU64 limit = size - 1;
162         const NvU32 flags = DRF_DEF(OS02, _FLAGS, _PHYSICALITY, _NONCONTIGUOUS) |
163                       (coherent ? DRF_DEF(OS02, _FLAGS, _COHERENCY, _CACHED) :
164                                   DRF_DEF(OS02, _FLAGS, _COHERENCY, _UNCACHED));
165 
166         handle[sd] = GetChannelHandle(pParams, pUsedHandleBitmask);
167 
168         status = nvPushImportRmApiAllocMemory64(pDevice,
169                                                 pDevice->subDevice[deviceIndex].deviceHandle,
170                                                 handle[sd],
171                                                 NV01_MEMORY_SYSTEM,
172                                                 flags,
173                                                 &ptr[sd],
174                                                 &limit);
175 
176         if (status != NVOS_STATUS_SUCCESS) {
177             handle[sd] = 0;
178             nvAssert(!"Failed to allocate FIFO semaphore surface");
179             goto fail;
180         }
181     }
182 
183     /* 2. Map the surface(s) into the GPU(s) */
184     for (sd = 0; sd < pDevice->numSubDevices; sd++) {
185         NvU32 flags = DRF_DEF(OS46, _FLAGS, _ACCESS, _READ_WRITE) |
186                       DRF_DEF(OS46, _FLAGS, _PAGE_SIZE, _4KB) |
187                       (coherent ? DRF_DEF(OS46, _FLAGS, _CACHE_SNOOP, _ENABLE) :
188                                   DRF_DEF(OS46, _FLAGS, _CACHE_SNOOP, _DISABLE));
189         const int deviceIndex = __nvPushGetDeviceIndex(pDevice, sd);
190 
191         /*
192          * Note that this mapping is somewhat special because we use a
193          * different surface for each subdevice, but want to map at the same
194          * virtual address on all subdevices.
195          */
196         if (sd == 0) {
197             /*
198              * Create a new virtual mapping.
199              *
200              * The MapMemoryDma call will assign to
201              * 'p->progressSemaphore.gpuVA'.
202              *
203              * In !clientSli, this creates a broadcast mapping that we override
204              * with the _DMA_UNICAST_REUSE_ALLOC flag below.
205              * In clientSli, each mapping is already unicast.
206              *
207              * In both cases, the DMA_OFFSET_FIXED flag ensures the VA matches
208              * between all subdevices.
209              */
210             p->progressSemaphore.gpuVA = 0;
211             flags = FLD_SET_DRF(OS46, _FLAGS, _DMA_OFFSET_FIXED, _FALSE, flags);
212         } else {
213             /*
214              * The MapMemoryDma call will read from
215              * 'p->progressSemaphore.gpuVA'.
216              */
217             nvAssert(p->progressSemaphore.gpuVA != 0);
218             if (!pDevice->clientSli) {
219                 flags = FLD_SET_DRF(OS46, _FLAGS, _DMA_UNICAST_REUSE_ALLOC, _TRUE, flags);
220             }
221             flags = FLD_SET_DRF(OS46, _FLAGS, _DMA_OFFSET_FIXED, _TRUE, flags);
222         }
223 
224         status = nvPushImportRmApiMapMemoryDma(pDevice,
225                                                pDevice->subDevice[sd].handle,
226                                                pDevice->subDevice[deviceIndex].gpuVASpaceCtxDma,
227                                                handle[sd],
228                                                0,
229                                                size,
230                                                flags,
231                                                &p->progressSemaphore.gpuVA);
232         if (status != NVOS_STATUS_SUCCESS) {
233             nvAssert(!"Failed to map FIFO semaphore surface");
234             goto fail;
235         }
236     }
237 
238     return TRUE;
239 fail:
240     FreeSemaSurface(p);
241     return FALSE;
242 }
243 
244 /*
245  * The size of the "progress tracker" portion of the pushbuffer.
246  *
247  * We use one set of progress tracker methods for every two GPFIFO entries (one
248  * GPFIFO entry is for the main pushbuffer, the other is for the progress
249  * tracker methods).
250  */
251 static inline NvU32 ProgressTrackerBufferSize(NvPushChannelPtr buffer)
252 {
253     return __nvPushProgressTrackerEntrySize(buffer->pDevice) *
254         (buffer->numGpFifoEntries / 2);
255 }
256 
257 /*
258  * The size of the pushbuffer allocation, including all segments and GPFIFO
259  * entries.
260  */
261 static inline NvU32 CalculateGPBufferSize(NvPushChannelPtr buffer)
262 {
263     return __nvPushProgressTrackerOffset(buffer) +
264         ProgressTrackerBufferSize(buffer);
265 }
266 
267 /*!
268  * Set up an NvPushChannelSegmentRec's initial state based on the provided data.
269  *
270  * \param segment   Pointer to segment structure to initialize
271  * \param ptr       CPU mapping to the base of the segment.
272  * \param gpuOffset GPU mapping of the base of the segment.
273  * \param size      Size of the segment, in bytes.
274  */
275 static void InitDmaSegment(NvPushChannelSegmentPtr segment,
276                            void *ptr,
277                            NvU64 gpuOffset,
278                            NvU32 size)
279 {
280     segment->base            = (NvPushChannelUnion *)ptr;
281     segment->buffer          = (NvPushChannelUnion *)ptr;
282     segment->sizeInBytes     = size;
283     segment->freeDwords      = size >> 2;
284     segment->gpuMapOffset    = gpuOffset;
285     segment->putOffset       = 0;
286 }
287 
288 /*!
289  * Set up the work submit token.  RM will write this into the "error context
290  * DMA" at the offset we request.
291  */
292 static NvBool RequestChidToken(NvPushChannelPtr p)
293 {
294     NvPushDevicePtr pDevice = p->pDevice;
295     int deviceIndex;
296 
297     for (deviceIndex = 0;
298          deviceIndex < __nvPushGetNumDevices(pDevice);
299          deviceIndex++) {
300 
301         NVC36F_CTRL_GPFIFO_SET_WORK_SUBMIT_TOKEN_NOTIF_INDEX_PARAMS notifParams = { 0 };
302         NVC36F_CTRL_CMD_GPFIFO_GET_WORK_SUBMIT_TOKEN_PARAMS tokenParams = { 0 };
303         NvU32 status;
304 
305         notifParams.index = NV_CHANNELGPFIFO_NOTIFICATION_TYPE__SIZE_1 + deviceIndex;
306 
307         status = nvPushImportRmApiControl(pDevice,
308                          p->channelHandle[deviceIndex],
309                          NVC36F_CTRL_CMD_GPFIFO_SET_WORK_SUBMIT_TOKEN_NOTIF_INDEX,
310                          &notifParams,
311                          sizeof(notifParams));
312         if (status != NVOS_STATUS_SUCCESS) {
313             return FALSE;
314         }
315 
316         /*
317          * Request the channel's "work submit token".  The token itself is not
318          * used here, but RM requires this call to be made after the channel
319          * has been allocated.
320          */
321         status = nvPushImportRmApiControl(pDevice,
322                          p->channelHandle[deviceIndex],
323                          NVC36F_CTRL_CMD_GPFIFO_GET_WORK_SUBMIT_TOKEN,
324                          &tokenParams,
325                          sizeof(tokenParams));
326         if (status != NVOS_STATUS_SUCCESS) {
327             return FALSE;
328         }
329     }
330     return TRUE;
331 }
332 
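/*
 * Bind the channel to the requested engine and enable scheduling so HOST
 * starts fetching GPFIFO entries for it.
 */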
333 static NvBool BindAndScheduleChannel(NvPushDevicePtr pDevice,
334                                      NvU32 channelHandle,
335                                      NvU32 engineType)
336 {
337     NVA06F_CTRL_BIND_PARAMS bindParams = { 0 };
338     NVA06F_CTRL_GPFIFO_SCHEDULE_PARAMS scheduleParams = { 0 };
339     NvU32 ret;
340 
341     bindParams.engineType = engineType;
342     ret = nvPushImportRmApiControl(pDevice,
343                                    channelHandle,
344                                    NVA06F_CTRL_CMD_BIND,
345                                    &bindParams,
346                                    sizeof(bindParams));
347 
348     if (ret != NVOS_STATUS_SUCCESS) {
349         nvPushImportLogError(pDevice, "Failed to bind the channel");
350         return FALSE;
351     }
352 
353     scheduleParams.bEnable = NV_TRUE;
354     ret = nvPushImportRmApiControl(pDevice,
355                                    channelHandle,
356                                    NVA06F_CTRL_CMD_GPFIFO_SCHEDULE,
357                                    &scheduleParams,
358                                    sizeof(scheduleParams));
359 
360     if (ret != NVOS_STATUS_SUCCESS) {
361         nvPushImportLogError(pDevice,
362                              "Failed to schedule the channel");
363         return FALSE;
364     }
365 
366     return TRUE;
367 }
368 
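/*
 * Allocate the CHANNEL_GPFIFO object on each device, pointing it at the
 * GPFIFO portion of the pushbuffer and at the error notifier ctxdma, bind
 * and schedule it, and map USERD (the channel control area) on each
 * subdevice.
 */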
369 static NvBool AllocChannelObject(
370     NvPushChannelPtr buffer,
371     const NvPushAllocChannelParams *pParams,
372     NvU64 *pUsedHandleBitmask,
373     NvU64 gpuAddress)
374 {
375     NvPushDevicePtr pDevice = buffer->pDevice;
376     NV_CHANNEL_ALLOC_PARAMS params = { 0 };
377     unsigned int sd;
378     NvU32 userdMapHandle[NV_MAX_SUBDEVICES];
379     NvU32 ret;
380     const NvU64 gpFifoOffset = gpuAddress + __nvPushGpFifoOffset(buffer);
381     int deviceIndex;
382 
383     for (deviceIndex = 0;
384          deviceIndex < __nvPushGetNumDevices(pDevice);
385          deviceIndex++) {
386         buffer->channelHandle[deviceIndex] = GetChannelHandle(pParams, pUsedHandleBitmask);
387         nvAssert(buffer->notifiers.errorCtxDma != 0);
388 
389         /* Open the DMA channel by allocating the CHANNEL_GPFIFO object */
390         params.hObjectError  = buffer->notifiers.errorCtxDma;
391         if (pDevice->subDevice[deviceIndex].gpuVASpaceObject != 0) {
392             params.hVASpace = pDevice->subDevice[deviceIndex].gpuVASpaceObject;
393         } else {
394             params.hObjectBuffer = pDevice->subDevice[deviceIndex].gpuVASpaceCtxDma;
395         }
396         // Offset is relative to the ctx dma
397         params.gpFifoOffset  = gpFifoOffset;
398 
399         if (pDevice->hal.caps.allocateDoubleSizeGpFifo) {
400             // On Tegra, we have to allocate twice the GPFIFO size. This is because
401             // the kernel will add its own entries (max 2) for the kickoff for the
402             // pre-sync and post-sync fences. This means the max kickoff size is not
403             // actually buffer->numGpFifoEntries - 1; it is most likely
404             // buffer->numGpFifoEntries - 3.
405             //
406             // TODO: Tell the users the actual max kickoff size to avoid this
407             // WAR. NvRmTegraChannelGetMaxKickoffGpfifoCount() retrieves this piece
408             // of info on Tegra. Bug 2404063.
409             params.gpFifoEntries = buffer->numGpFifoEntries * 2;
410         } else {
411             params.gpFifoEntries = buffer->numGpFifoEntries;
412         }
413 
414         params.flags         = 0;
415         if (pParams->secureChannel) {
416             params.flags |= DRF_DEF(OS04, _FLAGS, _CC_SECURE, _TRUE);
417         }
418         if (pParams->difrPrefetch) {
419             params.flags |= DRF_DEF(OS04,
420                                     _FLAGS,
421                                     _SET_EVICT_LAST_CE_PREFETCH_CHANNEL,
422                                     _TRUE);
423         }
424 
425         if (pDevice->hal.caps.clientAllocatesUserD) {
426             if (pDevice->clientSli) {
427                 params.hUserdMemory[0] = buffer->userD[deviceIndex].hMemory;
428                 params.userdOffset[0]  = 0;
429             } else {
430                 for (sd = 0; sd < pDevice->numSubDevices; sd++) {
431                     params.hUserdMemory[sd] = buffer->userD[0].hMemory;
432                     params.userdOffset[sd]  = 0;
433                 }
434             }
435             userdMapHandle[deviceIndex] = buffer->userD[deviceIndex].hMemory;
436         } else {
437             userdMapHandle[deviceIndex] = buffer->channelHandle[deviceIndex];
438         }
439         params.engineType = pParams->engineType;
440         if (pDevice->clientSli) {
441             params.subDeviceId = (1 << deviceIndex);
442         }
443 
444         if ((ret = nvPushImportRmApiAlloc(pDevice,
445                                           pDevice->subDevice[deviceIndex].deviceHandle,
446                                           buffer->channelHandle[deviceIndex],
447                                           pDevice->gpfifoClass,
448                                           &params)) != NVOS_STATUS_SUCCESS)
449         {
450             nvPushImportLogError(pDevice,
451                                  "Push buffer object allocation failed: 0x%x (%s)",
452                                  ret, nvstatusToString(ret));
453             buffer->channelHandle[deviceIndex] = 0;
454             return FALSE;
455         }
456 
457         if (!BindAndScheduleChannel(pDevice,
458                                     buffer->channelHandle[deviceIndex],
459                                     pParams->engineType)) {
460             return FALSE;
461         }
462     }
463 
464     for (sd = 0; sd < pDevice->numSubDevices; sd++) {
465         void *pUserD;
466 
467         deviceIndex = __nvPushGetDeviceIndex(pDevice, sd);
468 
469         // Map the DMA controls for each subdevice.
470         ret = nvPushImportRmApiMapMemory(pDevice,
471                                          pDevice->subDevice[sd].handle,
472                                          userdMapHandle[deviceIndex],
473                                          0,
474                                          pDevice->userDSize,
475                                          &pUserD,
476                                          0);
477         if (ret != NVOS_STATUS_SUCCESS) {
478             nvPushImportLogError(pDevice,
479                                  "Push buffer mapping failed: 0x%x (%s)",
480                                  ret, nvstatusToString(ret));
481             return FALSE;
482         }
483 
484         buffer->control[sd] = pUserD;
485     }
486 
487     return TRUE;
488 }
489 
490 /*
491  * It might be nice to suballocate these rather
492  * than create a separate RM allocation for each channel.
493  */
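/*
 * On GPUs where the client allocates USERD (hal.caps.clientAllocatesUserD),
 * allocate a small, naturally aligned video memory buffer per device to
 * hold the channel's USERD control area.
 */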
494 static NvBool nvDmaAllocUserD(
495     NvPushChannelPtr p,
496     const NvPushAllocChannelParams *pParams,
497     NvU64 *pUsedHandleBitmask)
498 {
499     NvPushDevicePtr pDevice = p->pDevice;
500     int deviceIndex;
501 
502     if (!pDevice->hal.caps.clientAllocatesUserD) {
503         return TRUE;
504     }
505 
506     for (deviceIndex = 0;
507          deviceIndex < __nvPushGetNumDevices(pDevice);
508          deviceIndex++) {
509         NV_MEMORY_ALLOCATION_PARAMS memAllocParams = { 0 };
510         const NvU32 attr =
511             DRF_DEF(OS32, _ATTR, _LOCATION, _VIDMEM) |
512             DRF_DEF(OS32, _ATTR, _PAGE_SIZE, _4KB) |
513             DRF_DEF(OS32, _ATTR, _COHERENCY, _UNCACHED);
514         const NvU32 flags =
515             NVOS32_ALLOC_FLAGS_ALIGNMENT_FORCE |
516             NVOS32_ALLOC_FLAGS_PERSISTENT_VIDMEM;
517         NvU32 ret;
518 
519         NvU32 hMemory = GetChannelHandle(pParams, pUsedHandleBitmask);
520 
521         memAllocParams.owner = pDevice->clientHandle;
522         memAllocParams.type = NVOS32_TYPE_DMA;
523         memAllocParams.size = pDevice->userDSize;
524         memAllocParams.attr = attr;
525         memAllocParams.flags = flags;
526         memAllocParams.alignment = pDevice->userDSize;
527 
528         ret = nvPushImportRmApiAlloc(pDevice,
529                                      pDevice->subDevice[deviceIndex].deviceHandle,
530                                      hMemory,
531                                      NV01_MEMORY_LOCAL_USER,
532                                      &memAllocParams);
533         if (ret != NV_OK) {
534             return FALSE;
535         }
536 
537         p->userD[deviceIndex].hMemory = hMemory;
538     }
539 
540     return TRUE;
541 }
542 
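/*
 * Walk a table of NvPushSupportedClass-based entries, classTableStride
 * bytes apart, and return the index of the first entry whose class the
 * device supports (or, on AModel, whose amodelConfig matches).  Returns -1
 * if no entry is supported.
 */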
543 int nvPushGetSupportedClassIndex(
544     NvPushDevicePtr pDevice,
545     const void *pClassTable,
546     size_t classTableStride,
547     size_t classTableLength)
548 {
549     unsigned int i, j;
550 
551     for (i = 0; i < classTableLength; i++) {
552 
553         const NvU8 *bytes = (const NvU8 *)pClassTable;
554         const size_t byteOffset = i * classTableStride;
555         const NvPushSupportedClass *pClass =
556             (const NvPushSupportedClass *) (bytes + byteOffset);
557 
558         if (nvPushIsAModel(pDevice)) {
559             if (pDevice->amodelConfig == pClass->amodelConfig) {
560                 return i;
561             }
562             continue;
563         }
564 
565         for (j = 0; j < pDevice->numClasses; j++) {
566             if (pClass->classNumber == pDevice->supportedClasses[j]) {
567                 return i;
568             }
569         }
570     }
571     return -1;
572 }
573 
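/*
 * Choose the GPFIFO channel class and record the corresponding USERD size.
 * The table below is ordered newest to oldest, so the newest class
 * supported by the device is selected.
 */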
574 static NvBool GetChannelClassAndUserDSize(
575     NvPushDevicePtr pDevice,
576     const NvPushAllocDeviceParams *pParams)
577 {
578     const struct {
579         NvPushSupportedClass base;
580         size_t gpFifoSize;
581     } gpFifoDmaClasses[] = {
582     {
583         { HOPPER_CHANNEL_GPFIFO_A,
584           NV_AMODEL_HOPPER },
585         sizeof(HopperAControlGPFifo)
586     },
587     {
588         { AMPERE_CHANNEL_GPFIFO_A,
589           NV_AMODEL_ADA },
590         sizeof(AmpereAControlGPFifo)
591     },
592     {
593         { AMPERE_CHANNEL_GPFIFO_A,
594           NV_AMODEL_AMPERE },
595         sizeof(AmpereAControlGPFifo)
596     },
597     {
598         { TURING_CHANNEL_GPFIFO_A,
599           NV_AMODEL_TURING },
600         sizeof(TuringAControlGPFifo)
601     },
602     {
603         { VOLTA_CHANNEL_GPFIFO_A,
604           NV_AMODEL_VOLTA },
605         sizeof(VoltaAControlGPFifo)
606     },
607     {
608         { PASCAL_CHANNEL_GPFIFO_A,
609           NV_AMODEL_PASCAL },
610         sizeof(PascalAControlGPFifo)
611     },
612     {
613         { MAXWELL_CHANNEL_GPFIFO_A,
614           NV_AMODEL_MAXWELL },
615         sizeof(MaxwellAControlGPFifo)
616     },
617     {
618         { KEPLER_CHANNEL_GPFIFO_C,
619           NV_AMODEL_KEPLER_SM35 },
620         sizeof(KeplerCControlGPFifo)
621     },
622     {
623         { KEPLER_CHANNEL_GPFIFO_B,
624           NV_AMODEL_KEPLER },
625         sizeof(KeplerBControlGPFifo)
626     },
627 
628     };
629 
630     int i;
631 
632     i = nvPushGetSupportedClassIndex(pDevice, gpFifoDmaClasses,
633                                      sizeof(gpFifoDmaClasses[0]),
634                                      ARRAY_LEN(gpFifoDmaClasses));
635     if (i == -1) {
636         return FALSE;
637     }
638 
639     pDevice->gpfifoClass = gpFifoDmaClasses[i].base.classNumber;
640     pDevice->userDSize = gpFifoDmaClasses[i].gpFifoSize;
641     return TRUE;
642 }
643 
644 /*
645  * Query GPU<->CPU coherency.  In particular, *pCoherent is set to TRUE when
646  * the GPU is capable of accessing CPU-cached system memory coherently with
647  * respect to CPU accesses.
648  *
649  * For surfaces with CPU read/write or CPU read-mostly such as notifiers:
650  * If *pCoherent is TRUE:
651  * - create CPU mappings with COHERENCY_WRITE_BACK
652  * - create GPU mappings with CACHE_SNOOP_ENABLE
653  * If *pCoherent is FALSE:
654  * - create CPU mappings with COHERENCY_UNCACHED
655  * - create GPU mappings with CACHE_SNOOP_DISABLE
656  *
657  * (CPU write-mostly surfaces such as the pushbuffer always use WRITE_COMBINED
658  * memory.)
659  *
660  * Note we only query on the first subdevice and assume the other subdevices
661  * are the same.
662  */
663 static NvBool GetCoherenceFlags(
664     NvPushChannelPtr pChannel,
665     NvBool *pCoherent)
666 {
667     NvPushDevicePtr pDevice = pChannel->pDevice;
668     NV2080_CTRL_BUS_GET_INFO_PARAMS busInfo = { 0 };
669     struct {
670         NV2080_CTRL_BUS_INFO coherentFlags;
671     } busInfoList;
672 
673     NvU32 ret;
674 
675     NVMISC_MEMSET(&busInfoList, 0, sizeof(busInfoList));
676     busInfoList.coherentFlags.index =
677         NV2080_CTRL_BUS_INFO_INDEX_COHERENT_DMA_FLAGS;
678 
679     busInfo.busInfoListSize = sizeof(busInfoList) /
680                               sizeof(NV2080_CTRL_BUS_INFO);
681     busInfo.busInfoList = NV_PTR_TO_NvP64(&busInfoList);
682 
683     ret = nvPushImportRmApiControl(pDevice,
684                                    pDevice->subDevice[0].handle,
685                                    NV2080_CTRL_CMD_BUS_GET_INFO,
686                                    &busInfo, sizeof(busInfo));
687 
688     if (ret != NVOS_STATUS_SUCCESS) {
689         return FALSE;
690     }
691 
692     *pCoherent =
693         FLD_TEST_DRF(2080_CTRL_BUS_INFO, _COHERENT_DMA_FLAGS, _GPUGART, _TRUE,
694                      busInfoList.coherentFlags.data);
695     return TRUE;
696 }
697 
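/*
 * Reserve GPU virtual address space for the pushbuffer on every device
 * (forcing the same VA on each), back it with a single system memory
 * surface, and map that surface into each device's VA space.  On failure,
 * unwind whatever was allocated and return FALSE.
 */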
698 static NvBool TryAllocAndMapPushbuffer(
699     NvPushChannelPtr pChannel,
700     const NvU32 allocFlags,
701     const NvU32 mapFlags,
702     const NvU32 limit,
703     void **pCpuAddress,
704     NvU64 *pGpuAddress)
705 {
706     NvU32 ret;
707     NvU64 localLimit;
708     NvU64 size = limit + 1;
709     void  *cpuAddress = NULL;
710     NvU64 gpuAddress = 0;
711     NvPushDevicePtr pDevice = pChannel->pDevice;
712     int deviceIndex;
713     NvBool vaAlloc[NV_MAX_SUBDEVICES] = { 0 };
714     NvBool vaMap[NV_MAX_SUBDEVICES] = { 0 };
715     NvBool surfaceAlloc = FALSE;
716 
717     for (deviceIndex = 0;
718          deviceIndex < __nvPushGetNumDevices(pDevice);
719          deviceIndex++) {
720         NV_MEMORY_ALLOCATION_PARAMS vaParams = { 0 };
721 
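        /* 0x70757368 is "push" in ASCII, used as an arbitrary owner tag. */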
722         vaParams.owner = 0x70757368;
723         vaParams.type = NVOS32_TYPE_DMA;
724         vaParams.flags =
725             NVOS32_ALLOC_FLAGS_MEMORY_HANDLE_PROVIDED |
726             NVOS32_ALLOC_FLAGS_VIRTUAL;
727         vaParams.size = size;
728         vaParams.hVASpace = pDevice->subDevice[deviceIndex].gpuVASpaceObject;
729 
730         if (deviceIndex == 0) {
731             /* For the first device, RM assigns a virtual address. */
732             if (pChannel->pDevice->hal.caps.extendedBase) {
733                 /*
734                  * Force the virtual mapping to be naturally aligned.
735                  * This ensures that the allocation cannot cross a 40-bit
736                  * boundary, so we can initialize the higher bits of the VA
737                  * with the PB_EXTENDED_BASE_OPERAND GPFIFO command once at
738                  * init time and not worry about it being able to change
739                  * between any two GPFIFO entries.
740                  */
741                 vaParams.flags |= NVOS32_ALLOC_FLAGS_ALIGNMENT_FORCE;
742                 vaParams.alignment = size;
743                 ROUNDUP_POW2_U64(vaParams.alignment);
744             }
745         } else {
746             /* For subsequent devices, use the same virtual address. */
747             vaParams.flags |= NVOS32_ALLOC_FLAGS_FIXED_ADDRESS_ALLOCATE;
748             nvAssert(gpuAddress != 0);
749             vaParams.offset = gpuAddress;
750         }
751 
752         ret = nvPushImportRmApiAlloc(
753                   pDevice,
754                   pDevice->subDevice[deviceIndex].deviceHandle,
755                   pChannel->pushbufferVAHandle[deviceIndex],
756                   NV50_MEMORY_VIRTUAL,
757                   &vaParams);
758 
759         if (ret != NVOS_STATUS_SUCCESS) {
760             goto fail;
761         }
762         vaAlloc[deviceIndex] = TRUE;
763 
764         if (deviceIndex == 0) {
765             gpuAddress = vaParams.offset;
766             nvAssert(vaParams.size >= size);
767             /* The VA allocation may have been bloated to a larger size, to
768              * align with the page size.  Adjust to ensure that we allocate a
769              * surface of at least that size, or else attempts to map it will
770              * fail. */
771             size = vaParams.size;
772         } else {
773             nvAssert(gpuAddress == vaParams.offset);
774             nvAssert(vaParams.size == size);
775         }
776     }
777 
778     /* Allocate a single surface in system memory for the pushbuffer. */
779     localLimit = size - 1;
780     ret = nvPushImportRmApiAllocMemory64(
781               pDevice,
782               pDevice->subDevice[0].deviceHandle,
783               pChannel->pushbufferHandle,
784               NV01_MEMORY_SYSTEM,
785               allocFlags,
786               &cpuAddress,
787               &localLimit);
788 
789     if (ret != NVOS_STATUS_SUCCESS) {
790         goto fail;
791     }
792     nvAssert(localLimit + 1 >= size);
793     surfaceAlloc = TRUE;
794 
795     for (deviceIndex = 0;
796          deviceIndex < __nvPushGetNumDevices(pDevice);
797          deviceIndex++) {
798         NvU64 mapOffset = 0;
799 
800         ret = nvPushImportRmApiMapMemoryDma(
801                   pDevice,
802                   pDevice->subDevice[deviceIndex].deviceHandle,
803                   pChannel->pushbufferVAHandle[deviceIndex],
804                   pChannel->pushbufferHandle,
805                   0,
806                   size,
807                   mapFlags,
808                   &mapOffset);
809 
810         if (ret != NVOS_STATUS_SUCCESS) {
811             goto fail;
812         }
813         vaMap[deviceIndex] = TRUE;
814         /* mapMemoryDma takes in a relative offset but assigns an absolute VA */
815         nvAssert(mapOffset == gpuAddress);
816     }
817 
818     /* success */
819     *pCpuAddress = cpuAddress;
820     *pGpuAddress = gpuAddress;
821     return TRUE;
822 
823 fail:
824     for (deviceIndex = __nvPushGetNumDevices(pDevice) - 1;
825          deviceIndex >= 0;
826          deviceIndex--) {
827         if (vaMap[deviceIndex]) {
828             ret = nvPushImportRmApiUnmapMemoryDma(pDevice,
829                                 pDevice->subDevice[deviceIndex].deviceHandle,
830                                 pChannel->pushbufferVAHandle[deviceIndex],
831                                 pChannel->pushbufferHandle,
832                                 0,
833                                 gpuAddress);
834             nvAssert(ret == NVOS_STATUS_SUCCESS);
835             vaMap[deviceIndex] = FALSE;
836         }
837         if (vaAlloc[deviceIndex]) {
838             ret = nvPushImportRmApiFree(pDevice,
839                                 pDevice->subDevice[deviceIndex].deviceHandle,
840                                 pChannel->pushbufferVAHandle[deviceIndex]);
841             nvAssert(ret == NVOS_STATUS_SUCCESS);
842             vaAlloc[deviceIndex] = FALSE;
843         }
844     }
845 
846     if (surfaceAlloc) {
847         ret = nvPushImportRmApiFree(pDevice,
848                                     pDevice->subDevice[0].deviceHandle,
849                                     pChannel->pushbufferHandle);
850         nvAssert(ret == NVOS_STATUS_SUCCESS);
851     }
852 
853     return FALSE;
854 }
855 
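/*
 * Allocate the pushbuffer: reserve handles for the memory and the
 * per-device virtual allocations, then allocate write-combined system
 * memory and map it on every device via TryAllocAndMapPushbuffer().
 */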
856 static NvBool AllocPushbuffer(
857     NvPushChannelPtr pChannel,
858     const NvPushAllocChannelParams *pParams,
859     NvU64 *pUsedHandleBitmask,
860     void **pCpuAddress,
861     NvU64 *pGpuAddress)
862 {
863     const NvU32 size = CalculateGPBufferSize(pChannel);
864     NvU32 limit = size - 1;
865     int deviceIndex;
866 
867     pChannel->pushbufferHandle = GetChannelHandle(pParams, pUsedHandleBitmask);
868     for (deviceIndex = 0;
869          deviceIndex < __nvPushGetNumDevices(pChannel->pDevice);
870          deviceIndex++) {
871         pChannel->pushbufferVAHandle[deviceIndex] =
872             GetChannelHandle(pParams, pUsedHandleBitmask);
873     }
874 
875     if (TryAllocAndMapPushbuffer(
876             pChannel,
877             DRF_DEF(OS02, _FLAGS, _PHYSICALITY, _NONCONTIGUOUS) |
878                 DRF_DEF(OS02, _FLAGS, _COHERENCY, _WRITE_COMBINE),
879             DRF_DEF(OS46, _FLAGS, _CACHE_SNOOP, _DISABLE),
880             limit,
881             pCpuAddress,
882             pGpuAddress)) {
883         return TRUE;
884     }
885 
886     pChannel->pushbufferHandle = 0;
887     NVMISC_MEMSET(pChannel->pushbufferVAHandle, 0, sizeof(pChannel->pushbufferVAHandle));
888     return FALSE;
889 }
890 
891 /*!
892  * Free resources allocated in AllocUserMode().
893  */
894 static void FreeUserMode(
895     NvPushDevicePtr pDevice)
896 {
897     NvU32 sd;
898 
899     for (sd = 0; sd < pDevice->numSubDevices; sd++) {
900 
901         if (pDevice->subDevice[sd].pUserMode != NULL) {
902             nvPushImportRmApiUnmapMemory(
903                   pDevice,
904                   pDevice->subDevice[sd].handle,
905                   pDevice->subDevice[sd].hUserMode,
906                   pDevice->subDevice[sd].pUserMode,
907                   0 /* flags */);
908             pDevice->subDevice[sd].pUserMode = NULL;
909         }
910 
911         if (pDevice->subDevice[sd].hUserMode != 0) {
912             nvPushImportRmApiFree(
913                   pDevice,
914                   pDevice->subDevice[sd].handle,
915                   pDevice->subDevice[sd].hUserMode);
916             pDevice->subDevice[sd].hUserMode = 0;
917         }
918     }
919 }
920 
921 /*!
922  * Allocate and map the "usermode" object on each subdevice, supported on GV100
923  * and up.  This mapping exposes registers considered safe for userspace to
924  * access directly.  Most importantly, it contains the "doorbell" register
925  * which we use to notify HOST that we've updated GP_PUT so that it will fetch
926  * work for the channel.
927  */
928 static NvBool AllocUserMode(
929     NvPushDevicePtr pDevice,
930     const NvPushAllocDeviceParams *pParams,
931     NvU64 *pUsedHandleBitmask)
932 {
933     unsigned int sd;
934 
935     static const NvPushSupportedClass userModeClasses[] = {
936         { HOPPER_USERMODE_A,
937           NV_AMODEL_HOPPER },
938         { VOLTA_USERMODE_A,
939           NV_AMODEL_VOLTA },
940     };
941     int i;
942 
943     if (!pDevice->hal.caps.clientAllocatesUserD) {
944         return TRUE;
945     }
946 
947     i = nvPushGetSupportedClassIndex(pDevice, userModeClasses,
948                                      sizeof(userModeClasses[0]),
949                                      ARRAY_LEN(userModeClasses));
950     if (i == -1) {
951         return FALSE;
952     }
953 
954     for (sd = 0; sd < pDevice->numSubDevices; sd++) {
955         NvU32 ret;
956         void *allocParams = NULL;
957 
958         NV_HOPPER_USERMODE_A_PARAMS hopperParams = { 0 };
959         if (userModeClasses[i].classNumber != VOLTA_USERMODE_A) {
960             allocParams = &hopperParams;
961             // The BAR1 mapping is used for (faster and more efficient) writes
962             // to perform work submission, but can't be used for reads.
963             // If we ever want to read from the USERMODE region (e.g., to read
964             // PTIMER) then we need a second mapping.
965             hopperParams.bBar1Mapping = NV_TRUE;
966         }
967 
968         pDevice->subDevice[sd].hUserMode =
969             GetDeviceHandle(pParams, pUsedHandleBitmask);
970 
971         ret = nvPushImportRmApiAlloc(
972                   pDevice,
973                   pDevice->subDevice[sd].handle,
974                   pDevice->subDevice[sd].hUserMode,
975                   userModeClasses[i].classNumber,
976                   allocParams);
977 
978         if (ret != NVOS_STATUS_SUCCESS) {
979             pDevice->subDevice[sd].hUserMode = 0;
980             goto fail;
981         }
982 
983         ret = nvPushImportRmApiMapMemory(
984                   pDevice,
985                   pDevice->subDevice[sd].handle,
986                   pDevice->subDevice[sd].hUserMode,
987                   0, /* offset */
988                   NVC361_NV_USERMODE__SIZE,
989                   &pDevice->subDevice[sd].pUserMode,
990                   0 /* flags */);
991 
992         if (ret != NVOS_STATUS_SUCCESS) {
993             goto fail;
994         }
995     }
996 
997     return TRUE;
998 
999 fail:
1000     FreeUserMode(pDevice);
1001     return FALSE;
1002 }
1003 
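/*
 * Query per-device FIFO capabilities relevant to the pushbuffer code;
 * currently just whether the device has host LB overflow bug 1667921.
 */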
1004 static void CheckCaps(NvPushDevicePtr pDevice)
1005 {
1006     int deviceIndex;
1007 
1008     pDevice->hostLBoverflowBug1667921 = FALSE;
1009 
1010     for (deviceIndex = 0;
1011          deviceIndex < __nvPushGetNumDevices(pDevice);
1012          deviceIndex++) {
1013         NV0080_CTRL_FIFO_GET_CAPS_V2_PARAMS fifoCapsParams = { 0 };
1014         NvU32 ret;
1015 
1016         ret = nvPushImportRmApiControl(pDevice,
1017                                    pDevice->subDevice[deviceIndex].deviceHandle,
1018                                    NV0080_CTRL_CMD_FIFO_GET_CAPS_V2,
1019                                    &fifoCapsParams,
1020                                    sizeof(fifoCapsParams));
1021         if (ret != NVOS_STATUS_SUCCESS) {
1022             nvAssert(!"Failed to determine chip fifo capabilities");
1023             return;
1024         }
1025 
1026         pDevice->hostLBoverflowBug1667921 |=
1027             !!NV0080_CTRL_FIFO_GET_CAP(fifoCapsParams.capsTbl,
1028               NV0080_CTRL_FIFO_CAPS_HAS_HOST_LB_OVERFLOW_BUG_1667921);
1029     }
1030 }
1031 
1032 
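/*
 * Free the notifier memory, its GPU mappings, and the error notifier
 * context DMA allocated by AllocNotifiers().
 */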
1033 static void FreeNotifiers(
1034     NvPushChannelPtr pChannel)
1035 {
1036     NvPushDevicePtr pDevice = pChannel->pDevice;
1037 
1038     if (pChannel->notifiers.errorCtxDma != 0) {
1039         nvPushImportRmApiFree(pDevice,
1040                               pDevice->clientHandle,
1041                               pChannel->notifiers.errorCtxDma);
1042         pChannel->notifiers.errorCtxDma = 0;
1043 
1044     }
1045 
1046     if (pChannel->notifiers.gpuAddress != 0) {
1047         int deviceIndex;
1048         for (deviceIndex = __nvPushGetNumDevices(pDevice) - 1;
1049              deviceIndex >= 0;
1050              deviceIndex--) {
1051             nvPushImportRmApiUnmapMemoryDma(pDevice,
1052                                             pDevice->subDevice[deviceIndex].deviceHandle,
1053                                             pDevice->subDevice[deviceIndex].gpuVASpaceCtxDma,
1054                                             pChannel->notifiers.memoryHandle,
1055                                             0,
1056                                             pChannel->notifiers.gpuAddress);
1057         }
1058         pChannel->notifiers.gpuAddress = 0;
1059     }
1060 
1061     if (pChannel->notifiers.memoryHandle != 0) {
1062         nvPushImportRmApiFree(pDevice,
1063                               pDevice->subDevice[0].deviceHandle,
1064                               pChannel->notifiers.memoryHandle);
1065         pChannel->notifiers.memoryHandle = 0;
1066     }
1067 }
1068 
1069 /*
1070  * Allocate enough notifier memory to store:
1071  * - numNotifiers host driver requested NvNotifications, per subDevice
1072  * - NV_PUSH_NUM_INTERNAL_NOTIFIERS NvNotifications, per channel
1073  */
1074 static NvBool AllocNotifiers(
1075     NvPushChannelPtr pChannel,
1076     const NvPushAllocChannelParams *pParams,
1077     NvBool coherent,
1078     NvU64 *pUsedHandleBitmask)
1079 {
1080     NvPushDevicePtr pDevice = pChannel->pDevice;
1081     const NvU32 size =
1082         (((pParams->numNotifiers * pDevice->numSubDevices) +
1083           NV_PUSH_NUM_INTERNAL_NOTIFIERS) *
1084          sizeof(NvNotification));
1085     NV_CONTEXT_DMA_ALLOCATION_PARAMS ctxdmaParams = { 0 };
1086 
1087     NvU64 limit = size - 1;
1088     int deviceIndex;
1089     NvU32 ret;
1090     NvU32 allocFlags, gpuMapFlags;
1091 
1092     /*
1093      * The host-driver specified number of notifiers must not collide
1094      * with the reserved bit we use to indicate internal notifiers.
1095      */
1096     if (pParams->numNotifiers & NV_PUSH_NOTIFIER_INTERNAL_BIT) {
1097         return FALSE;
1098     }
1099 
1100     pChannel->notifiers.num = pParams->numNotifiers;
1101     pChannel->notifiers.memoryHandle =
1102         GetChannelHandle(pParams, pUsedHandleBitmask);
1103 
1104     allocFlags = DRF_DEF(OS02, _FLAGS, _PHYSICALITY, _NONCONTIGUOUS);
1105     gpuMapFlags = 0;
1106     if (coherent) {
1107         allocFlags = FLD_SET_DRF(OS02, _FLAGS, _COHERENCY, _WRITE_BACK, allocFlags);
1108         gpuMapFlags = FLD_SET_DRF(OS46, _FLAGS, _CACHE_SNOOP, _ENABLE, gpuMapFlags);
1109     } else {
1110         allocFlags = FLD_SET_DRF(OS02, _FLAGS, _COHERENCY, _UNCACHED, allocFlags);
1111         gpuMapFlags = FLD_SET_DRF(OS46, _FLAGS, _CACHE_SNOOP, _DISABLE, gpuMapFlags);
1112     }
1113 
1114     ret = nvPushImportRmApiAllocMemory64(
1115               pDevice,
1116               pDevice->subDevice[0].deviceHandle,
1117               pChannel->notifiers.memoryHandle,
1118               NV01_MEMORY_SYSTEM,
1119               allocFlags,
1120               (void **)&pChannel->notifiers.cpuAddress,
1121               &limit);
1122 
1123     if (ret != NVOS_STATUS_SUCCESS) {
1124         pChannel->notifiers.memoryHandle = 0;
1125         goto fail;
1126     }
1127 
1128     /* Map the memory into the GPU's VA space. */
1129 
1130     for (deviceIndex = 0;
1131          deviceIndex < __nvPushGetNumDevices(pDevice);
1132          deviceIndex++) {
1133         NvU32 mapFlags = gpuMapFlags;
1134         NvU64 gpuAddress;
1135         if (deviceIndex == 0) {
1136             /* For the first device, RM assigns a virtual address. */
1137             gpuAddress = 0;
1138         } else {
1139             /* For subsequent devices, use the same virtual address. */
1140             mapFlags = FLD_SET_DRF(OS46, _FLAGS, _DMA_OFFSET_FIXED, _TRUE,
1141                                    mapFlags);
1142             gpuAddress = pChannel->notifiers.gpuAddress;
1143             nvAssert(gpuAddress != 0);
1144         }
1145         ret = nvPushImportRmApiMapMemoryDma(
1146                   pDevice,
1147                   pDevice->subDevice[deviceIndex].deviceHandle,
1148                   pDevice->subDevice[deviceIndex].gpuVASpaceCtxDma,
1149                   pChannel->notifiers.memoryHandle,
1150                   0, /* offset */
1151                   size,
1152                   mapFlags,
1153                   &gpuAddress);
1154 
1155         if (ret != NVOS_STATUS_SUCCESS) {
1156             goto fail;
1157         }
1158 
1159         if (deviceIndex == 0) {
1160             pChannel->notifiers.gpuAddress = gpuAddress;
1161         } else {
1162             nvAssert(pChannel->notifiers.gpuAddress == gpuAddress);
1163         }
1164     }
1165 
1166     /* Create the internal notifier ctxDma. */
1167 
1168     pChannel->notifiers.errorCtxDma =
1169         GetChannelHandle(pParams, pUsedHandleBitmask);
1170 
1171     ctxdmaParams.hMemory = pChannel->notifiers.memoryHandle;
1172     ctxdmaParams.flags = DRF_DEF(OS03, _FLAGS, _MAPPING, _KERNEL) |
1173                          DRF_DEF(OS03, _FLAGS, _HASH_TABLE, _DISABLE);
1174     /* the internal notifiers are at the start of the memory */
1175     ctxdmaParams.offset = 0;
1176     ctxdmaParams.limit = (NV_PUSH_NUM_INTERNAL_NOTIFIERS *
1177                           sizeof(NvNotification)) - 1;
1178 
1179     ret = nvPushImportRmApiAlloc(pDevice,
1180                                  pDevice->subDevice[0].deviceHandle,
1181                                  pChannel->notifiers.errorCtxDma,
1182                                  NV01_CONTEXT_DMA,
1183                                  &ctxdmaParams);
1184 
1185     if (ret != NVOS_STATUS_SUCCESS) {
1186         pChannel->notifiers.errorCtxDma = 0;
1187         goto fail;
1188     }
1189 
1190     /*
1191      * Initialize the error notifier; note that there is only one
1192      * error notifier shared by all subdevices, so we specify the primary
1193      * subdevice as the subDeviceMask.
1194      */
1195     nvPushInitWaitForNotifier(pChannel,
1196                               NV_PUSH_ERROR_NOTIFIER_INDEX,
1197                               NV_PUSH_SUBDEVICE_MASK_PRIMARY);
1198 
1199     return TRUE;
1200 
1201 fail:
1202     FreeNotifiers(pChannel);
1203     return FALSE;
1204 }
1205 
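/*
 * The "extended base" of a GPU virtual address is the part above the 40
 * bits addressable by a normal GPFIFO pushbuffer base, i.e. bits 63:40.
 */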
1206 static NvU32 GetExtendedBase(NvU64 offset)
1207 {
1208     return NvU64_HI32(offset) >> 8;
1209 }
1210 
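/*
 * Emit an initial SET_PB_SEGMENT_EXTENDED_BASE GPFIFO entry (padded to a
 * full kickoff with NOPs) so that bits 63:40 of all subsequent pushbuffer
 * segment addresses are established once at channel initialization.
 */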
1211 static void InitGpFifoExtendedBase(
1212     NvPushChannelPtr pChannel)
1213 {
1214     const NvU64 pbBase = pChannel->main.gpuMapOffset;
1215     const NvU32 extendedBase = GetExtendedBase(pbBase);
1216     NvU32 *gpPointer = &(pChannel->gpfifo[pChannel->gpPutOffset*2]);
1217     NvU32 i;
1218 
1219     if (!pChannel->pDevice->hal.caps.extendedBase) {
1220         nvAssert(extendedBase == 0);
1221         return;
1222     }
1223 
1224     /*
1225      * Because of the natural VA alignment specified when allocating the
1226      * pushbuffer, all parts of the pushbuffer surface should be in the same
1227      * 40-bit region.
1228      */
1229     nvAssert(GetExtendedBase(pChannel->main.gpuMapOffset) ==
1230              GetExtendedBase(pChannel->progressTracker.gpuMapOffset));
1231     nvAssert(GetExtendedBase(pChannel->main.gpuMapOffset +
1232                              pChannel->main.sizeInBytes - 1) ==
1233              GetExtendedBase(pChannel->main.gpuMapOffset));
1234     nvAssert(GetExtendedBase(pChannel->progressTracker.gpuMapOffset +
1235                              pChannel->progressTracker.sizeInBytes - 1) ==
1236              GetExtendedBase(pChannel->progressTracker.gpuMapOffset));
1237 
1238     /* Set the "extended base" for all subsequent methods */
1239     gpPointer[0] = DRF_NUM(C86F, _GP_ENTRY0, _PB_EXTENDED_BASE_OPERAND, extendedBase);
1240     gpPointer[1] = DRF_DEF(C86F, _GP_ENTRY1, _OPCODE, _SET_PB_SEGMENT_EXTENDED_BASE);
1241     gpPointer += 2;
1242 
1243     /* Pad out with NOP GPFIFO methods so everything remains aligned. */
1244     for (i = 1; i < NV_PUSH_NUM_GPFIFO_ENTRIES_PER_KICKOFF; i++) {
1245         gpPointer[0] = 0;
1246         gpPointer[1] = DRF_DEF(C86F, _GP_ENTRY1, _OPCODE, _NOP);
1247         gpPointer += 2;
1248     }
1249 
1250     pChannel->gpPutOffset += NV_PUSH_NUM_GPFIFO_ENTRIES_PER_KICKOFF;
1251 
1252 }
1253 
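/*
 * Allocate and initialize a pushbuffer channel: notifier memory, the
 * pushbuffer surface (main segment, GPFIFO entries, and progress tracker),
 * USERD where required, the CHANNEL_GPFIFO object itself, and the progress
 * semaphore.  On failure, everything allocated so far is released via
 * nvPushFreeChannel().
 */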
1254 NvBool nvPushAllocChannel(
1255     const NvPushAllocChannelParams *pParams,
1256     NvPushChannelPtr buffer)
1257 {
1258     NvPushDevicePtr pDevice;
1259     void  *cpuAddress = NULL;
1260     NvU64  gpuAddress = 0;
1261     NvU64  usedHandleBitmask = 0;
1262     NvBool coherent = FALSE;
1263 
1264     NVMISC_MEMSET(buffer, 0, sizeof(*buffer));
1265 
1266     pDevice = pParams->pDevice;
1267 
1268     buffer->pDevice = pDevice;
1269     buffer->logNvDiss = pParams->logNvDiss;
1270     buffer->noTimeout = pParams->noTimeout;
1271     buffer->ignoreChannelErrors = pParams->ignoreChannelErrors;
1272 
1273     buffer->currentSubDevMask = NV_PUSH_SUBDEVICE_MASK_ALL;
1274 
1275     /*
1276      * Assign main.sizeInBytes early, because the rest of
1277      * initialization relies on knowing the main pushbuffer size.
1278      * Note this must fit in NV_PUSH_PROGRESS_TRACKER_SEMAPHORE_GET,
1279      * which stores dwords.
1280      */
1281     nvAssert((DRF_MASK(NV_PUSH_PROGRESS_TRACKER_SEMAPHORE_GET) * 4) >
1282              pParams->pushBufferSizeInBytes);
1283     buffer->main.sizeInBytes = pParams->pushBufferSizeInBytes;
1284 
1285     /*
1286      * Compute numGpFifoEntries.  There are several constraints:
1287      *
1288      * - We make numGpFifoEntries 1/64th the size of the main
1289      *   pushbuffer.  The maximum pushbuffer size is 1048572, and we
1290      *   consume 2 gpFifo entries per kickoff.  This works out to be
1291      *   128 bytes of pushbuffer (32 dwords) per kickoff, before we
1292      *   are gpFifo-limited.
1293      *
1294      * - Per dev_pbdma.ref, "The number of GP entries in the circular
1295      *   buffer is always a power of 2."  So, round up to the next
1296      *   power of two.
1297      *
1298      * - Because we consume 2 gpFifo entries per kickoff
1299      *   (NV_PUSH_NUM_GPFIFO_ENTRIES_PER_KICKOFF), we also align to a
1300      *   multiple of 2.  This should be guaranteed by the power of 2
1301      *   check.
1302      *
1303      * - numGpFifoEntries must fit in
1304      *   NV_PUSH_PROGRESS_TRACKER_SEMAPHORE_GP_GET so that the
1305      *   progress tracker semaphore releases can report the consumed
1306      *   gpFifo entry.  The distribution of bits in
1307      *   NV_PUSH_PROGRESS_TRACKER_SEMAPHORE should ensure this is
1308      *   satisfied.
1309      */
1310 
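    /*
     * For example: a 64 KiB main pushbuffer yields 65536 / 64 = 1024 gpFifo
     * entries (already a power of two), i.e. 512 kickoffs of two entries
     * each, which matches the 128 bytes of pushbuffer per kickoff noted
     * above.
     */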
1311     buffer->numGpFifoEntries = pParams->pushBufferSizeInBytes / 64;
1312 
1313     buffer->numGpFifoEntries = nvNextPow2_U32(buffer->numGpFifoEntries);
1314 
1315     nvAssert((buffer->numGpFifoEntries %
1316               NV_PUSH_NUM_GPFIFO_ENTRIES_PER_KICKOFF) == 0);
1317 
1318     nvAssert((DRF_MASK(NV_PUSH_PROGRESS_TRACKER_SEMAPHORE_GP_GET) * 2) >
1319              buffer->numGpFifoEntries);
1320 
1321     if (!GetCoherenceFlags(buffer, &coherent)) {
1322         goto failed;
1323     }
1324 
1325     if (!AllocNotifiers(buffer, pParams, coherent, &usedHandleBitmask)) {
1326         nvPushImportLogError(pDevice,
1327                              "Failed to allocate notification memory.");
1328         goto failed;
1329     }
1330 
1331     /* Only allocate memory for one pushbuffer.  All subdevices will share */
1332     if (!AllocPushbuffer(buffer,
1333                          pParams,
1334                          &usedHandleBitmask,
1335                          &cpuAddress,
1336                          &gpuAddress)) {
1337         nvPushImportLogError(pDevice,
1338                              "Push buffer DMA allocation failed");
1339         goto failed;
1340     }
1341 
1342     /* First the "main" pushbuffer */
1343     InitDmaSegment(&buffer->main,
1344                    cpuAddress,
1345                    gpuAddress,
1346                    pParams->pushBufferSizeInBytes);
1347     /* Next the GPFIFO */
1348     buffer->gpfifo =
1349         (NvU32 *)((char *)cpuAddress +  __nvPushGpFifoOffset(buffer));
1350     buffer->gpPutOffset = 0;
1351     /* Next the "progressTracker" */
1352     InitDmaSegment(&buffer->progressTracker,
1353                    (char *)cpuAddress + __nvPushProgressTrackerOffset(buffer),
1354                    gpuAddress + __nvPushProgressTrackerOffset(buffer),
1355                    ProgressTrackerBufferSize(buffer));
1356 
1357     if (!nvDmaAllocUserD(buffer, pParams, &usedHandleBitmask)) {
1358         goto failed;
1359     }
1360 
1361     if (!AllocChannelObject(buffer, pParams,
1362                             &usedHandleBitmask, gpuAddress)) {
1363         goto failed;
1364     }
1365 
1366     if (pDevice->hal.caps.clientAllocatesUserD &&
1367         !RequestChidToken(buffer)) {
1368         goto failed;
1369     }
1370 
1371     if (!AllocSemaSurface(buffer, pParams, coherent, &usedHandleBitmask)) {
1372         goto failed;
1373     }
1374 
1375 #if defined(DEBUG)
1376     if (buffer->logNvDiss) {
1377         nvPushImportLogNvDiss(buffer, "nvdiss:  encoding 2\n");
1378     }
1379 #endif /* DEBUG */
1380 
1381     InitGpFifoExtendedBase(buffer);
1382 
1383     if (!__nvPushTestPushBuffer(buffer)) {
1384         goto failed;
1385     }
1386 
1387     buffer->initialized = TRUE;
1388 
1389     return TRUE;
1390 
1391 failed:
1392     nvPushFreeChannel(buffer);
1393     return FALSE;
1394 }
1395 
1396 /*!
1397  * Free resources allocated by nvPushAllocChannel().
1398  */
1399 void nvPushFreeChannel(NvPushChannelPtr buffer)
1400 {
1401     NvPushDevicePtr pDevice = buffer->pDevice;
1402     unsigned int sd;
1403     int deviceIndex;
1404 
1405     if (pDevice == NULL) {
1406         goto done;
1407     }
1408 
1409     /* Unmap pushbuffer DMA controls */
1410     for (sd = 0; sd < pDevice->numSubDevices; sd++) {
1411         NvU32 userdMapHandle;
1412 
1413         deviceIndex = __nvPushGetDeviceIndex(pDevice, sd);
1414         if (pDevice->hal.caps.clientAllocatesUserD) {
1415             userdMapHandle = buffer->userD[deviceIndex].hMemory;
1416         } else {
1417             userdMapHandle = buffer->channelHandle[deviceIndex];
1418         }
1419 
1420         if (buffer->control[sd]) {
1421             nvPushImportRmApiUnmapMemory(pDevice,
1422                                          pDevice->subDevice[sd].handle,
1423                                          userdMapHandle,
1424                                          buffer->control[sd],
1425                                          0);
1426             buffer->control[sd] = NULL;
1427         }
1428     }
1429 
1430     for (deviceIndex = __nvPushGetNumDevices(pDevice) - 1;
1431          deviceIndex >= 0;
1432          deviceIndex--) {
1433         if (buffer->channelHandle[deviceIndex] != 0) {
1434             nvPushImportRmApiFree(pDevice,
1435                                   pDevice->subDevice[deviceIndex].deviceHandle,
1436                                   buffer->channelHandle[deviceIndex]);
1437             buffer->channelHandle[deviceIndex] = 0;
1438         }
1439 
1440         if (buffer->userD[deviceIndex].hMemory != 0) {
1441             nvPushImportRmApiFree(pDevice,
1442                                   pDevice->subDevice[deviceIndex].deviceHandle,
1443                                   buffer->userD[deviceIndex].hMemory);
1444             buffer->userD[deviceIndex].hMemory = 0;
1445         }
1446 
1447         if (buffer->pushbufferVAHandle[deviceIndex] != 0) {
1448             nvPushImportRmApiFree(pDevice,
1449                                   pDevice->subDevice[deviceIndex].deviceHandle,
1450                                   buffer->pushbufferVAHandle[deviceIndex]);
1451             buffer->pushbufferVAHandle[deviceIndex] = 0;
1452         }
1453     }
1454 
1455     if (buffer->pushbufferHandle != 0) {
1456         nvPushImportRmApiFree(pDevice,
1457                               pDevice->subDevice[0].deviceHandle,
1458                               buffer->pushbufferHandle);
1459         buffer->pushbufferHandle = 0;
1460     }
1461 
1462     FreeNotifiers(buffer);
1463 
1464     FreeSemaSurface(buffer);
1465 
1466 done:
1467     NVMISC_MEMSET(buffer, 0, sizeof(*buffer));
1468 }
1469 
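/*
 * Initialize an NvPushDeviceRec from the caller's parameters: copy the
 * per-subdevice handles, query FIFO caps, choose the GPFIFO class and HAL,
 * and allocate the usermode doorbell mapping where supported.
 */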
1470 NvBool nvPushAllocDevice(
1471     const NvPushAllocDeviceParams *pParams,
1472     NvPushDevicePtr pDevice)
1473 {
1474     unsigned int sd;
1475     NvU64 usedHandleBitmask = 0;
1476 
1477     NVMISC_MEMSET(pDevice, 0, sizeof(*pDevice));
1478 
1479     pDevice->hostDevice       = pParams->hostDevice;
1480     pDevice->pImports         = pParams->pImports;
1481     pDevice->numSubDevices    = pParams->numSubDevices;
1482     pDevice->clientSli        = pParams->clientSli;
1483     pDevice->clientHandle     = pParams->clientHandle;
1484 
1485     pDevice->numClasses       = pParams->numClasses;
1486     pDevice->supportedClasses = pParams->supportedClasses;
1487 
1488     pDevice->confidentialComputeMode  = pParams->confidentialComputeMode;
1489 
1490     for (sd = 0; sd < pParams->numSubDevices; sd++) {
1491         pDevice->subDevice[sd].handle = pParams->subDevice[sd].handle;
1492         pDevice->subDevice[sd].deviceHandle = pParams->subDevice[sd].deviceHandle;
1493         pDevice->subDevice[sd].gpuVASpaceObject = pParams->subDevice[sd].gpuVASpaceObject;
1494         pDevice->subDevice[sd].gpuVASpaceCtxDma = pParams->subDevice[sd].gpuVASpace;
1495     }
1496 
1497     if (pParams->amodel.config != NV_AMODEL_NONE) {
1498         nvAssert(!"Ignoring AModel configuration on non-XAMODEL build");
1499     }
1500     pDevice->amodelConfig = pParams->amodel.config;
1501 
1502     CheckCaps(pDevice);
1503 
1504     if (!GetChannelClassAndUserDSize(pDevice, pParams)) {
1505         nvPushImportLogError(pDevice,
1506                              "No supported command buffer format found");
1507         goto fail;
1508     }
1509 
1510     if (!__nvPushGetHal(pParams, pDevice->gpfifoClass, &pDevice->hal)) {
1511         nvPushImportLogError(pDevice, "No push buffer implementation found.");
1512         goto fail;
1513     }
1514 
1515     if (!AllocUserMode(pDevice, pParams, &usedHandleBitmask)) {
1516         nvPushImportLogError(pDevice,
1517                              "Unable to allocate push buffer controls.");
1518         goto fail;
1519     }
1520 
1521 
1522     return TRUE;
1523 
1524 fail:
1525     nvPushFreeDevice(pDevice);
1526 
1527     return FALSE;
1528 }
1529 
1530 void nvPushFreeDevice(
1531     NvPushDevicePtr pDevice)
1532 {
1533     FreeUserMode(pDevice);
1534 
1535     NVMISC_MEMSET(pDevice, 0, sizeof(*pDevice));
1536 }
1537