/*
 * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/*
 * DIFR stands for Display Idle Frame Refresh which is a low-power feature
 * for display that allows scanning out frames from L2 cache. The actual GPU
 * memory can be gated off while the display outputs are served off the
 * cache.
 *
 * DIFR is defined in terms of three operational layers, 1, 2, and 3, which
 * are entered and exited in order.
 *
 * Layer 1 must first deem it possible to enter DIFR before layers 2 and 3
 * start considering it. Any layer that sees conditions preventing DIFR mode
 * can abort the attempt to enter. But, finally, if all layers agree, the
 * hardware will switch to low-power mode, turn off GPU memory, and start
 * serving pixels off the cache.
 *
 * NVKMS and RM manage some high-level state to help the hardware transition
 * from one layer to another. Simplified, NVKMS provides assistance for
 * layer 1 and RM for layer 2.
 *
 * Much of the layer 1 (NVKMS) DIFR-specific code is collected into this
 * file, centered around an object called NVDIFRStateEvo.
 *
 * The role of NVKMS is to listen for DIFR prefetch events (which originate
 * from h/w and get dispatched by RM), prefetch framebuffer pixels into L2
 * cache, and report back to h/w (via RM). NVKMS will also disable DIFR each
 * time there's an explicitly known display update (such as a flip) and
 * re-enable it once enough idle time has passed.
 *
 * The rest of NVKMS will call entrypoints in this file to inform the DIFR
 * implementation here about changes in relevant state.
 *
 * For each DevEvo object nvkms-evo.c will call
 * nvDIFRAllocate()/nvDIFRFree() here to also create a corresponding DIFR
 * state object. The DIFR state will contain everything needed to implement
 * prefetching such as channel and copy engine allocation.
 *
 * If DIFR state was successfully allocated, nvkms-rm.c will create an event
 * listener for DIFR prefetch events which will call back to
 * nvDIFRPrefetchSurfaces() here in order to do prefetching. This means
 * going through each active head and issuing a special CE copy, for all
 * layers of the surface, to populate the L2 cache with framebuffer pixel
 * data.
 *
 * After all prefetches are complete, RM needs to know about the completion
 * status. This is implemented in nvDIFRSendPrefetchResponse(), again called
 * by nvkms-rm.c.
 *
 * NVKMS must also temporarily disable DIFR in hardware if it knows about
 * upcoming updates to the framebuffer and then re-enable DIFR when the
 * screen becomes idle again. For this, nvFlipEvoOneHead() will call us back
 * via nvDIFRNotifyFlip() when a new flip is happening. We will call RM to
 * disable DIFR, then set up a timer into the future and when it triggers we
 * will re-enable DIFR again. But if nvFlipEvoOneHead() notifies us about
 * another upcoming frame, we'll just replace the old timer with a new one.
 * Thus, the timer will eventually wake us after notifications of new frames
 * cease to come in.
 *
 * The DIFR hardware will automatically detect activity in graphics/copy
 * engines and will not try to enter the low-power mode if there is any. So
 * this is something NVKMS doesn't have to worry about.
 *
 * Userspace can also flag surfaces as non-cacheable which makes us abort
 * any prefetches if those surfaces are currently displayed on any active
 * heads. For now, CPU mapped surfaces are flagged as such because neither
 * NVKMS nor the hardware can observe CPU writes into a surface.
 */
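
/*
 * As a rough illustration of the flow described above (an informal sketch
 * only; the exact call sites live in nvkms-evo.c, nvkms-rm.c and the flip
 * path, and "pDifrState" below simply stands for wherever the caller keeps
 * the pointer returned by nvDIFRAllocate()):
 *
 *     // Device initialization:
 *     pDifrState = nvDIFRAllocate(pDevEvo);          // NULL if unsupported
 *
 *     // RM event listener, on each DIFR prefetch event:
 *     status = nvDIFRPrefetchSurfaces(pDifrState, l2CacheSize);
 *     nvDIFRSendPrefetchResponse(pDifrState, status);
 *
 *     // Flip path, on every known display update:
 *     nvDIFRNotifyFlip(pDifrState);                  // disables DIFR; the
 *                                                    // idle timer re-enables it
 *
 *     // Device teardown:
 *     nvDIFRFree(pDifrState);
 */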



#include "nvkms-difr.h"
#include "nvkms-push.h"
#include "nvkms-rm.h"
#include "nvkms-rmapi.h"
#include "nvkms-utils.h"

#include "nvidia-push-init.h"
#include "nvidia-push-methods.h"
#include "nvidia-push-types.h"
#include "nvidia-push-utils.h"

#include <class/cl2080.h>
#include <class/cla06f.h>
#include <class/cla06fsubch.h>
#include <class/cla0b5.h>
#include <class/clb0b5sw.h>
#include <class/clc7b5.h>
#include <ctrl/ctrl2080/ctrl2080ce.h>
#include <ctrl/ctrl2080/ctrl2080lpwr.h>

#define PREFETCH_DONE_VALUE             0x00000fed

/* How long to wait after last flip until re-enabling DIFR. */
#define DIFR_IDLE_WAIT_PERIOD_US        500000

/* How long to wait for prefetch dma completion. */
#define DIFR_PREFETCH_WAIT_PERIOD_US    10000 /* 10ms */

/*
 * DIFR runtime state
 */
typedef struct _NVDIFRStateEvoRec {
    NVDevEvoPtr pDevEvo;
    NvU32 copyEngineType;

    /*
     * This is kept in sync with whether DIFR is explicitly disabled in
     * hardware.
     */
    NvBool hwDisabled;
    NvU64 lastFlipTime;
    nvkms_timer_handle_t *idleTimer;

    /* Pushbuffer for DIFR prefetches. */
    NvPushChannelRec prefetchPushChannel;
    NvU32 pushChannelHandlePool[NV_PUSH_CHANNEL_HANDLE_POOL_NUM];

    /* Copy engine instance for DIFR prefetches. */
    NvU32 prefetchEngine;

    /* For tracking which surfaces have been prefetched already. */
    NvU32 prefetchPass;
} NVDIFRStateEvoRec;

/*
 * Prefetch parameters for DMA copy.
 */
typedef struct {
    NvUPtr surfGpuAddress;
    size_t surfSizeBytes;
    enum NvKmsSurfaceMemoryFormat surfFormat;
    NvU32 surfPitchBytes;
} NVDIFRPrefetchParams;

static NvBool AllocDIFRPushChannel(NVDIFRStateEvoPtr pDifr);
static void FreeDIFRPushChannel(NVDIFRStateEvoPtr pDifr);
static NvBool AllocDIFRCopyEngine(NVDIFRStateEvoPtr pDifr);
static void FreeDIFRCopyEngine(NVDIFRStateEvoPtr pDifr);

static NvU32 PrefetchSingleSurface(NVDIFRStateEvoPtr pDifr,
                                   NVDIFRPrefetchParams *pParams,
                                   size_t *cacheRemaining);
static NvBool PrefetchHelperSurfaceEvo(NVDIFRStateEvoPtr pDifr,
                                       size_t *cacheRemaining,
                                       NVSurfaceEvoPtr pSurfaceEvo,
                                       NvU32 *status);
static NvBool PrefetchHelperLutSurface(NVDIFRStateEvoPtr pDifr,
                                       size_t *cacheRemaining,
                                       NVLutSurfaceEvoPtr pLutSurface,
                                       NvU32 *status);

static NvBool SetDisabledState(NVDIFRStateEvoPtr pDifr,
                               NvBool shouldDisable);
static NvBool IsCECompatibleWithDIFR(NVDevEvoPtr pDevEvo,
                                     NvU32 instance);
static void EnsureIdleTimer(NVDIFRStateEvoPtr pDifr);
static void IdleTimerProc(void *dataPtr, NvU32 dataU32);

/*
 * Public entry points.
 */

NVDIFRStateEvoPtr nvDIFRAllocate(NVDevEvoPtr pDevEvo)
{
    NV2080_CTRL_CMD_LPWR_DIFR_CTRL_PARAMS params = { 0 };
    NVDIFRStateEvoPtr pDifr;
    NvU32 ret;

    /* DIFR not supported/implemented on RM SLI */
    if (pDevEvo->numSubDevices > 1) {
        return NULL;
    }

    params.ctrlParamVal = NV2080_CTRL_LPWR_DIFR_CTRL_SUPPORT_STATUS;
    ret = nvRmApiControl(nvEvoGlobal.clientHandle,
                         pDevEvo->pSubDevices[0]->handle,
                         NV2080_CTRL_CMD_LPWR_DIFR_CTRL,
                         &params,
                         sizeof(params));

    if (ret != NV_OK) {
        nvEvoLogDev(pDevEvo,
                    EVO_LOG_WARN,
                    "unable to query whether display caching is supported");
        return NULL;
    }

    if (params.ctrlParamVal != NV2080_CTRL_LPWR_DIFR_SUPPORTED) {
        return NULL;
    }

    pDifr = nvCalloc(sizeof(*pDifr), 1);
    if (!pDifr) {
        return NULL;
    }

    pDifr->pDevEvo = pDevEvo;

    if (!AllocDIFRPushChannel(pDifr) ||
        !AllocDIFRCopyEngine(pDifr)) {
        nvDIFRFree(pDifr);

        return NULL;
    }

    return pDifr;
}

void nvDIFRFree(NVDIFRStateEvoPtr pDifr)
{
    nvAssert(pDifr);

    /* Cancel pending idle timer. */
    nvkms_free_timer(pDifr->idleTimer);

    /* Leave DIFR enabled (default state). */
    SetDisabledState(pDifr, FALSE);

    /* Free resources. */
    FreeDIFRCopyEngine(pDifr);
    FreeDIFRPushChannel(pDifr);

    nvFree(pDifr);
}

/*
 * Notify of a new or upcoming flip. This will disable DIFR for a brief
 * period in anticipation of further flips.
 */
void nvDIFRNotifyFlip(NVDIFRStateEvoPtr pDifr)
{
    pDifr->lastFlipTime = nvkms_get_usec();

    /* A flip is coming: signal RM to disable DIFR if we haven't already. */
    if (SetDisabledState(pDifr, TRUE)) {
        /* Check back after a while and re-enable if idle again. */
        EnsureIdleTimer(pDifr);
    }
}

NvU32 nvDIFRPrefetchSurfaces(NVDIFRStateEvoPtr pDifr, size_t l2CacheSize)
{
    NVDevEvoPtr pDevEvo = pDifr->pDevEvo;
    NVEvoSubDevPtr pSubDev;
    NVEvoSubDevHeadStatePtr pHeadState;
    size_t cacheRemaining = l2CacheSize;
    NvU32 layer;
    NvU32 head;
    NvU32 apiHead;
    NvU32 eye;
    NvU32 i;
    NvU32 status;

    /*
     * If DIFR is disabled it's because we know we were, or will soon be,
     * flipping. Also bail out if the console is active: the scanout surfaces
     * will then get updated by the OS console driver without any knowledge
     * of NVKMS.
     */
    if (pDifr->hwDisabled || nvEvoIsConsoleActive(pDevEvo)) {
        return NV2080_CTRL_LPWR_DIFR_PREFETCH_FAIL_OS_FLIPS_ENABLED;
    }

    status = NV2080_CTRL_LPWR_DIFR_PREFETCH_SUCCESS;

    pSubDev = &pDevEvo->gpus[0];

    /* Get new prefetch pass counter for this iteration. */
    pDifr->prefetchPass++;

    /*
     * Start by prefetching the cursor surface and image surfaces from
     * present layers.
     */
    for (head = 0; head < pDevEvo->numHeads; head++) {
        pHeadState = &pSubDev->headState[head];

        if (!PrefetchHelperSurfaceEvo(pDifr,
                                      &cacheRemaining,
                                      pHeadState->cursor.pSurfaceEvo,
                                      &status)) {
            goto out;
        }

        for (layer = 0; layer < pDevEvo->head[head].numLayers; layer++) {
            for (eye = 0; eye < NVKMS_MAX_EYES; eye++) {

                if (!PrefetchHelperSurfaceEvo(pDifr,
                                              &cacheRemaining,
                                              pHeadState->layer[layer].pSurfaceEvo[eye],
                                              &status)) {
                    goto out;
                }
            }

            /*
             * Prefetch per-layer LUTs, if any, but skip null LUTs and
             * duplicates already prefetched.
             */
            if (!PrefetchHelperLutSurface(pDifr,
                                          &cacheRemaining,
                                          pHeadState->layer[layer].inputLut.pLutSurfaceEvo,
                                          &status)) {
                goto out;
            }

            if (!PrefetchHelperLutSurface(pDifr,
                                          &cacheRemaining,
                                          pHeadState->layer[layer].tmoLut.pLutSurfaceEvo,
                                          &status)) {
                goto out;
            }
        }
    }

    /*
     * Finally prefetch the known main LUTs.
     */
    if (!PrefetchHelperLutSurface(pDifr,
                                  &cacheRemaining,
                                  pDevEvo->lut.defaultLut,
                                  &status)) {
        goto out;
    }

    for (apiHead = 0; apiHead < pDevEvo->numApiHeads; apiHead++) {
        for (i = 0; i < ARRAY_LEN(pDevEvo->lut.apiHead[apiHead].LUT); i++) {
            if (!PrefetchHelperLutSurface(pDifr,
                                          &cacheRemaining,
                                          pDevEvo->lut.apiHead[apiHead].LUT[i],
                                          &status)) {
                goto out;
            }
        }
    }

out:
    return status;
}

NvBool nvDIFRSendPrefetchResponse(NVDIFRStateEvoPtr pDifr, NvU32 responseStatus)
{
    NVDevEvoPtr pDevEvo = pDifr->pDevEvo;
    NV2080_CTRL_CMD_LPWR_DIFR_PREFETCH_RESPONSE_PARAMS params = { 0 };

    params.responseVal = responseStatus;

    return (nvRmApiControl(nvEvoGlobal.clientHandle,
                           pDevEvo->pSubDevices[0]->handle,
                           NV2080_CTRL_CMD_LPWR_DIFR_PREFETCH_RESPONSE,
                           &params,
                           sizeof(params))
            == NV_OK);
}

/*
 * Local helper functions.
 */
static NvBool AllocDIFRPushChannel(NVDIFRStateEvoPtr pDifr)
{
    NVDevEvoPtr pDevEvo = pDifr->pDevEvo;
    NvPushAllocChannelParams params = { 0 };
    NvU32 i;

    pDifr->copyEngineType = NV2080_ENGINE_TYPE_NULL;

    for (i = 0; i < NV2080_ENGINE_TYPE_COPY_SIZE; i++) {
        if (IsCECompatibleWithDIFR(pDevEvo, i)) {
            pDifr->copyEngineType = NV2080_ENGINE_TYPE_COPY(i);
            break;
        }
    }

    if (pDifr->copyEngineType == NV2080_ENGINE_TYPE_NULL) {
        return FALSE;
    }

    params.engineType = pDifr->copyEngineType;
    params.pDevice = &pDifr->pDevEvo->nvPush.device;
    params.difrPrefetch = TRUE;
    params.logNvDiss = FALSE;
    params.noTimeout = FALSE;
    params.ignoreChannelErrors = FALSE;
    params.numNotifiers = 1;
    params.pushBufferSizeInBytes = 1024;

    ct_assert(sizeof(params.handlePool) == sizeof(pDifr->pushChannelHandlePool));

    for (i = 0; i < ARRAY_LEN(pDifr->pushChannelHandlePool); i++) {
        pDifr->pushChannelHandlePool[i] =
            nvGenerateUnixRmHandle(&pDevEvo->handleAllocator);

        params.handlePool[i] = pDifr->pushChannelHandlePool[i];
    }

    if (!nvPushAllocChannel(&params, &pDifr->prefetchPushChannel)) {
        return FALSE;
    }

    return TRUE;
}

static void FreeDIFRPushChannel(NVDIFRStateEvoPtr pDifr)
{
    NVDevEvoPtr pDevEvo = pDifr->pDevEvo;
    NvU32 i;

    nvPushFreeChannel(&pDifr->prefetchPushChannel);

    for (i = 0; i < ARRAY_LEN(pDifr->pushChannelHandlePool); i++) {
        nvFreeUnixRmHandle(&pDevEvo->handleAllocator,
                           pDifr->pushChannelHandlePool[i]);
        pDifr->pushChannelHandlePool[i] = 0;
    }
}

static NvBool AllocDIFRCopyEngine(NVDIFRStateEvoPtr pDifr)
{
    NVB0B5_ALLOCATION_PARAMETERS allocParams = { 0 };
    NVDevEvoPtr pDevEvo = pDifr->pDevEvo;
    NvU32 ret;

    /*
     * We will only be called if NV2080_CTRL_CMD_LPWR_DIFR_CTRL says DIFR is
     * supported, in which case we assume the chip supports this CE class.
     */
    nvAssert(nvRmEvoClassListCheck(pDevEvo, AMPERE_DMA_COPY_B));

    pDifr->prefetchEngine = nvGenerateUnixRmHandle(&pDevEvo->handleAllocator);
    if (pDifr->prefetchEngine == 0) {
        return NV_FALSE;
    }

    allocParams.version = NVB0B5_ALLOCATION_PARAMETERS_VERSION_1;
    allocParams.engineType = pDifr->copyEngineType;

    ret = nvRmApiAlloc(nvEvoGlobal.clientHandle,
                       pDifr->prefetchPushChannel.channelHandle[0],
                       pDifr->prefetchEngine,
                       AMPERE_DMA_COPY_B,
                       &allocParams);
    if (ret != NVOS_STATUS_SUCCESS) {
        return NV_FALSE;
    }

    return NV_TRUE;
}

static void FreeDIFRCopyEngine(NVDIFRStateEvoPtr pDifr)
{
    if (pDifr->prefetchEngine != 0) {
        nvRmApiFree(nvEvoGlobal.clientHandle,
                    pDifr->pDevEvo->pSubDevices[0]->handle,
                    pDifr->prefetchEngine);
    }

    nvFreeUnixRmHandle(&pDifr->pDevEvo->handleAllocator,
                       pDifr->prefetchEngine);
}

static NvU32 PrefetchSingleSurface(NVDIFRStateEvoPtr pDifr,
                                   NVDIFRPrefetchParams *pParams,
                                   size_t *cacheRemaining)
{
    NvPushChannelPtr p = &pDifr->prefetchPushChannel;
    NvU64 semaphoreGPUAddress = nvPushGetNotifierGpuAddress(p, 0, 0);
    NvGpuSemaphore *semaphore = (NvGpuSemaphore *)
        nvPushGetNotifierCpuAddress(p, 0, 0);
    const NvKmsSurfaceMemoryFormatInfo *finfo =
        nvKmsGetSurfaceMemoryFormatInfo(pParams->surfFormat);
    NvU32 componentSizes;
    NvU32 line_length_in;
    NvU32 line_count;
    NvU64 starttime;
    NvU64 endtime;

    /*
     * Tell SET_REMAP_COMPONENTS the byte-size of a pixel in terms of color
     * component size and count. It doesn't matter which actual combination we
     * choose as long as size*count equals bytesPerPixel. This is because we
     * won't be doing any actual remapping per se: we just effectively tell the
     * prefetch operation to fetch the correct number of bytes for each pixel.
     */
    switch (finfo->rgb.bytesPerPixel) {
#define COMPONENTS(size, num)                                           \
    (DRF_DEF(A0B5, _SET_REMAP_COMPONENTS, _COMPONENT_SIZE, size) |      \
     DRF_DEF(A0B5, _SET_REMAP_COMPONENTS, _NUM_SRC_COMPONENTS, num) |   \
     DRF_DEF(A0B5, _SET_REMAP_COMPONENTS, _NUM_DST_COMPONENTS, num))

    case 1: componentSizes = COMPONENTS(_ONE, _ONE); break;
    case 2: componentSizes = COMPONENTS(_ONE, _TWO); break;
    case 3: componentSizes = COMPONENTS(_ONE, _THREE); break;
    case 4: componentSizes = COMPONENTS(_ONE, _FOUR); break;
    case 6: componentSizes = COMPONENTS(_TWO, _THREE); break;
    case 8: componentSizes = COMPONENTS(_TWO, _FOUR); break;
    case 12: componentSizes = COMPONENTS(_FOUR, _THREE); break;
    case 16: componentSizes = COMPONENTS(_FOUR, _FOUR); break;
    default: componentSizes = 0; break;
#undef COMPONENTS
    }
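
    /*
     * For example (illustrative only, not tied to a specific format): a
     * 4-byte-per-pixel format such as 8-bit RGBA resolves to
     * COMPONENTS(_ONE, _FOUR), i.e. four one-byte components, and a 16-byte
     * format to COMPONENTS(_FOUR, _FOUR); in both cases size * count equals
     * bytesPerPixel, which is all the copy below needs.
     */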

    /*
     * TODO: For now, we don't prefetch multiplane surfaces. In order to do so
     * we'd need to loop over all valid planes of the pSurfaceEvo and issue a
     * prefetch for each plane.
     */
    if (finfo->numPlanes > 1) {
        /*
         * Regardless of its wording, this is the proper failure code to send
         * upstream. This lets the RM disable DIFR until the next modeset.
         */
        return NV2080_CTRL_LPWR_DIFR_PREFETCH_FAIL_INSUFFICIENT_L2_SIZE;
    }

    /*
     * Compute some dimensional values to obtain correct blob size for
     * prefetching. Use the given pitch and calculate the number of lines
     * needed to cover the whole memory region.
     */
    nvAssert(pParams->surfPitchBytes % finfo->rgb.bytesPerPixel == 0);
    line_length_in = pParams->surfPitchBytes / finfo->rgb.bytesPerPixel;

    nvAssert(pParams->surfSizeBytes % pParams->surfPitchBytes == 0);
    line_count = pParams->surfSizeBytes / pParams->surfPitchBytes;
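
    /*
     * As a hypothetical example (numbers for illustration only): a
     * pitch-linear surface with 4 bytes per pixel, a pitch of 7680 bytes and
     * a size of 7680 * 1080 bytes yields line_length_in = 7680 / 4 = 1920
     * pixels and line_count = 1080 lines.
     */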

    /*
     * Greedy strategy: assume all surfaces will fit in the supplied L2 size but
     * the first one that doesn't will cause the prefetch request to fail. If we
     * run out of cache then DIFR will disable itself until the next modeset.
     */
    if (*cacheRemaining < pParams->surfSizeBytes) {
        return NV2080_CTRL_LPWR_DIFR_PREFETCH_FAIL_INSUFFICIENT_L2_SIZE;
    }

    *cacheRemaining -= pParams->surfSizeBytes;

    /*
     * Push buffer DMA copy and semaphore programming.
     */
    nvPushSetObject(p, NVA06F_SUBCHANNEL_COPY_ENGINE, &pDifr->prefetchEngine);
    nvPushMethod(p, NVA06F_SUBCHANNEL_COPY_ENGINE,
                 NVA0B5_SET_REMAP_COMPONENTS, 1);
    nvPushSetMethodData(p,
                        componentSizes |
                        DRF_DEF(A0B5, _SET_REMAP_COMPONENTS, _DST_X, _CONST_A) |
                        DRF_DEF(A0B5, _SET_REMAP_COMPONENTS, _DST_Y, _CONST_A) |
                        DRF_DEF(A0B5, _SET_REMAP_COMPONENTS, _DST_Z, _CONST_A) |
                        DRF_DEF(A0B5, _SET_REMAP_COMPONENTS, _DST_W, _CONST_A));
    nvPushImmedVal(p, NVA06F_SUBCHANNEL_COPY_ENGINE,
                   NVA0B5_SET_REMAP_CONST_A, 0);
    nvPushMethod(p, NVA06F_SUBCHANNEL_COPY_ENGINE, NVA0B5_OFFSET_IN_UPPER, 2);
    nvPushSetMethodDataU64(p, pParams->surfGpuAddress);
    nvPushMethod(p, NVA06F_SUBCHANNEL_COPY_ENGINE, NVA0B5_OFFSET_OUT_UPPER, 2);
    nvPushSetMethodDataU64(p, pParams->surfGpuAddress);

    /*
     * We don't expect phenomenally large pitches but the .mfs for DMA copy
     * defines PitchIn/PitchOut to be of signed 32-bit type for all
     * architectures so assert that the value will be what h/w understands.
     */
    nvAssert(pParams->surfPitchBytes <= NV_S32_MAX);

    nvPushMethod(p, NVA06F_SUBCHANNEL_COPY_ENGINE, NVA0B5_PITCH_IN, 1);
    nvPushSetMethodData(p, pParams->surfPitchBytes);
    nvPushMethod(p, NVA06F_SUBCHANNEL_COPY_ENGINE, NVA0B5_PITCH_OUT, 1);
    nvPushSetMethodData(p, pParams->surfPitchBytes);

    nvPushMethod(p, NVA06F_SUBCHANNEL_COPY_ENGINE, NVA0B5_LINE_LENGTH_IN, 1);
    nvPushSetMethodData(p, line_length_in);
    nvPushMethod(p, NVA06F_SUBCHANNEL_COPY_ENGINE, NVA0B5_LINE_COUNT, 1);
    nvPushSetMethodData(p, line_count);
    nvAssert(pParams->surfPitchBytes * line_count == pParams->surfSizeBytes);

    nvPushMethod(p, NVA06F_SUBCHANNEL_COPY_ENGINE, NVA0B5_LAUNCH_DMA, 1);
    nvPushSetMethodData
        (p,
         DRF_DEF(A0B5, _LAUNCH_DMA, _DATA_TRANSFER_TYPE, _PIPELINED) |
         DRF_DEF(A0B5, _LAUNCH_DMA, _FLUSH_ENABLE,       _TRUE)      |
         DRF_DEF(A0B5, _LAUNCH_DMA, _SEMAPHORE_TYPE,     _NONE)      |
         DRF_DEF(A0B5, _LAUNCH_DMA, _INTERRUPT_TYPE,     _NONE)      |
         DRF_DEF(A0B5, _LAUNCH_DMA, _REMAP_ENABLE,       _TRUE)      |
         DRF_DEF(A0B5, _LAUNCH_DMA, _SRC_MEMORY_LAYOUT,  _PITCH)     |
         DRF_DEF(A0B5, _LAUNCH_DMA, _DST_MEMORY_LAYOUT,  _PITCH)     |
         DRF_DEF(A0B5, _LAUNCH_DMA, _MULTI_LINE_ENABLE,  _TRUE)      |
         DRF_DEF(A0B5, _LAUNCH_DMA, _SRC_TYPE,           _VIRTUAL)   |
         DRF_DEF(A0B5, _LAUNCH_DMA, _DST_TYPE,           _VIRTUAL));

    /*
     * Reset semaphore value. A memory barrier will be issued by nvidia-push so
     * we don't need one here.
     */
    semaphore->data[0] = 0;

    /* Program a semaphore release after prefetch DMA copy. */
    nvPushMethod(p, 0, NVA06F_SEMAPHOREA, 4);
    nvPushSetMethodDataU64(p, semaphoreGPUAddress);
    nvPushSetMethodData(p, PREFETCH_DONE_VALUE);
    nvPushSetMethodData(p,
                        DRF_DEF(A06F, _SEMAPHORED, _OPERATION, _RELEASE) |
                        DRF_DEF(A06F, _SEMAPHORED, _RELEASE_WFI, _EN) |
                        DRF_DEF(A06F, _SEMAPHORED, _RELEASE_SIZE, _4BYTE));
    nvPushKickoff(p);

    /*
     * Errors and prefetch faults are handled as follows. If prefetch
     * succeeds the semaphore release will trigger and we will exit upon
     * seeing PREFETCH_DONE_VALUE in the memory location. Upon failure we
     * will end up timing out and signaling RM of the CE fault, and DIFR
     * will remain disabled until the next driver load.
     *
     * Currently the total launch-to-end effective (with scheduling)
     * prefetch rate on silicon seems to be around 15k pixels per
     * microsecond, empirically. Thus, the time will range from a couple of
     * hundred microseconds for a very small panel to slightly less than 2
     * milliseconds for a single 4k display. We'll wait for 100us at a time
     * and expect a realistic completion within a few milliseconds at most.
     */
    starttime = nvkms_get_usec();
    do {
        endtime = nvkms_get_usec();

        if (semaphore->data[0] == PREFETCH_DONE_VALUE) {
            return NV2080_CTRL_LPWR_DIFR_PREFETCH_SUCCESS;
        }

        nvkms_usleep(100);
    } while (endtime - starttime < DIFR_PREFETCH_WAIT_PERIOD_US); /* 10ms */

    return NV2080_CTRL_LPWR_DIFR_PREFETCH_FAIL_CE_HW_ERROR;
}

static NvBool PrefetchHelperSurfaceEvo(NVDIFRStateEvoPtr pDifr,
                                       size_t *cacheRemaining,
                                       NVSurfaceEvoPtr pSurfaceEvo,
                                       NvU32 *status)
{
    NVDIFRPrefetchParams params;

    nvAssert(*status == NV2080_CTRL_LPWR_DIFR_PREFETCH_SUCCESS);

    if (!pSurfaceEvo) {
        return TRUE;
    }

    if (pSurfaceEvo->noDisplayCaching) {
        *status = NV2080_CTRL_LPWR_DIFR_PREFETCH_FAIL_OS_FLIPS_ENABLED;
        return FALSE;
    }

    /*
     * If we see the same SurfaceEvo twice (UBB, multi-head X screens, etc)
     * we only ever want to prefetch it once within a single
     * nvDIFRPrefetchSurfaces() call.
     */
    if (pSurfaceEvo->difrLastPrefetchPass == pDifr->prefetchPass) {
        return TRUE;
    }

    /*
     * Update pass counter even if we fail later: we want to try each
     * surface only once.
     */
    pSurfaceEvo->difrLastPrefetchPass = pDifr->prefetchPass;

    /* Collect copy parameters and do the prefetch. */
    params.surfGpuAddress = pSurfaceEvo->gpuAddress;
    params.surfSizeBytes = pSurfaceEvo->planes[0].rmObjectSizeInBytes;
    params.surfPitchBytes = pSurfaceEvo->planes[0].pitch;
    params.surfFormat = pSurfaceEvo->format;

    if (pSurfaceEvo->layout == NvKmsSurfaceMemoryLayoutBlockLinear) {
        params.surfPitchBytes *= NVKMS_BLOCK_LINEAR_GOB_WIDTH;
    }

    *status = PrefetchSingleSurface(pDifr, &params, cacheRemaining);

    return *status == NV2080_CTRL_LPWR_DIFR_PREFETCH_SUCCESS;
}

static NvBool PrefetchHelperLutSurface(NVDIFRStateEvoPtr pDifr,
                                       size_t *cacheRemaining,
                                       NVLutSurfaceEvoPtr pLutSurface,
                                       NvU32 *status)
{
    NVDIFRPrefetchParams params;

    nvAssert(*status == NV2080_CTRL_LPWR_DIFR_PREFETCH_SUCCESS);

    if (!pLutSurface) {
        return TRUE;
    }

    /*
     * LUTs are often shared so we only want to prefetch (or consider) each
     * LUT at most once during the prefetch process.
     */
    if (pLutSurface->difrLastPrefetchPass == pDifr->prefetchPass) {
        return TRUE;
    }

    pLutSurface->difrLastPrefetchPass = pDifr->prefetchPass;

    /* Collect copy parameters and do the prefetch. */
    params.surfGpuAddress = (NvUPtr)pLutSurface->gpuAddress;
    params.surfSizeBytes = pLutSurface->size;
    params.surfPitchBytes = pLutSurface->size;
    params.surfFormat = NvKmsSurfaceMemoryFormatI8;

    *status = PrefetchSingleSurface(pDifr, &params, cacheRemaining);

    return *status == NV2080_CTRL_LPWR_DIFR_PREFETCH_SUCCESS;
}

/*
 * Set DIFR disabled state in H/W. Return TRUE if the requested disabled
 * state is in effect: either it was already set or it was successfully
 * signalled downstream.
 */
static NvBool SetDisabledState(NVDIFRStateEvoPtr pDifr,
                               NvBool shouldDisable)
{
    NVDevEvoPtr pDevEvo = pDifr->pDevEvo;
    NV2080_CTRL_CMD_LPWR_DIFR_CTRL_PARAMS params = { 0 };
    NvU32 ret;

    if (shouldDisable == pDifr->hwDisabled) {
        return TRUE;
    }

    params.ctrlParamVal = shouldDisable
        ? NV2080_CTRL_LPWR_DIFR_CTRL_DISABLE
        : NV2080_CTRL_LPWR_DIFR_CTRL_ENABLE;

    ret = nvRmApiControl(nvEvoGlobal.clientHandle,
                         pDevEvo->pSubDevices[0]->handle,
                         NV2080_CTRL_CMD_LPWR_DIFR_CTRL,
                         &params,
                         sizeof(params));

    if (ret != NV_OK) {
        return FALSE;
    }

    pDifr->hwDisabled = shouldDisable;

    return TRUE;
}

static NvBool IsCECompatibleWithDIFR(NVDevEvoPtr pDevEvo, NvU32 instance)
{
    NV2080_CTRL_CE_GET_CAPS_V2_PARAMS params;
    NvU32 ret;

    nvkms_memset(&params, 0, sizeof(params));
    params.ceEngineType = NV2080_ENGINE_TYPE_COPY(instance);

    ret = nvRmApiControl(nvEvoGlobal.clientHandle,
                         pDevEvo->pSubDevices[0]->handle,
                         NV2080_CTRL_CMD_CE_GET_CAPS_V2,
                         &params,
                         sizeof(params));

    if (ret != NVOS_STATUS_SUCCESS) {
        return FALSE;
    }

    ct_assert(sizeof(params.capsTbl) == NV2080_CTRL_CE_CAPS_TBL_SIZE);

    /* Current criteria: DIFR prefetches can't use graphics CEs. */
    if (NV2080_CTRL_CE_GET_CAP(params.capsTbl, NV2080_CTRL_CE_CAPS_CE_GRCE)) {
        return FALSE;
    }

    return TRUE;
}

/*
 * Make sure we have a pending idle timer to check back on idleness.
 */
static void EnsureIdleTimer(NVDIFRStateEvoPtr pDifr)
{
    if (!pDifr->idleTimer) {
        /* Wait 100x longer in emulation. */
        NvU64 idlePeriod =
            DIFR_IDLE_WAIT_PERIOD_US *
            (nvIsEmulationEvo(pDifr->pDevEvo) ? 100 : 1);

        pDifr->idleTimer =
            nvkms_alloc_timer(IdleTimerProc, pDifr, 0, idlePeriod);
    }
}

/*
 * An idle timer should always remain pending after a flip until further
 * flips cease and DIFR can be re-enabled.
 *
 * Currently we'll try to re-enable DIFR after a constant period of idleness
 * since the last flip but this could resonate badly with a client that's
 * rendering at the same pace.
 *
 * To avoid churn we could track the time DIFR actually did remain enabled.
 * If the enabled-period is relatively short against the disabled-period, we
 * should bump the timeout to re-enable so that we won't be retrying all the
 * time. Conversely, we should reset the bumped timeout after we actually
 * managed to sleep long enough with DIFR enabled.
 *
 * Note: There's the question of whether we should apply slight hysteresis
 * within NVKMS regarding enabling/disabling DIFR. The hardware itself does
 * some churn-limiting and practical observations show that it seems to work
 * sufficiently and I've not observed rapid, repeating prefetch requests.
 * Keeping this note here in case this matter needs to be revisited later.
 */
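
/*
 * A minimal sketch of the adaptive timeout idea above; a comment only, since
 * it is not implemented. It assumes two hypothetical fields in
 * NVDIFRStateEvoRec, lastEnableTime and idlePeriodBoost, which do not exist
 * today:
 *
 *     In SetDisabledState(), right after successfully enabling DIFR:
 *         pDifr->lastEnableTime = nvkms_get_usec();
 *
 *     In nvDIFRNotifyFlip(), when disabling DIFR again:
 *         if (nvkms_get_usec() - pDifr->lastEnableTime <
 *             DIFR_IDLE_WAIT_PERIOD_US) {
 *             pDifr->idlePeriodBoost *= 2;   // short enabled-period: back off
 *         } else {
 *             pDifr->idlePeriodBoost = 1;    // slept long enough: reset
 *         }
 *
 *     EnsureIdleTimer() and IdleTimerProc() would then multiply idlePeriod
 *     by idlePeriodBoost.
 */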
static void IdleTimerProc(void *dataPtr, NvU32 dataU32)
{
    NVDIFRStateEvoPtr pDifr = (NVDIFRStateEvoPtr)dataPtr;
    NvU64 now = nvkms_get_usec();
    NvU64 idlePeriod =
        DIFR_IDLE_WAIT_PERIOD_US *
        (nvIsEmulationEvo(pDifr->pDevEvo) ? 100 : 1);

    /* First free the timer that triggered us. */
    nvkms_free_timer(pDifr->idleTimer);
    pDifr->idleTimer = NULL;

    if (now - pDifr->lastFlipTime >= idlePeriod) {
        /* Enough time has passed with no new flips, enable DIFR. */
        SetDisabledState(pDifr, FALSE);
    } else {
        /* New flips have happened since the original one; reset the idle timer. */
        EnsureIdleTimer(pDifr);
    }
}