1 /****************************************************************************
2  * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * @file backend.cpp
24  *
25  * @brief Backend handles rasterization, pixel shading and output merger
26  *        operations.
27  *
28  ******************************************************************************/
29 
30 #include <smmintrin.h>
31 
32 #include "backend.h"
33 #include "backend_impl.h"
34 #include "tilemgr.h"
35 #include "memory/tilingtraits.h"
36 #include "core/multisample.h"
37 #include "backends/gen_BackendPixelRate.hpp"
38 
39 #include <algorithm>
40 
41 
42 //////////////////////////////////////////////////////////////////////////
43 /// @brief Process compute work.
44 /// @param pDC - pointer to draw context (dispatch).
45 /// @param workerId - The unique worker ID that is assigned to this thread.
46 /// @param threadGroupId - the linear index for the thread group within the dispatch.
ProcessComputeBE(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t threadGroupId,void * & pSpillFillBuffer,void * & pScratchSpace)47 void ProcessComputeBE(DRAW_CONTEXT* pDC,
48                       uint32_t      workerId,
49                       uint32_t      threadGroupId,
50                       void*&        pSpillFillBuffer,
51                       void*&        pScratchSpace)
52 {
53     SWR_CONTEXT* pContext = pDC->pContext;
54 
55     RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEDispatch, pDC->drawId);
56 
57     const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData();
58     SWR_ASSERT(pTaskData != nullptr);
59 
60     // Ensure spill fill memory has been allocated.
61     size_t spillFillSize = pDC->pState->state.totalSpillFillSize;
62     if (spillFillSize && pSpillFillBuffer == nullptr)
63     {
64         pSpillFillBuffer = pDC->pArena->AllocAlignedSync(spillFillSize, KNOB_SIMD16_BYTES);
65     }
66 
67     size_t scratchSpaceSize =
68         pDC->pState->state.scratchSpaceSizePerWarp * pDC->pState->state.scratchSpaceNumWarps;
69     if (scratchSpaceSize && pScratchSpace == nullptr)
70     {
71         pScratchSpace = pDC->pArena->AllocAlignedSync(scratchSpaceSize, KNOB_SIMD16_BYTES);
72     }
73 
74     const API_STATE& state = GetApiState(pDC);
75 
76     SWR_CS_CONTEXT csContext{0};
77     csContext.tileCounter         = threadGroupId;
78     csContext.dispatchDims[0]     = pTaskData->threadGroupCountX;
79     csContext.dispatchDims[1]     = pTaskData->threadGroupCountY;
80     csContext.dispatchDims[2]     = pTaskData->threadGroupCountZ;
81     csContext.pTGSM               = pContext->ppScratch[workerId];
82     csContext.pSpillFillBuffer    = (uint8_t*)pSpillFillBuffer;
83     csContext.pScratchSpace       = (uint8_t*)pScratchSpace;
84     csContext.scratchSpacePerWarp = pDC->pState->state.scratchSpaceSizePerWarp;
85 
86     state.pfnCsFunc(GetPrivateState(pDC),
87                     pContext->threadPool.pThreadData[workerId].pWorkerPrivateData,
88                     &csContext);
89 
90     UPDATE_STAT_BE(CsInvocations, state.totalThreadsInGroup);
91     AR_EVENT(CSStats((HANDLE)&csContext.stats));
92 
93     RDTSC_END(pDC->pContext->pBucketMgr, BEDispatch, 1);
94 }
95 
//////////////////////////////////////////////////////////////////////////
/// @brief Process shutdown. Intentionally empty: none of the parameters
///        are read.
/// @param pDC - pointer to draw context (dispatch); unused.
/// @param workerId - The unique worker ID that is assigned to this thread; unused.
/// @param macroTile - macrotile ID of the work item; unused.
/// @param pUserData - opaque per-work-item data; unused.
void ProcessShutdownBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData)
{
    // Dummy function
}
105 
ProcessSyncBE(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroTile,void * pUserData)106 void ProcessSyncBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData)
107 {
108     uint32_t x, y;
109     MacroTileMgr::getTileIndices(macroTile, x, y);
110     SWR_ASSERT(x == 0 && y == 0);
111 }
112 
ProcessStoreTileBE(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroTile,STORE_TILES_DESC * pDesc,SWR_RENDERTARGET_ATTACHMENT attachment)113 void ProcessStoreTileBE(DRAW_CONTEXT*               pDC,
114                         uint32_t                    workerId,
115                         uint32_t                    macroTile,
116                         STORE_TILES_DESC*           pDesc,
117                         SWR_RENDERTARGET_ATTACHMENT attachment)
118 {
119     SWR_CONTEXT* pContext           = pDC->pContext;
120     HANDLE       hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
121 
122     RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEStoreTiles, pDC->drawId);
123 
124     SWR_FORMAT srcFormat;
125     switch (attachment)
126     {
127     case SWR_ATTACHMENT_COLOR0:
128     case SWR_ATTACHMENT_COLOR1:
129     case SWR_ATTACHMENT_COLOR2:
130     case SWR_ATTACHMENT_COLOR3:
131     case SWR_ATTACHMENT_COLOR4:
132     case SWR_ATTACHMENT_COLOR5:
133     case SWR_ATTACHMENT_COLOR6:
134     case SWR_ATTACHMENT_COLOR7:
135         srcFormat = KNOB_COLOR_HOT_TILE_FORMAT;
136         break;
137     case SWR_ATTACHMENT_DEPTH:
138         srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT;
139         break;
140     case SWR_ATTACHMENT_STENCIL:
141         srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT;
142         break;
143     default:
144         SWR_INVALID("Unknown attachment: %d", attachment);
145         srcFormat = KNOB_COLOR_HOT_TILE_FORMAT;
146         break;
147     }
148 
149     uint32_t x, y;
150     MacroTileMgr::getTileIndices(macroTile, x, y);
151 
152     // Only need to store the hottile if it's been rendered to...
153     HOTTILE* pHotTile =
154         pContext->pHotTileMgr->GetHotTileNoLoad(pContext, pDC, macroTile, attachment, false);
155     if (pHotTile)
156     {
157         // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
158         if (pHotTile->state == HOTTILE_CLEAR)
159         {
160             PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[srcFormat];
161             SWR_ASSERT(pfnClearTiles != nullptr);
162 
163             pfnClearTiles(pDC,
164                           hWorkerPrivateData,
165                           attachment,
166                           macroTile,
167                           pHotTile->renderTargetArrayIndex,
168                           pHotTile->clearData,
169                           pDesc->rect);
170         }
171 
172         if (pHotTile->state == HOTTILE_DIRTY ||
173             pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY)
174         {
175             int32_t destX = KNOB_MACROTILE_X_DIM * x;
176             int32_t destY = KNOB_MACROTILE_Y_DIM * y;
177 
178             pContext->pfnStoreTile(pDC,
179                                    hWorkerPrivateData,
180                                    srcFormat,
181                                    attachment,
182                                    destX,
183                                    destY,
184                                    pHotTile->renderTargetArrayIndex,
185                                    pHotTile->pBuffer);
186         }
187 
188         if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED)
189         {
190             if (!(pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY &&
191                   pHotTile->state == HOTTILE_RESOLVED))
192             {
193                 pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState;
194             }
195         }
196     }
197     RDTSC_END(pDC->pContext->pBucketMgr, BEStoreTiles, 1);
198 }
199 
ProcessStoreTilesBE(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroTile,void * pData)200 void ProcessStoreTilesBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData)
201 {
202     STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pData;
203 
204     unsigned long rt   = 0;
205     uint32_t      mask = pDesc->attachmentMask;
206     while (_BitScanForward(&rt, mask))
207     {
208         mask &= ~(1 << rt);
209         ProcessStoreTileBE(pDC, workerId, macroTile, pDesc, (SWR_RENDERTARGET_ATTACHMENT)rt);
210     }
211 }
212 
ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroTile,void * pData)213 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT* pDC,
214                                      uint32_t      workerId,
215                                      uint32_t      macroTile,
216                                      void*         pData)
217 {
218     DISCARD_INVALIDATE_TILES_DESC* pDesc    = (DISCARD_INVALIDATE_TILES_DESC*)pData;
219     SWR_CONTEXT*                   pContext = pDC->pContext;
220 
221     const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
222 
223     for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i)
224     {
225         if (pDesc->attachmentMask & (1 << i))
226         {
227             HOTTILE* pHotTile =
228                 pContext->pHotTileMgr->GetHotTileNoLoad(pContext,
229                                                         pDC,
230                                                         macroTile,
231                                                         (SWR_RENDERTARGET_ATTACHMENT)i,
232                                                         pDesc->createNewTiles,
233                                                         numSamples);
234             if (pHotTile)
235             {
236                 HOTTILE_STATE newState = (HOTTILE_STATE)pDesc->newTileState;;
237                 if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_CLEAR)
238                 {
239                     if (newState == HOTTILE_INVALID)
240                     {
241                         // This is OK for APIs that explicitly allow discards
242                         // (for e.g. depth / stencil data)
243                         //SWR_INVALID("Discarding valid data!");
244                     }
245                 }
246                 pHotTile->state = newState;
247             }
248         }
249     }
250 }
251 
//////////////////////////////////////////////////////////////////////////
/// @brief Null pixel shader backend: walks one raster tile in SIMD-tile
///        steps and, for each active sample with coverage, interpolates z
///        and runs the depth/stencil test and write. No pixel shader
///        function is called on this path.
/// @tparam sampleCountT - multisample count this instantiation is registered
///         under in gBackendNullPs; not referenced in the body (the sample
///         loop is driven by state.blendState.sampleMask).
/// @param pDC - pointer to draw context.
/// @param workerId - The unique worker ID that is assigned to this thread;
///        not read directly in this body
///        (NOTE(review): presumably consumed by the stat/event macros - confirm).
/// @param x - x origin of the tile; the inner loop covers [x, x + KNOB_TILE_X_DIM).
/// @param y - y origin of the tile; the outer loop covers [y, y + KNOB_TILE_Y_DIM).
/// @param work - triangle descriptor. Its per-sample coverageMask entries are
///        shifted right as each SIMD tile is consumed, so it is modified.
/// @param renderBuffers - passed to SetupRenderBuffers to obtain the depth and
///        stencil buffer pointers.
template <uint32_t sampleCountT>
void BackendNullPS(DRAW_CONTEXT*        pDC,
                   uint32_t             workerId,
                   uint32_t             x,
                   uint32_t             y,
                   SWR_TRIANGLE_DESC&   work,
                   RenderOutputBuffers& renderBuffers)
{
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BENullBackend, pDC->drawId);
    ///@todo: handle center multisample pattern
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESetup, pDC->drawId);

    const API_STATE& state = GetApiState(pDC);

    BarycentricCoeffs coeffs;
    SetupBarycentricCoeffs(&coeffs, work);

    // Only the depth and stencil pointers are requested; NULL is passed for the
    // first output (NOTE(review): presumably the color buffers - confirm).
    uint8_t *pDepthBuffer, *pStencilBuffer;
    SetupRenderBuffers(NULL, &pDepthBuffer, &pStencilBuffer, 0, renderBuffers);

    // psContext is left uninitialized; below only the sample positions, the
    // sample barycentrics and vZ are used.
    SWR_PS_CONTEXT psContext;
    // skip SetupPixelShaderContext(&psContext, ...); // not needed here

    RDTSC_END(pDC->pContext->pBucketMgr, BESetup, 0);

    // Y positions for the first row of SIMD tiles; advanced by dy per outer iteration.
    simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));

    const simdscalar           dy        = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
    const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
    for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
    {
        // X positions restart at the tile's left edge for every row; advanced by dx below.
        simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));

        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));

        for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
        {
            // iterate over active samples
            unsigned long sample     = 0;
            uint32_t      sampleMask = state.blendState.sampleMask;
            while (_BitScanForward(&sample, sampleMask))
            {
                // clear the bit of the sample being processed
                sampleMask &= ~(1 << sample);

                // Coverage bits of the current SIMD tile; the consumed bits are
                // shifted out at the end of this iteration.
                simdmask coverageMask = work.coverageMask[sample] & MASK;

                if (coverageMask)
                {
                    // offset depth/stencil buffers current sample
                    uint8_t* pDepthSample   = pDepthBuffer + RasterTileDepthOffset(sample);
                    uint8_t* pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);

                    // Optional depth bounds test against the depth values already
                    // stored in the hot tile.
                    if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
                    {
                        // the raw float load below requires a 32-bit float depth format
                        static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT,
                                      "Unsupported depth hot tile format");

                        const simdscalar z =
                            _simd_load_ps(reinterpret_cast<const float*>(pDepthSample));

                        const float minz = state.depthBoundsState.depthBoundsTestMinValue;
                        const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;

                        coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
                    }

                    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId);

                    // calculate per sample positions
                    psContext.vX.sample = _simd_add_ps(vXSamplePosUL, samplePos.vX(sample));
                    psContext.vY.sample = _simd_add_ps(vYSamplePosUL, samplePos.vY(sample));

                    // NOTE(review): presumably fills psContext.vI.sample / vJ.sample,
                    // which are read right below - confirm.
                    CalcSampleBarycentrics(coeffs, psContext);

                    // interpolate and quantize z
                    psContext.vZ = vplaneps(coeffs.vZa,
                                            coeffs.vZb,
                                            coeffs.vZc,
                                            psContext.vI.sample,
                                            psContext.vJ.sample);
                    psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);

                    RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 0);

                    // interpolate user clip distance if available
                    if (state.backendState.clipDistanceMask)
                    {
                        // lanes set in the returned user clip mask are removed from coverage
                        coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask,
                                                             work.pUserClipBuffer,
                                                             psContext.vI.sample,
                                                             psContext.vJ.sample);
                    }

                    // The stencil pass mask starts as the coverage mask and is passed
                    // to the test by pointer.
                    simdscalar vCoverageMask   = _simd_vmask_ps(coverageMask);
                    simdscalar stencilPassMask = vCoverageMask;

                    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEarlyDepthTest, pDC->drawId);
                    simdscalar depthPassMask = DepthStencilTest(&state,
                                                                work.triFlags.frontFacing,
                                                                work.triFlags.viewportIndex,
                                                                psContext.vZ,
                                                                pDepthSample,
                                                                vCoverageMask,
                                                                pStencilSample,
                                                                &stencilPassMask);
                    AR_EVENT(EarlyDepthStencilInfoNullPS(_simd_movemask_ps(depthPassMask),
                                                         _simd_movemask_ps(stencilPassMask),
                                                         _simd_movemask_ps(vCoverageMask)));
                    DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
                                      &state.depthStencilState,
                                      work.triFlags.frontFacing,
                                      psContext.vZ,
                                      pDepthSample,
                                      depthPassMask,
                                      vCoverageMask,
                                      pStencilSample,
                                      stencilPassMask);
                    RDTSC_END(pDC->pContext->pBucketMgr, BEEarlyDepthTest, 0);

                    // count the lanes that passed the depth test for the stats
                    uint32_t statMask  = _simd_movemask_ps(depthPassMask);
                    uint32_t statCount = _mm_popcnt_u32(statMask);
                    UPDATE_STAT_BE(DepthPassCount, statCount);
                }

            // No goto in this function targets this label
            // (NOTE(review): ATTR_UNUSED presumably silences the unused-label warning - confirm).
            Endtile:
                ATTR_UNUSED;
                // drop the coverage bits of the SIMD tile just processed
                work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
            }

            // step both buffers forward by KNOB_SIMD_WIDTH elements of their format
            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
            pStencilBuffer +=
                (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;

            vXSamplePosUL = _simd_add_ps(vXSamplePosUL, dx);
        }

        vYSamplePosUL = _simd_add_ps(vYSamplePosUL, dy);
    }

    RDTSC_END(pDC->pContext->pBucketMgr, BENullBackend, 0);
}
393 
394 PFN_CLEAR_TILES  gClearTilesTable[NUM_SWR_FORMATS] = {};
395 PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
396 PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT][2] // centroid
397                                      [2]                           // canEarlyZ
398     = {};
399 PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT][2] // isCenterPattern
400                                        [SWR_INPUT_COVERAGE_COUNT][2]   // centroid
401                                        [2]                             // forcedSampleCount
402                                        [2]                             // canEarlyZ
403     = {};
404 PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT]
405                                         [2] // centroid
406                                         [2] // canEarlyZ
407     = {};
408 
InitBackendFuncTables()409 void InitBackendFuncTables()
410 {
411     InitBackendPixelRate();
412     InitBackendSingleFuncTable(gBackendSingleSample);
413     InitBackendSampleFuncTable(gBackendSampleRateTable);
414 
415     gBackendNullPs[SWR_MULTISAMPLE_1X]  = &BackendNullPS<SWR_MULTISAMPLE_1X>;
416     gBackendNullPs[SWR_MULTISAMPLE_2X]  = &BackendNullPS<SWR_MULTISAMPLE_2X>;
417     gBackendNullPs[SWR_MULTISAMPLE_4X]  = &BackendNullPS<SWR_MULTISAMPLE_4X>;
418     gBackendNullPs[SWR_MULTISAMPLE_8X]  = &BackendNullPS<SWR_MULTISAMPLE_8X>;
419     gBackendNullPs[SWR_MULTISAMPLE_16X] = &BackendNullPS<SWR_MULTISAMPLE_16X>;
420 }
421