1 /****************************************************************************
2  * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * @file backend.cpp
24  *
25  * @brief Backend handles rasterization, pixel shading and output merger
26  *        operations.
27  *
28  ******************************************************************************/
29 
30 #include <smmintrin.h>
31 
32 #include "backend.h"
33 #include "backend_impl.h"
34 #include "tilemgr.h"
35 #include "memory/tilingtraits.h"
36 #include "core/multisample.h"
37 
38 #include <algorithm>
39 
40 template <typename T>
BackendSingleSample(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t x,uint32_t y,SWR_TRIANGLE_DESC & work,RenderOutputBuffers & renderBuffers)41 void BackendSingleSample(DRAW_CONTEXT*        pDC,
42                          uint32_t             workerId,
43                          uint32_t             x,
44                          uint32_t             y,
45                          SWR_TRIANGLE_DESC&   work,
46                          RenderOutputBuffers& renderBuffers)
47 {
48     RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESingleSampleBackend, pDC->drawId);
49     RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESetup, pDC->drawId);
50 
51     void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
52 
53     const API_STATE& state = GetApiState(pDC);
54 
55     BarycentricCoeffs coeffs;
56     SetupBarycentricCoeffs(&coeffs, work);
57 
58     SWR_PS_CONTEXT             psContext;
59     const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
60     SetupPixelShaderContext<T>(&psContext, samplePos, work);
61 
62     uint8_t *pDepthBuffer, *pStencilBuffer;
63     SetupRenderBuffers(psContext.pColorBuffer,
64                        &pDepthBuffer,
65                        &pStencilBuffer,
66                        state.colorHottileEnable,
67                        renderBuffers);
68 
69     // Indicates backend rendered something to the color buffer
70     bool isTileDirty = false;
71 
72     RDTSC_END(pDC->pContext->pBucketMgr, BESetup, 1);
73 
74     psContext.vY.UL     = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
75     psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
76 
77     const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
78 
79     for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
80     {
81         psContext.vX.UL     = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
82         psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
83 
84         const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
85 
86         for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
87         {
88             const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
89 
90 
91             simdmask coverageMask = work.coverageMask[0] & MASK;
92 
93             if (coverageMask)
94             {
95                 if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
96                 {
97                     static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT,
98                                   "Unsupported depth hot tile format");
99 
100                     const simdscalar z =
101                         _simd_load_ps(reinterpret_cast<const float*>(pDepthBuffer));
102 
103                     const float minz = state.depthBoundsState.depthBoundsTestMinValue;
104                     const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
105 
106                     coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
107                 }
108 
109                 if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
110                 {
111                     const uint64_t* pCoverageMask =
112                         (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
113                             ? &work.innerCoverageMask
114                             : &work.coverageMask[0];
115 
116                     generateInputCoverage<T, T::InputCoverage>(
117                         pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
118                 }
119 
120                 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId);
121 
122                 CalcPixelBarycentrics(coeffs, psContext);
123 
124                 CalcCentroid<T, true>(
125                     &psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
126 
127                 // interpolate and quantize z
128                 psContext.vZ = vplaneps(
129                     coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
130                 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
131 
132                 RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 1);
133 
134                 // interpolate user clip distance if available
135                 if (state.backendState.clipDistanceMask)
136                 {
137                     coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask,
138                                                          work.pUserClipBuffer,
139                                                          psContext.vI.center,
140                                                          psContext.vJ.center);
141                 }
142 
143                 simdscalar vCoverageMask   = _simd_vmask_ps(coverageMask);
144                 simdscalar depthPassMask   = vCoverageMask;
145                 simdscalar stencilPassMask = vCoverageMask;
146 
147                 // Early-Z?
148                 if (T::bCanEarlyZ)
149                 {
150                     RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEarlyDepthTest, pDC->drawId);
151                     depthPassMask = DepthStencilTest(&state,
152                                                      work.triFlags.frontFacing,
153                                                      work.triFlags.viewportIndex,
154                                                      psContext.vZ,
155                                                      pDepthBuffer,
156                                                      vCoverageMask,
157                                                      pStencilBuffer,
158                                                      &stencilPassMask);
159                     AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask),
160                                                                _simd_movemask_ps(stencilPassMask),
161                                                                _simd_movemask_ps(vCoverageMask)));
162                     RDTSC_END(pDC->pContext->pBucketMgr, BEEarlyDepthTest, 0);
163 
164                     // early-exit if no pixels passed depth or earlyZ is forced on
165                     if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
166                     {
167                         DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
168                                           &state.depthStencilState,
169                                           work.triFlags.frontFacing,
170                                           psContext.vZ,
171                                           pDepthBuffer,
172                                           depthPassMask,
173                                           vCoverageMask,
174                                           pStencilBuffer,
175                                           stencilPassMask);
176 
177                         if (!_simd_movemask_ps(depthPassMask))
178                         {
179                             goto Endtile;
180                         }
181                     }
182                 }
183 
184                 psContext.sampleIndex = 0;
185                 psContext.activeMask  = _simd_castps_si(vCoverageMask);
186 
187                 // execute pixel shader
188                 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelShader, pDC->drawId);
189                 state.psState.pfnPixelShader(GetPrivateState(pDC), pWorkerData, &psContext);
190                 RDTSC_END(pDC->pContext->pBucketMgr, BEPixelShader, 0);
191 
192                 // update stats
193                 UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
194                 AR_EVENT(PSStats((HANDLE)&psContext.stats));
195 
196                 vCoverageMask = _simd_castsi_ps(psContext.activeMask);
197 
198                 if (_simd_movemask_ps(vCoverageMask))
199                 {
200                     isTileDirty = true;
201                 }
202 
203                 // late-Z
204                 if (!T::bCanEarlyZ)
205                 {
206                     RDTSC_BEGIN(pDC->pContext->pBucketMgr, BELateDepthTest, pDC->drawId);
207                     depthPassMask = DepthStencilTest(&state,
208                                                      work.triFlags.frontFacing,
209                                                      work.triFlags.viewportIndex,
210                                                      psContext.vZ,
211                                                      pDepthBuffer,
212                                                      vCoverageMask,
213                                                      pStencilBuffer,
214                                                      &stencilPassMask);
215                     AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask),
216                                                               _simd_movemask_ps(stencilPassMask),
217                                                               _simd_movemask_ps(vCoverageMask)));
218                     RDTSC_END(pDC->pContext->pBucketMgr, BELateDepthTest, 0);
219 
220                     if (!_simd_movemask_ps(depthPassMask))
221                     {
222                         // need to call depth/stencil write for stencil write
223                         DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
224                                           &state.depthStencilState,
225                                           work.triFlags.frontFacing,
226                                           psContext.vZ,
227                                           pDepthBuffer,
228                                           depthPassMask,
229                                           vCoverageMask,
230                                           pStencilBuffer,
231                                           stencilPassMask);
232                         goto Endtile;
233                     }
234                 }
235                 else
236                 {
237                     // for early z, consolidate discards from shader
238                     // into depthPassMask
239                     depthPassMask = _simd_and_ps(depthPassMask, vCoverageMask);
240                 }
241 
242                 uint32_t statMask  = _simd_movemask_ps(depthPassMask);
243                 uint32_t statCount = _mm_popcnt_u32(statMask);
244                 UPDATE_STAT_BE(DepthPassCount, statCount);
245 
246                 // output merger
247                 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEOutputMerger, pDC->drawId);
248 
249                 OutputMerger8x2(pDC,
250                                 psContext,
251                                 psContext.pColorBuffer,
252                                 0,
253                                 &state.blendState,
254                                 state.pfnBlendFunc,
255                                 vCoverageMask,
256                                 depthPassMask,
257                                 state.psState.renderTargetMask,
258                                 useAlternateOffset,
259                                 workerId);
260 
261                 // do final depth write after all pixel kills
262                 if (!state.psState.forceEarlyZ)
263                 {
264                     DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
265                                       &state.depthStencilState,
266                                       work.triFlags.frontFacing,
267                                       psContext.vZ,
268                                       pDepthBuffer,
269                                       depthPassMask,
270                                       vCoverageMask,
271                                       pStencilBuffer,
272                                       stencilPassMask);
273                 }
274                 RDTSC_END(pDC->pContext->pBucketMgr, BEOutputMerger, 0);
275             }
276 
277         Endtile:
278             RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEndTile, pDC->drawId);
279 
280             work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
281             if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
282             {
283                 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
284             }
285 
286             if (useAlternateOffset)
287             {
288                 unsigned long rt;
289                 uint32_t rtMask = state.colorHottileEnable;
290                 while (_BitScanForward(&rt, rtMask))
291                 {
292                     rtMask &= ~(1 << rt);
293                     psContext.pColorBuffer[rt] +=
294                         (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
295                 }
296             }
297 
298             pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
299             pStencilBuffer +=
300                 (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
301 
302             RDTSC_END(pDC->pContext->pBucketMgr, BEEndTile, 0);
303 
304             psContext.vX.UL     = _simd_add_ps(psContext.vX.UL, dx);
305             psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
306         }
307 
308         psContext.vY.UL     = _simd_add_ps(psContext.vY.UL, dy);
309         psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
310     }
311 
312     if (isTileDirty)
313     {
314         SetRenderHotTilesDirty(pDC, renderBuffers);
315     }
316 
317     RDTSC_END(pDC->pContext->pBucketMgr, BESingleSampleBackend, 0);
318 }
319 
320 // Recursive template used to auto-nest conditionals.  Converts dynamic enum function
321 // arguments to static template arguments.
322 template <uint32_t... ArgsT>
323 struct BEChooserSingleSample
324 {
325     // Last Arg Terminator
GetFuncBEChooserSingleSample326     static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
327     {
328         switch (tArg)
329         {
330         case SWR_BACKEND_SINGLE_SAMPLE:
331             return BackendSingleSample<SwrBackendTraits<ArgsT...>>;
332             break;
333         case SWR_BACKEND_MSAA_PIXEL_RATE:
334         case SWR_BACKEND_MSAA_SAMPLE_RATE:
335         default:
336             SWR_ASSERT(0 && "Invalid backend func\n");
337             return nullptr;
338             break;
339         }
340     }
341 
342     // Recursively parse args
343     template <typename... TArgsT>
GetFuncBEChooserSingleSample344     static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
345     {
346         switch (tArg)
347         {
348         case SWR_INPUT_COVERAGE_NONE:
349             return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(
350                 remainingArgs...);
351             break;
352         case SWR_INPUT_COVERAGE_NORMAL:
353             return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(
354                 remainingArgs...);
355             break;
356         case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE:
357             return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(
358                 remainingArgs...);
359             break;
360         default:
361             SWR_ASSERT(0 && "Invalid sample pattern\n");
362             return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(
363                 remainingArgs...);
364             break;
365         }
366     }
367 
368     // Recursively parse args
369     template <typename... TArgsT>
GetFuncBEChooserSingleSample370     static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
371     {
372         switch (tArg)
373         {
374         case SWR_MULTISAMPLE_1X:
375             return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
376             break;
377         case SWR_MULTISAMPLE_2X:
378             return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...);
379             break;
380         case SWR_MULTISAMPLE_4X:
381             return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...);
382             break;
383         case SWR_MULTISAMPLE_8X:
384             return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...);
385             break;
386         case SWR_MULTISAMPLE_16X:
387             return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...);
388             break;
389         default:
390             SWR_ASSERT(0 && "Invalid sample count\n");
391             return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
392             break;
393         }
394     }
395 
396     // Recursively parse args
397     template <typename... TArgsT>
GetFuncBEChooserSingleSample398     static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
399     {
400         if (tArg == true)
401         {
402             return BEChooserSingleSample<ArgsT..., 1>::GetFunc(remainingArgs...);
403         }
404 
405         return BEChooserSingleSample<ArgsT..., 0>::GetFunc(remainingArgs...);
406     }
407 };
408 
InitBackendSingleFuncTable(PFN_BACKEND_FUNC (& table)[SWR_INPUT_COVERAGE_COUNT][2][2])409 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2])
410 {
411     for (uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
412     {
413         for (uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
414         {
415             for (uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
416             {
417                 table[inputCoverage][isCentroid][canEarlyZ] =
418                     BEChooserSingleSample<>::GetFunc(SWR_MULTISAMPLE_1X,
419                                                      false,
420                                                      (SWR_INPUT_COVERAGE)inputCoverage,
421                                                      (isCentroid > 0),
422                                                      false,
423                                                      (canEarlyZ > 0),
424                                                      SWR_BACKEND_SINGLE_SAMPLE);
425             }
426         }
427     }
428 }
429