1 /****************************************************************************
2  * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * @file backend.cpp
24  *
25  * @brief Backend handles rasterization, pixel shading and output merger
26  *        operations.
27  *
28  ******************************************************************************/
29 
30 #include <smmintrin.h>
31 
32 #include "backend.h"
33 #include "backend_impl.h"
34 #include "tilemgr.h"
35 #include "memory/tilingtraits.h"
36 #include "core/multisample.h"
37 
38 #include <algorithm>
39 
40 template <typename T>
BackendSampleRate(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t x,uint32_t y,SWR_TRIANGLE_DESC & work,RenderOutputBuffers & renderBuffers)41 void BackendSampleRate(DRAW_CONTEXT*        pDC,
42                        uint32_t             workerId,
43                        uint32_t             x,
44                        uint32_t             y,
45                        SWR_TRIANGLE_DESC&   work,
46                        RenderOutputBuffers& renderBuffers)
47 {
48     RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESampleRateBackend, pDC->drawId);
49     RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESetup, pDC->drawId);
50 
51     void* pWorkerData      = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
52     const API_STATE& state = GetApiState(pDC);
53 
54     BarycentricCoeffs coeffs;
55     SetupBarycentricCoeffs(&coeffs, work);
56 
57     SWR_PS_CONTEXT             psContext;
58     const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
59     SetupPixelShaderContext<T>(&psContext, samplePos, work);
60 
61     uint8_t *pDepthBuffer, *pStencilBuffer;
62     SetupRenderBuffers(psContext.pColorBuffer,
63                        &pDepthBuffer,
64                        &pStencilBuffer,
65                        state.colorHottileEnable,
66                        renderBuffers);
67 
68     bool isTileDirty = false;
69 
70     RDTSC_END(pDC->pContext->pBucketMgr, BESetup, 0);
71 
72     psContext.vY.UL     = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
73     psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
74 
75     const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
76 
77     for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
78     {
79         psContext.vX.UL     = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
80         psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
81 
82         const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
83 
84         for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
85         {
86             const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
87 
88 
89             if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
90             {
91                 const uint64_t* pCoverageMask =
92                     (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
93                         ? &work.innerCoverageMask
94                         : &work.coverageMask[0];
95 
96                 generateInputCoverage<T, T::InputCoverage>(
97                     pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
98             }
99 
100             RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId);
101 
102             CalcPixelBarycentrics(coeffs, psContext);
103 
104             CalcCentroid<T, false>(
105                 &psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
106 
107             RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 0);
108 
109             for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
110             {
111                 simdmask coverageMask = work.coverageMask[sample] & MASK;
112 
113                 if (coverageMask)
114                 {
115                     // offset depth/stencil buffers current sample
116                     uint8_t* pDepthSample   = pDepthBuffer + RasterTileDepthOffset(sample);
117                     uint8_t* pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
118 
119                     if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
120                     {
121                         static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT,
122                                       "Unsupported depth hot tile format");
123 
124                         const simdscalar z =
125                             _simd_load_ps(reinterpret_cast<const float*>(pDepthSample));
126 
127                         const float minz = state.depthBoundsState.depthBoundsTestMinValue;
128                         const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
129 
130                         coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
131                     }
132 
133                     RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId);
134 
135                     // calculate per sample positions
136                     psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
137                     psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));
138 
139                     CalcSampleBarycentrics(coeffs, psContext);
140 
141                     // interpolate and quantize z
142                     psContext.vZ = vplaneps(coeffs.vZa,
143                                             coeffs.vZb,
144                                             coeffs.vZc,
145                                             psContext.vI.sample,
146                                             psContext.vJ.sample);
147                     psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
148 
149                     RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 0);
150 
151                     // interpolate user clip distance if available
152                     if (state.backendState.clipDistanceMask)
153                     {
154                         coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask,
155                                                              work.pUserClipBuffer,
156                                                              psContext.vI.sample,
157                                                              psContext.vJ.sample);
158                     }
159 
160                     simdscalar vCoverageMask   = _simd_vmask_ps(coverageMask);
161                     simdscalar depthPassMask   = vCoverageMask;
162                     simdscalar stencilPassMask = vCoverageMask;
163 
164                     // Early-Z?
165                     if (T::bCanEarlyZ)
166                     {
167                         RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEarlyDepthTest, pDC->drawId);
168                         depthPassMask = DepthStencilTest(&state,
169                                                          work.triFlags.frontFacing,
170                                                          work.triFlags.viewportIndex,
171                                                          psContext.vZ,
172                                                          pDepthSample,
173                                                          vCoverageMask,
174                                                          pStencilSample,
175                                                          &stencilPassMask);
176                         AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask),
177                                                                  _simd_movemask_ps(stencilPassMask),
178                                                                  _simd_movemask_ps(vCoverageMask)));
179                         RDTSC_END(pDC->pContext->pBucketMgr, BEEarlyDepthTest, 0);
180 
181                         // early-exit if no samples passed depth or earlyZ is forced on.
182                         if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
183                         {
184                             DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
185                                               &state.depthStencilState,
186                                               work.triFlags.frontFacing,
187                                               psContext.vZ,
188                                               pDepthSample,
189                                               depthPassMask,
190                                               vCoverageMask,
191                                               pStencilSample,
192                                               stencilPassMask);
193 
194                             if (!_simd_movemask_ps(depthPassMask))
195                             {
196                                 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
197                                 continue;
198                             }
199                         }
200                     }
201 
202                     psContext.sampleIndex = sample;
203                     psContext.activeMask  = _simd_castps_si(vCoverageMask);
204 
205                     // execute pixel shader
206                     RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelShader, pDC->drawId);
207                     state.psState.pfnPixelShader(GetPrivateState(pDC), pWorkerData, &psContext);
208                     RDTSC_END(pDC->pContext->pBucketMgr, BEPixelShader, 0);
209 
210                     // update stats
211                     UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
212                     AR_EVENT(PSStats((HANDLE)&psContext.stats));
213 
214                     vCoverageMask = _simd_castsi_ps(psContext.activeMask);
215 
216                     if (_simd_movemask_ps(vCoverageMask))
217                     {
218                         isTileDirty = true;
219                     }
220 
221                     // late-Z
222                     if (!T::bCanEarlyZ)
223                     {
224                         RDTSC_BEGIN(pDC->pContext->pBucketMgr, BELateDepthTest, pDC->drawId);
225                         depthPassMask = DepthStencilTest(&state,
226                                                          work.triFlags.frontFacing,
227                                                          work.triFlags.viewportIndex,
228                                                          psContext.vZ,
229                                                          pDepthSample,
230                                                          vCoverageMask,
231                                                          pStencilSample,
232                                                          &stencilPassMask);
233                         AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask),
234                                                                 _simd_movemask_ps(stencilPassMask),
235                                                                 _simd_movemask_ps(vCoverageMask)));
236                         RDTSC_END(pDC->pContext->pBucketMgr, BELateDepthTest, 0);
237 
238                         if (!_simd_movemask_ps(depthPassMask))
239                         {
240                             // need to call depth/stencil write for stencil write
241                             DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
242                                               &state.depthStencilState,
243                                               work.triFlags.frontFacing,
244                                               psContext.vZ,
245                                               pDepthSample,
246                                               depthPassMask,
247                                               vCoverageMask,
248                                               pStencilSample,
249                                               stencilPassMask);
250 
251                             work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
252                             continue;
253                         }
254                     }
255 
256                     uint32_t statMask  = _simd_movemask_ps(depthPassMask);
257                     uint32_t statCount = _mm_popcnt_u32(statMask);
258                     UPDATE_STAT_BE(DepthPassCount, statCount);
259 
260                     // output merger
261                     RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEOutputMerger, pDC->drawId);
262 
263                     OutputMerger8x2(pDC,
264                                     psContext,
265                                     psContext.pColorBuffer,
266                                     sample,
267                                     &state.blendState,
268                                     state.pfnBlendFunc,
269                                     vCoverageMask,
270                                     depthPassMask,
271                                     state.psState.renderTargetMask,
272                                     useAlternateOffset,
273                                     workerId);
274 
275                     // do final depth write after all pixel kills
276                     if (!state.psState.forceEarlyZ)
277                     {
278                         DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
279                                           &state.depthStencilState,
280                                           work.triFlags.frontFacing,
281                                           psContext.vZ,
282                                           pDepthSample,
283                                           depthPassMask,
284                                           vCoverageMask,
285                                           pStencilSample,
286                                           stencilPassMask);
287                     }
288                     RDTSC_END(pDC->pContext->pBucketMgr, BEOutputMerger, 0);
289                 }
290                 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
291             }
292 
293         Endtile:
294             ATTR_UNUSED;
295 
296             RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEndTile, pDC->drawId);
297 
298             if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
299             {
300                 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
301             }
302 
303             if (useAlternateOffset)
304             {
305                 unsigned long rt;
306                 uint32_t rtMask = state.colorHottileEnable;
307                 while (_BitScanForward(&rt, rtMask))
308                 {
309                     rtMask &= ~(1 << rt);
310                     psContext.pColorBuffer[rt] +=
311                         (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
312                 }
313             }
314 
315             pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
316             pStencilBuffer +=
317                 (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
318 
319             RDTSC_END(pDC->pContext->pBucketMgr, BEEndTile, 0);
320 
321             psContext.vX.UL     = _simd_add_ps(psContext.vX.UL, dx);
322             psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
323         }
324 
325         psContext.vY.UL     = _simd_add_ps(psContext.vY.UL, dy);
326         psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
327     }
328 
329     if (isTileDirty)
330     {
331         SetRenderHotTilesDirty(pDC, renderBuffers);
332     }
333 
334     RDTSC_END(pDC->pContext->pBucketMgr, BESampleRateBackend, 0);
335 }
336 
337 // Recursive template used to auto-nest conditionals.  Converts dynamic enum function
338 // arguments to static template arguments.
339 template <uint32_t... ArgsT>
340 struct BEChooserSampleRate
341 {
342     // Last Arg Terminator
GetFuncBEChooserSampleRate343     static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
344     {
345         switch (tArg)
346         {
347         case SWR_BACKEND_MSAA_SAMPLE_RATE:
348             return BackendSampleRate<SwrBackendTraits<ArgsT...>>;
349             break;
350         case SWR_BACKEND_SINGLE_SAMPLE:
351         case SWR_BACKEND_MSAA_PIXEL_RATE:
352             SWR_ASSERT(0 && "Invalid backend func\n");
353             return nullptr;
354             break;
355         default:
356             SWR_ASSERT(0 && "Invalid backend func\n");
357             return nullptr;
358             break;
359         }
360     }
361 
362     // Recursively parse args
363     template <typename... TArgsT>
GetFuncBEChooserSampleRate364     static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
365     {
366         switch (tArg)
367         {
368         case SWR_INPUT_COVERAGE_NONE:
369             return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(
370                 remainingArgs...);
371             break;
372         case SWR_INPUT_COVERAGE_NORMAL:
373             return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(
374                 remainingArgs...);
375             break;
376         case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE:
377             return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(
378                 remainingArgs...);
379             break;
380         default:
381             SWR_ASSERT(0 && "Invalid sample pattern\n");
382             return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(
383                 remainingArgs...);
384             break;
385         }
386     }
387 
388     // Recursively parse args
389     template <typename... TArgsT>
GetFuncBEChooserSampleRate390     static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
391     {
392         switch (tArg)
393         {
394         case SWR_MULTISAMPLE_1X:
395             return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
396             break;
397         case SWR_MULTISAMPLE_2X:
398             return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...);
399             break;
400         case SWR_MULTISAMPLE_4X:
401             return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...);
402             break;
403         case SWR_MULTISAMPLE_8X:
404             return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...);
405             break;
406         case SWR_MULTISAMPLE_16X:
407             return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...);
408             break;
409         default:
410             SWR_ASSERT(0 && "Invalid sample count\n");
411             return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
412             break;
413         }
414     }
415 
416     // Recursively parse args
417     template <typename... TArgsT>
GetFuncBEChooserSampleRate418     static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
419     {
420         if (tArg == true)
421         {
422             return BEChooserSampleRate<ArgsT..., 1>::GetFunc(remainingArgs...);
423         }
424 
425         return BEChooserSampleRate<ArgsT..., 0>::GetFunc(remainingArgs...);
426     }
427 };
428 
InitBackendSampleFuncTable(PFN_BACKEND_FUNC (& table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2])429 void InitBackendSampleFuncTable(
430     PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2])
431 {
432     for (uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT;
433          sampleCount++)
434     {
435         for (uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
436         {
437             for (uint32_t centroid = 0; centroid < 2; centroid++)
438             {
439                 for (uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
440                 {
441                     table[sampleCount][inputCoverage][centroid][canEarlyZ] =
442                         BEChooserSampleRate<>::GetFunc(
443                             (SWR_MULTISAMPLE_COUNT)sampleCount,
444                             false,
445                             (SWR_INPUT_COVERAGE)inputCoverage,
446                             (centroid > 0),
447                             false,
448                             (canEarlyZ > 0),
449                             (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
450                 }
451             }
452         }
453     }
454 }
455