1 /****************************************************************************
2  * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * @file backend.h
24  *
25  * @brief Backend handles rasterization, pixel shading and output merger
26  *        operations.
27  *
28  ******************************************************************************/
29 #pragma once
30 
31 #include "tilemgr.h"
32 #include "state.h"
33 #include "context.h"
34 
35 
36 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2]);
37 void InitBackendSampleFuncTable(
38     PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2]);
39 
40 static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs,
41                                           SWR_PS_CONTEXT&          psContext);
42 
43 
// Identifiers for the backend function variants.
// SWR_BACKEND_FUNCS_MAX equals the number of variants and must remain the last entry.
enum SWR_BACKEND_FUNCS
{
    SWR_BACKEND_SINGLE_SAMPLE    = 0,
    SWR_BACKEND_MSAA_PIXEL_RATE  = 1,
    SWR_BACKEND_MSAA_SAMPLE_RATE = 2,
    SWR_BACKEND_FUNCS_MAX        = 3,
};
51 
#if KNOB_SIMD_WIDTH == 8
// Per-lane pixel offsets for the 4x2 pixel block handled by one 8-wide SIMD register.
// Lanes 0-3 address the left 2x2 quad (x in {0,1}) and lanes 4-7 the right 2x2 quad
// (x in {2,3}); within a quad the order is (x,y) = (0,0), (1,0), (0,1), (1,1).

// Offsets of each pixel's center (upper-left corner + 0.5).
static const __m256 vCenterOffsetsX = __m256{0.5f, 1.5f, 0.5f, 1.5f, 2.5f, 3.5f, 2.5f, 3.5f};
static const __m256 vCenterOffsetsY = __m256{0.5f, 0.5f, 1.5f, 1.5f, 0.5f, 0.5f, 1.5f, 1.5f};

// Offsets of each pixel's upper-left (UL) corner.
static const __m256 vULOffsetsX = __m256{0.0f, 1.0f, 0.0f, 1.0f, 2.0f, 3.0f, 2.0f, 3.0f};
static const __m256 vULOffsetsY = __m256{0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f};

// Keeps the low 8 bits of a coverage mask (one bit per SIMD lane).
#define MASK 0xff
#endif
59 
ComputeUserClipMask(uint8_t clipMask,float * pUserClipBuffer,simdscalar const & vI,simdscalar const & vJ)60 static INLINE simdmask ComputeUserClipMask(uint8_t           clipMask,
61                                            float*            pUserClipBuffer,
62                                            simdscalar const& vI,
63                                            simdscalar const& vJ)
64 {
65     simdscalar vClipMask       = _simd_setzero_ps();
66     uint32_t   numClipDistance = _mm_popcnt_u32(clipMask);
67 
68     for (uint32_t i = 0; i < numClipDistance; ++i)
69     {
70         // pull triangle clip distance values from clip buffer
71         simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++);
72         simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++);
73         simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++);
74 
75         // interpolate
76         simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ);
77 
78         // clip if interpolated clip distance is < 0 || NAN
79         simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ);
80 
81         vClipMask = _simd_or_ps(vClipMask, vCull);
82     }
83 
84     return _simd_movemask_ps(vClipMask);
85 }
86 
RasterTileColorOffset(uint32_t sampleNum)87 INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
88 {
89     static const uint32_t RasterTileColorOffsets[16]{
90         0,
91         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
92         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
93         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
94         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4,
95         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5,
96         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6,
97         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7,
98         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 8,
99         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 9,
100         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) *
101             10,
102         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) *
103             11,
104         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) *
105             12,
106         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) *
107             13,
108         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) *
109             14,
110         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) *
111             15,
112     };
113     assert(sampleNum < 16);
114     return RasterTileColorOffsets[sampleNum];
115 }
116 
RasterTileDepthOffset(uint32_t sampleNum)117 INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
118 {
119     static const uint32_t RasterTileDepthOffsets[16]{
120         0,
121         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
122         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
123         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
124         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4,
125         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5,
126         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6,
127         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7,
128         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 8,
129         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 9,
130         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) *
131             10,
132         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) *
133             11,
134         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) *
135             12,
136         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) *
137             13,
138         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) *
139             14,
140         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) *
141             15,
142     };
143     assert(sampleNum < 16);
144     return RasterTileDepthOffsets[sampleNum];
145 }
146 
RasterTileStencilOffset(uint32_t sampleNum)147 INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
148 {
149     static const uint32_t RasterTileStencilOffsets[16]{
150         0,
151         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
152         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
153             2,
154         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
155             3,
156         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
157             4,
158         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
159             5,
160         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
161             6,
162         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
163             7,
164         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
165             8,
166         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
167             9,
168         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
169             10,
170         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
171             11,
172         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
173             12,
174         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
175             13,
176         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
177             14,
178         (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
179             15,
180     };
181     assert(sampleNum < 16);
182     return RasterTileStencilOffsets[sampleNum];
183 }
184 
185 template <typename T, uint32_t InputCoverage>
186 struct generateInputCoverage
187 {
generateInputCoveragegenerateInputCoverage188     INLINE generateInputCoverage(const uint64_t* const coverageMask,
189                                  uint32_t (&inputMask)[KNOB_SIMD_WIDTH],
190                                  const uint32_t sampleMask)
191     {
192         // will need to update for avx512
193         assert(KNOB_SIMD_WIDTH == 8);
194 
195         simdscalari mask[2];
196         simdscalari sampleCoverage[2];
197 
198         if (T::bIsCenterPattern)
199         {
200             // center coverage is the same for all samples; just broadcast to the sample slots
201             uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
202             if (T::MultisampleT::numSamples == 1)
203             {
204                 sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
205             }
206             else if (T::MultisampleT::numSamples == 2)
207             {
208                 sampleCoverage[0] =
209                     _simd_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
210             }
211             else if (T::MultisampleT::numSamples == 4)
212             {
213                 sampleCoverage[0] = _simd_set_epi32(
214                     0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
215             }
216             else if (T::MultisampleT::numSamples == 8)
217             {
218                 sampleCoverage[0] = _simd_set1_epi32(centerCoverage);
219             }
220             else if (T::MultisampleT::numSamples == 16)
221             {
222                 sampleCoverage[0] = _simd_set1_epi32(centerCoverage);
223                 sampleCoverage[1] = _simd_set1_epi32(centerCoverage);
224             }
225         }
226         else
227         {
228             simdscalari src    = _simd_set1_epi32(0);
229             simdscalari index0 = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
230 
231             if (T::MultisampleT::numSamples == 1)
232             {
233                 mask[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
234             }
235             else if (T::MultisampleT::numSamples == 2)
236             {
237                 mask[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
238             }
239             else if (T::MultisampleT::numSamples == 4)
240             {
241                 mask[0] = _simd_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
242             }
243             else if (T::MultisampleT::numSamples == 8)
244             {
245                 mask[0] = _simd_set1_epi32(-1);
246             }
247             else if (T::MultisampleT::numSamples == 16)
248             {
249                 mask[0] = _simd_set1_epi32(-1);
250                 mask[1] = _simd_set1_epi32(-1);
251                 index1  = _simd_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
252             }
253 
254             // gather coverage for samples 0-7
255             sampleCoverage[0] =
256                 _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src),
257                                                             (const float*)coverageMask,
258                                                             index0,
259                                                             _mm256_castsi256_ps(mask[0]),
260                                                             8));
261             if (T::MultisampleT::numSamples > 8)
262             {
263                 // gather coverage for samples 8-15
264                 sampleCoverage[1] =
265                     _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src),
266                                                                 (const float*)coverageMask,
267                                                                 index1,
268                                                                 _mm256_castsi256_ps(mask[1]),
269                                                                 8));
270             }
271         }
272 
273         mask[0] = _mm256_set_epi8(-1,
274                                   -1,
275                                   -1,
276                                   -1,
277                                   -1,
278                                   -1,
279                                   -1,
280                                   -1,
281                                   -1,
282                                   -1,
283                                   -1,
284                                   -1,
285                                   0xC,
286                                   0x8,
287                                   0x4,
288                                   0x0,
289                                   -1,
290                                   -1,
291                                   -1,
292                                   -1,
293                                   -1,
294                                   -1,
295                                   -1,
296                                   -1,
297                                   -1,
298                                   -1,
299                                   -1,
300                                   -1,
301                                   0xC,
302                                   0x8,
303                                   0x4,
304                                   0x0);
305         // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
306         simdscalari packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
307 
308         simdscalari packedCoverage1;
309         if (T::MultisampleT::numSamples > 8)
310         {
311             // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit
312             // lane
313             packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
314         }
315 
316 #if (KNOB_ARCH == KNOB_ARCH_AVX)
317         // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
318         simdscalari hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
319         simdscalar  shufRes = _mm256_shuffle_ps(
320             _mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
321         packedCoverage0 = _mm256_castps_si256(
322             _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
323 
324         simdscalari packedSampleCoverage;
325         if (T::MultisampleT::numSamples > 8)
326         {
327             // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
328             hiToLow         = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
329             shufRes         = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow),
330                                         _mm256_castsi256_ps(hiToLow),
331                                         _MM_SHUFFLE(1, 1, 0, 1));
332             shufRes         = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
333             packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(
334                 _mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
335             packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(
336                 _mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
337         }
338         else
339         {
340             packedSampleCoverage = packedCoverage0;
341         }
342 #else
343         simdscalari permMask = _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
344         // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
345         packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
346 
347         simdscalari packedSampleCoverage;
348         if (T::MultisampleT::numSamples > 8)
349         {
350             permMask = _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
351             // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
352             packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
353 
354             // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
355             packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C);
356         }
357         else
358         {
359             packedSampleCoverage = packedCoverage0;
360         }
361 #endif
362 
363         for (int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
364         {
365             // convert packed sample coverage masks into single coverage masks for all samples for
366             // each pixel in the 4x2
367             inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);
368 
369             if (!T::bForcedSampleCount)
370             {
371                 // input coverage has to be anded with sample mask if MSAA isn't forced on
372                 inputMask[i] &= sampleMask;
373             }
374 
375             // shift to the next pixel in the 4x2
376             packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1);
377         }
378     }
379 
generateInputCoveragegenerateInputCoverage380     INLINE generateInputCoverage(const uint64_t* const coverageMask,
381                                  simdscalar&           inputCoverage,
382                                  const uint32_t        sampleMask)
383     {
384         uint32_t inputMask[KNOB_SIMD_WIDTH];
385         generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
386         inputCoverage = _simd_castsi_ps(_simd_set_epi32(inputMask[7],
387                                                         inputMask[6],
388                                                         inputMask[5],
389                                                         inputMask[4],
390                                                         inputMask[3],
391                                                         inputMask[2],
392                                                         inputMask[1],
393                                                         inputMask[0]));
394     }
395 };
396 
397 template <typename T>
398 struct generateInputCoverage<T, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>
399 {
400     INLINE generateInputCoverage(const uint64_t* const coverageMask,
401                                  simdscalar&           inputCoverage,
402                                  const uint32_t        sampleMask)
403     {
404         // will need to update for avx512
405         assert(KNOB_SIMD_WIDTH == 8);
406         simdscalari       vec = _simd_set1_epi32(coverageMask[0]);
407         const simdscalari bit = _simd_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
408         vec                   = _simd_and_si(vec, bit);
409         vec                   = _simd_cmplt_epi32(_simd_setzero_si(), vec);
410         vec                   = _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec);
411         inputCoverage         = _simd_castsi_ps(vec);
412     }
413 
414     INLINE generateInputCoverage(const uint64_t* const coverageMask,
415                                  uint32_t (&inputMask)[KNOB_SIMD_WIDTH],
416                                  const uint32_t sampleMask)
417     {
418         uint32_t              simdCoverage     = (coverageMask[0] & MASK);
419         static const uint32_t FullCoverageMask = (1 << T::MultisampleT::numSamples) - 1;
420         for (int i = 0; i < KNOB_SIMD_WIDTH; i++)
421         {
422             // set all samples to covered if conservative coverage mask is set for that pixel
423             inputMask[i] = (((1 << i) & simdCoverage) > 0) ? FullCoverageMask : 0;
424         }
425     }
426 };
427 
////////////////////////////////////////////////////////////////////////////////////////////////////
// Centroid behaves exactly as follows:
// (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel
//     center (even if the sample pattern does not happen to have a sample location there).
// (2) Else the attribute is evaluated at the first covered sample, in increasing order of
//     sample index, where sample coverage is after ANDing the coverage with the SampleMask
//     Rasterizer State.
// (3) If no samples are covered, such as on helper pixels executed off the bounds of a
//     primitive to fill out 2x2 pixel stamps, the attribute is evaluated as follows: if the
//     SampleMask Rasterizer State is a subset of the samples in the pixel, then the first
//     sample covered by the SampleMask Rasterizer State is the evaluation point. Otherwise
//     (full SampleMask), the pixel center is the evaluation point.
////////////////////////////////////////////////////////////////////////////////////////////////////
442 template <typename T>
443 INLINE void CalcCentroidPos(SWR_PS_CONTEXT&            psContext,
444                             const SWR_MULTISAMPLE_POS& samplePos,
445                             const uint64_t* const      coverageMask,
446                             const uint32_t             sampleMask,
447                             simdscalar const&          vXSamplePosUL,
448                             simdscalar const&          vYSamplePosUL)
449 {
450     uint32_t inputMask[KNOB_SIMD_WIDTH];
451     generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
452 
453     // Case (2) - partially covered pixel
454 
455     // scan for first covered sample per pixel in the 4x2 span
456     unsigned long sampleNum[KNOB_SIMD_WIDTH];
457     (inputMask[0] > 0) ? (_BitScanForward(&sampleNum[0], inputMask[0])) : (sampleNum[0] = 0);
458     (inputMask[1] > 0) ? (_BitScanForward(&sampleNum[1], inputMask[1])) : (sampleNum[1] = 0);
459     (inputMask[2] > 0) ? (_BitScanForward(&sampleNum[2], inputMask[2])) : (sampleNum[2] = 0);
460     (inputMask[3] > 0) ? (_BitScanForward(&sampleNum[3], inputMask[3])) : (sampleNum[3] = 0);
461     (inputMask[4] > 0) ? (_BitScanForward(&sampleNum[4], inputMask[4])) : (sampleNum[4] = 0);
462     (inputMask[5] > 0) ? (_BitScanForward(&sampleNum[5], inputMask[5])) : (sampleNum[5] = 0);
463     (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0);
464     (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0);
465 
466     // look up and set the sample offsets from UL pixel corner for first covered sample
467     simdscalar vXSample = _simd_set_ps(samplePos.X(sampleNum[7]),
468                                        samplePos.X(sampleNum[6]),
469                                        samplePos.X(sampleNum[5]),
470                                        samplePos.X(sampleNum[4]),
471                                        samplePos.X(sampleNum[3]),
472                                        samplePos.X(sampleNum[2]),
473                                        samplePos.X(sampleNum[1]),
474                                        samplePos.X(sampleNum[0]));
475 
476     simdscalar vYSample = _simd_set_ps(samplePos.Y(sampleNum[7]),
477                                        samplePos.Y(sampleNum[6]),
478                                        samplePos.Y(sampleNum[5]),
479                                        samplePos.Y(sampleNum[4]),
480                                        samplePos.Y(sampleNum[3]),
481                                        samplePos.Y(sampleNum[2]),
482                                        samplePos.Y(sampleNum[1]),
483                                        samplePos.Y(sampleNum[0]));
484     // add sample offset to UL pixel corner
485     vXSample = _simd_add_ps(vXSamplePosUL, vXSample);
486     vYSample = _simd_add_ps(vYSamplePosUL, vYSample);
487 
488     // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
489     static const simdscalari vFullyCoveredMask = T::MultisampleT::FullSampleMask();
490     simdscalari              vInputCoveragei   = _simd_set_epi32(inputMask[7],
491                                                   inputMask[6],
492                                                   inputMask[5],
493                                                   inputMask[4],
494                                                   inputMask[3],
495                                                   inputMask[2],
496                                                   inputMask[1],
497                                                   inputMask[0]);
498     simdscalari vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
499 
500     static const simdscalari vZero = _simd_setzero_si();
501     const simdscalari vSampleMask  = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
502     simdscalari       vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
503     simdscalari       vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
504     simdscalari       vCase3b           = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
505 
506     simdscalari vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b);
507 
508     // set the centroid position based on results from above
509     psContext.vX.centroid =
510         _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter));
511     psContext.vY.centroid =
512         _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter));
513 
514     // Case (3a) No samples covered and partial sample mask
515     simdscalari vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask);
516     // sample mask should never be all 0's for this case, but handle it anyways
517     unsigned long firstCoveredSampleMaskSample = 0;
518     (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask))
519                      : (firstCoveredSampleMaskSample = 0);
520 
521     simdscalari vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples);
522 
523     vXSample = _simd_set1_ps(samplePos.X(firstCoveredSampleMaskSample));
524     vYSample = _simd_set1_ps(samplePos.Y(firstCoveredSampleMaskSample));
525 
526     // blend in case 3a pixel locations
527     psContext.vX.centroid =
528         _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a));
529     psContext.vY.centroid =
530         _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a));
531 }
532 
533 INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs,
534                                      SWR_PS_CONTEXT&          psContext,
535                                      const simdscalar&        vXSamplePosUL,
536                                      const simdscalar&        vYSamplePosUL)
537 {
538     // evaluate I,J
539     psContext.vI.centroid =
540         vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid);
541     psContext.vJ.centroid =
542         vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid);
543     psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet);
544     psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet);
545 
546     // interpolate 1/w
547     psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW,
548                                             coeffs.vBOneOverW,
549                                             coeffs.vCOneOverW,
550                                             psContext.vI.centroid,
551                                             psContext.vJ.centroid);
552 }
553 
554 INLINE simdmask CalcDepthBoundsAcceptMask(simdscalar const& z, float minz, float maxz)
555 {
556     const simdscalar minzMask = _simd_cmpge_ps(z, _simd_set1_ps(minz));
557     const simdscalar maxzMask = _simd_cmple_ps(z, _simd_set1_ps(maxz));
558 
559     return _simd_movemask_ps(_simd_and_ps(minzMask, maxzMask));
560 }
561 
562 template <typename T>
563 INLINE uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount)
564 {
565     // RT has to be single sample if we're in forcedMSAA mode
566     if (T::bForcedSampleCount && (T::MultisampleT::sampleCount > SWR_MULTISAMPLE_1X))
567     {
568         return 1;
569     }
570     // unless we're forced to single sample, in which case we run the OM at the sample count of the
571     // RT
572     else if (T::bForcedSampleCount && (T::MultisampleT::sampleCount == SWR_MULTISAMPLE_1X))
573     {
574         return GetNumSamples(blendSampleCount);
575     }
576     // else we're in normal MSAA mode and rasterizer and OM are running at the same sample count
577     else
578     {
579         return T::MultisampleT::numSamples;
580     }
581 }
582 
583 inline void SetupBarycentricCoeffs(BarycentricCoeffs* coeffs, const SWR_TRIANGLE_DESC& work)
584 {
585     // broadcast scalars
586 
587     coeffs->vIa = _simd_broadcast_ss(&work.I[0]);
588     coeffs->vIb = _simd_broadcast_ss(&work.I[1]);
589     coeffs->vIc = _simd_broadcast_ss(&work.I[2]);
590 
591     coeffs->vJa = _simd_broadcast_ss(&work.J[0]);
592     coeffs->vJb = _simd_broadcast_ss(&work.J[1]);
593     coeffs->vJc = _simd_broadcast_ss(&work.J[2]);
594 
595     coeffs->vZa = _simd_broadcast_ss(&work.Z[0]);
596     coeffs->vZb = _simd_broadcast_ss(&work.Z[1]);
597     coeffs->vZc = _simd_broadcast_ss(&work.Z[2]);
598 
599     coeffs->vRecipDet = _simd_broadcast_ss(&work.recipDet);
600 
601     coeffs->vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
602     coeffs->vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
603     coeffs->vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
604 }
605 
606 inline void SetupRenderBuffers(uint8_t*             pColorBuffer[SWR_NUM_RENDERTARGETS],
607                                uint8_t**            pDepthBuffer,
608                                uint8_t**            pStencilBuffer,
609                                uint32_t             colorHotTileMask,
610                                RenderOutputBuffers& renderBuffers)
611 {
612     unsigned long index;
613     while (_BitScanForward(&index, colorHotTileMask))
614     {
615         assert(index < SWR_NUM_RENDERTARGETS);
616         colorHotTileMask &= ~(1 << index);
617         pColorBuffer[index] = renderBuffers.pColor[index];
618     }
619 
620     if (pDepthBuffer)
621     {
622         *pDepthBuffer = renderBuffers.pDepth;
623     }
624 
625     if (pStencilBuffer)
626     {
627         *pStencilBuffer = renderBuffers.pStencil;
628         ;
629     }
630 }
631 
632 INLINE void SetRenderHotTilesDirty(DRAW_CONTEXT* pDC, RenderOutputBuffers& renderBuffers)
633 {
634     const API_STATE& state = GetApiState(pDC);
635 
636     unsigned long rtSlot                 = 0;
637     uint32_t      colorHottileEnableMask = state.colorHottileEnable;
638     while (_BitScanForward(&rtSlot, colorHottileEnableMask))
639     {
640         colorHottileEnableMask &= ~(1 << rtSlot);
641         renderBuffers.pColorHotTile[rtSlot]->state = HOTTILE_DIRTY;
642     }
643 }
644 
645 template <typename T>
646 void SetupPixelShaderContext(SWR_PS_CONTEXT*            psContext,
647                              const SWR_MULTISAMPLE_POS& samplePos,
648                              SWR_TRIANGLE_DESC&         work)
649 {
650     psContext->pAttribs               = work.pAttribs;
651     psContext->pPerspAttribs          = work.pPerspAttribs;
652     psContext->frontFace              = work.triFlags.frontFacing;
653     psContext->renderTargetArrayIndex = work.triFlags.renderTargetArrayIndex;
654     psContext->viewportIndex          = work.triFlags.viewportIndex;
655 
656     // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull
657     // attribs
658     psContext->I = work.I;
659     psContext->J = work.J;
660 
661     psContext->recipDet = work.recipDet;
662     psContext->pRecipW  = work.pRecipW;
663     psContext->pSamplePosX =
664         samplePos.X(); // reinterpret_cast<const float *>(&T::MultisampleT::samplePosX);
665     psContext->pSamplePosY =
666         samplePos.Y(); // reinterpret_cast<const float *>(&T::MultisampleT::samplePosY);
667     psContext->rasterizerSampleCount = T::MultisampleT::numSamples;
668     psContext->sampleIndex           = 0;
669 }
670 
671 template <typename T, bool IsSingleSample>
672 void CalcCentroid(SWR_PS_CONTEXT*            psContext,
673                   const SWR_MULTISAMPLE_POS& samplePos,
674                   const BarycentricCoeffs&   coeffs,
675                   const uint64_t* const      coverageMask,
676                   uint32_t                   sampleMask)
677 {
678     if (IsSingleSample) // if (T::MultisampleT::numSamples == 1) // doesn't cut it, the centroid
679                         // positions are still different
680     {
681         // for 1x case, centroid is pixel center
682         psContext->vX.centroid        = psContext->vX.center;
683         psContext->vY.centroid        = psContext->vY.center;
684         psContext->vI.centroid        = psContext->vI.center;
685         psContext->vJ.centroid        = psContext->vJ.center;
686         psContext->vOneOverW.centroid = psContext->vOneOverW.center;
687     }
688     else
689     {
690         if (T::bCentroidPos)
691         {
692             ///@ todo: don't need to genererate input coverage 2x if input coverage and centroid
693             if (T::bIsCenterPattern)
694             {
695                 psContext->vX.centroid = _simd_add_ps(psContext->vX.UL, _simd_set1_ps(0.5f));
696                 psContext->vY.centroid = _simd_add_ps(psContext->vY.UL, _simd_set1_ps(0.5f));
697             }
698             else
699             {
700                 // add param: const uint32_t inputMask[KNOB_SIMD_WIDTH] to eliminate 'generate
701                 // coverage 2X'..
702                 CalcCentroidPos<T>(*psContext,
703                                    samplePos,
704                                    coverageMask,
705                                    sampleMask,
706                                    psContext->vX.UL,
707                                    psContext->vY.UL);
708             }
709 
710             CalcCentroidBarycentrics(coeffs, *psContext, psContext->vX.UL, psContext->vY.UL);
711         }
712         else
713         {
714             psContext->vX.centroid = psContext->vX.sample;
715             psContext->vY.centroid = psContext->vY.sample;
716         }
717     }
718 }
719 
/// @brief Functor that runs the per-sample coverage, depth bounds, user clip and depth/stencil
///        tests for one SIMD tile when the pixel shader runs at pixel rate.
///
/// The per-sample results (vZ, vCoverageMask, depthPassMask, stencilPassMask) are kept in public
/// members so the caller can reuse them after the call (BackendPixelRate reads them in its output
/// merger loop).
///
/// @tparam T backend traits; only T::MultisampleT::numCoverageSamples is used here
template <typename T>
struct PixelRateZTestLoop
{
    /// @brief Capture the inputs of the functor. Everything except DC, _workerId and
    ///        ClipDistanceMask is stored by reference, so the referenced objects must outlive the
    ///        functor. depthBuffer/stencilBuffer are references to the caller's pointers: when
    ///        the caller advances them, the functor sees the new values.
    PixelRateZTestLoop(DRAW_CONTEXT*            DC,
                       uint32_t                 _workerId,
                       const SWR_TRIANGLE_DESC& Work,
                       const BarycentricCoeffs& Coeffs,
                       const API_STATE&         apiState,
                       uint8_t*&                depthBuffer,
                       uint8_t*&                stencilBuffer,
                       const uint8_t            ClipDistanceMask) :
        pDC(DC),
        workerId(_workerId), work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState),
        samplePos(state.rastState.samplePositions), clipDistanceMask(ClipDistanceMask),
        pDepthBuffer(depthBuffer), pStencilBuffer(stencilBuffer){};

    /// @brief Test every coverage sample of the current SIMD tile.
    /// @param activeLanes      in: lanes to consider; out: ANDed with the lanes for which at
    ///                         least one sample passed the depth test
    /// @param psContext        shader context; vX/vY.sample are overwritten here and
    ///                         vI/vJ/vOneOverW.sample by CalcSampleBarycentrics for each tested
    ///                         sample; vZ is read when the PS writes oDepth
    /// @param BEDepthBucket    not referenced at the moment (the bucket it belongs to is commented
    ///                         out below)
    /// @param currentSimdIn8x8 byte index into each sample's coverage mask
    /// @return number of lane/sample pairs that passed depth and coverage
    INLINE
    uint32_t operator()(simdscalar&        activeLanes,
                        SWR_PS_CONTEXT&    psContext,
                        const CORE_BUCKETS BEDepthBucket,
                        uint32_t           currentSimdIn8x8 = 0)
    {

        uint32_t   statCount            = 0;
        simdscalar anyDepthSamplePassed = _simd_setzero_ps();
        for (uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
        {
            // view this sample's coverage mask as bytes and pick the byte for the current SIMD;
            // MASK is defined elsewhere
            const uint8_t* pCoverageMask = (uint8_t*)&work.coverageMask[sample];
            vCoverageMask[sample] =
                _simd_and_ps(activeLanes, _simd_vmask_ps(pCoverageMask[currentSimdIn8x8] & MASK));

            // nothing covered for this sample: clear all saved masks and move on.
            // vZ[sample] is intentionally left untouched in this case.
            if (!_simd_movemask_ps(vCoverageMask[sample]))
            {
                vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] =
                    _simd_setzero_ps();
                continue;
            }

            // offset depth/stencil buffers current sample
            uint8_t* pDepthSample   = pDepthBuffer + RasterTileDepthOffset(sample);
            uint8_t* pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);

            // depth bounds test: reject lanes whose stored depth lies outside [minz, maxz]
            if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
            {
                static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT,
                              "Unsupported depth hot tile format");

                const simdscalar z = _simd_load_ps(reinterpret_cast<const float*>(pDepthSample));

                const float minz = state.depthBoundsState.depthBoundsTestMinValue;
                const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;

                vCoverageMask[sample] =
                    _simd_and_ps(vCoverageMask[sample],
                                 _simd_vmask_ps(CalcDepthBoundsAcceptMask(z, minz, maxz)));
            }

            RDTSC_BEGIN(psContext.pBucketManager, BEBarycentric, pDC->drawId);

            // calculate per sample positions
            psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
            psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));

            // calc I & J per sample
            CalcSampleBarycentrics(coeffs, psContext);

            if (psState.writesODepth)
            {
                {
                    // broadcast and test oDepth(psContext.vZ) written from the PS for each sample
                    vZ[sample] = psContext.vZ;
                }
            }
            else
            {
                // interpolate z at the sample position, then quantize it
                vZ[sample] = vplaneps(
                    coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
                vZ[sample] = state.pfnQuantizeDepth(vZ[sample]);
            }

            RDTSC_END(psContext.pBucketManager, BEBarycentric, 0);

            ///@todo: perspective correct vs non-perspective correct clipping?
            // if clip distances are enabled, we need to interpolate for each sample
            if (clipDistanceMask)
            {
                uint8_t clipMask = ComputeUserClipMask(clipDistanceMask,
                                                       work.pUserClipBuffer,
                                                       psContext.vI.sample,
                                                       psContext.vJ.sample);

                // lanes flagged in clipMask are removed from coverage
                vCoverageMask[sample] =
                    _simd_and_ps(vCoverageMask[sample], _simd_vmask_ps(~clipMask));
            }

            // ZTest for this sample
            ///@todo Need to uncomment out this bucket.
            // RDTSC_BEGIN(psContext.pBucketManager, BEDepthBucket, pDC->drawId);
            depthPassMask[sample]   = vCoverageMask[sample];
            stencilPassMask[sample] = vCoverageMask[sample];
            depthPassMask[sample]   = DepthStencilTest(&state,
                                                     work.triFlags.frontFacing,
                                                     work.triFlags.viewportIndex,
                                                     vZ[sample],
                                                     pDepthSample,
                                                     vCoverageMask[sample],
                                                     pStencilSample,
                                                     &stencilPassMask[sample]);
            // RDTSC_END(psContext.pBucketManager, BEDepthBucket, 0);

            // early-exit if no pixels passed depth or earlyZ is forced on
            // (in both cases depth/stencil is written right here)
            if (psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample]))
            {
                DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
                                  &state.depthStencilState,
                                  work.triFlags.frontFacing,
                                  vZ[sample],
                                  pDepthSample,
                                  depthPassMask[sample],
                                  vCoverageMask[sample],
                                  pStencilSample,
                                  stencilPassMask[sample]);

                if (!_simd_movemask_ps(depthPassMask[sample]))
                {
                    continue;
                }
            }
            // accumulate the lanes that passed for any sample and count the passing lanes
            anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]);
            uint32_t statMask    = _simd_movemask_ps(depthPassMask[sample]);
            statCount += _mm_popcnt_u32(statMask);
        }

        activeLanes = _simd_and_ps(anyDepthSamplePassed, activeLanes);
        // return number of samples that passed depth and coverage
        return statCount;
    }

    // saved depth/stencil/coverage masks and interpolated Z used in OM and DepthWrite
    simdscalar vZ[T::MultisampleT::numCoverageSamples];
    simdscalar vCoverageMask[T::MultisampleT::numCoverageSamples];
    simdscalar depthPassMask[T::MultisampleT::numCoverageSamples];
    simdscalar stencilPassMask[T::MultisampleT::numCoverageSamples];

private:
    // functor inputs
    DRAW_CONTEXT* pDC;
    uint32_t      workerId; // stored but not referenced by the methods of this struct

    const SWR_TRIANGLE_DESC&   work;
    const BarycentricCoeffs&   coeffs;
    const API_STATE&           state;
    const SWR_PS_STATE&        psState;
    const SWR_MULTISAMPLE_POS& samplePos;
    const uint8_t              clipDistanceMask;
    // references to the caller's buffer pointers (see constructor)
    uint8_t*&                  pDepthBuffer;
    uint8_t*&                  pStencilBuffer;
};
878 
879 INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT& psContext)
880 {
881     // evaluate I,J
882     psContext.vI.center =
883         vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center);
884     psContext.vJ.center =
885         vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center);
886     psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet);
887     psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet);
888 
889     // interpolate 1/w
890     psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW,
891                                           coeffs.vBOneOverW,
892                                           coeffs.vCOneOverW,
893                                           psContext.vI.center,
894                                           psContext.vJ.center);
895 }
896 
897 static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs,
898                                           SWR_PS_CONTEXT&          psContext)
899 {
900     // evaluate I,J
901     psContext.vI.sample =
902         vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample);
903     psContext.vJ.sample =
904         vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample);
905     psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet);
906     psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet);
907 
908     // interpolate 1/w
909     psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW,
910                                           coeffs.vBOneOverW,
911                                           coeffs.vCOneOverW,
912                                           psContext.vI.sample,
913                                           psContext.vJ.sample);
914 }
915 
916 // Merge Output to 8x2 SIMD16 Tile Format
917 INLINE void OutputMerger8x2(DRAW_CONTEXT*   pDC,
918                             SWR_PS_CONTEXT& psContext,
919                             uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS],
920                             uint32_t               sample,
921                             const SWR_BLEND_STATE* pBlendState,
922                             const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS],
923                             simdscalar&       coverageMask,
924                             simdscalar const& depthPassMask,
925                             uint32_t          renderTargetMask,
926                             bool              useAlternateOffset,
927                             uint32_t          workerId)
928 {
929     // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
930     uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
931 
932     if (useAlternateOffset)
933     {
934         rasterTileColorOffset += sizeof(simdscalar);
935     }
936 
937     simdvector blendSrc;
938     simdvector blendOut;
939 
940     unsigned long rt;
941     while (_BitScanForward(&rt, renderTargetMask))
942     {
943         renderTargetMask &= ~(1 << rt);
944 
945         const SWR_RENDER_TARGET_BLEND_STATE* pRTBlend = &pBlendState->renderTarget[rt];
946 
947         simdscalar* pColorSample;
948         bool        hotTileEnable = !pRTBlend->writeDisableAlpha || !pRTBlend->writeDisableRed ||
949                              !pRTBlend->writeDisableGreen || !pRTBlend->writeDisableBlue;
950         if (hotTileEnable)
951         {
952             pColorSample = reinterpret_cast<simdscalar*>(pColorBase[rt] + rasterTileColorOffset);
953             blendSrc[0]  = pColorSample[0];
954             blendSrc[1]  = pColorSample[2];
955             blendSrc[2]  = pColorSample[4];
956             blendSrc[3]  = pColorSample[6];
957         }
958         else
959         {
960             pColorSample = nullptr;
961         }
962 
963         SWR_BLEND_CONTEXT blendContext = {0};
964         {
965             // pfnBlendFunc may not update all channels.  Initialize with PS output.
966             /// TODO: move this into the blend JIT.
967             blendOut = psContext.shaded[rt];
968 
969             blendContext.pBlendState = pBlendState;
970             blendContext.src         = &psContext.shaded[rt];
971             blendContext.src1        = &psContext.shaded[1];
972             blendContext.src0alpha   = reinterpret_cast<simdvector*>(&psContext.shaded[0].w);
973             blendContext.sampleNum   = sample;
974             blendContext.pDst        = &blendSrc;
975             blendContext.result      = &blendOut;
976             blendContext.oMask       = &psContext.oMask;
977             blendContext.pMask       = reinterpret_cast<simdscalari*>(&coverageMask);
978 
979             // Blend outputs and update coverage mask for alpha test
980             if (pfnBlendFunc[rt] != nullptr)
981             {
982                 pfnBlendFunc[rt](&blendContext);
983             }
984         }
985 
986         // Track alpha events
987         AR_EVENT(
988             AlphaInfoEvent(pDC->drawId, blendContext.isAlphaTested, blendContext.isAlphaBlended));
989 
990         // final write mask
991         simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
992 
993         ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
994         static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT,
995                       "Unsupported hot tile format");
996 
997         // store with color mask
998         if (!pRTBlend->writeDisableRed)
999         {
1000             _simd_maskstore_ps(reinterpret_cast<float*>(&pColorSample[0]), outputMask, blendOut.x);
1001         }
1002         if (!pRTBlend->writeDisableGreen)
1003         {
1004             _simd_maskstore_ps(reinterpret_cast<float*>(&pColorSample[2]), outputMask, blendOut.y);
1005         }
1006         if (!pRTBlend->writeDisableBlue)
1007         {
1008             _simd_maskstore_ps(reinterpret_cast<float*>(&pColorSample[4]), outputMask, blendOut.z);
1009         }
1010         if (!pRTBlend->writeDisableAlpha)
1011         {
1012             _simd_maskstore_ps(reinterpret_cast<float*>(&pColorSample[6]), outputMask, blendOut.w);
1013         }
1014     }
1015 }
1016 
/// @brief Pixel-rate backend: walks one tile in SIMD-tile steps, runs the pixel shader once per
///        pixel and broadcasts its result to the output merger samples.
///
/// Per SIMD tile the sequence is: skip if nothing is covered -> optional input coverage ->
/// pixel barycentrics and centroid -> (forced sample count only) mask lanes by the sample mask ->
/// early Z when T::bCanEarlyZ -> optional source depth interpolation -> pixel shader ->
/// drop discarded/oMask'd lanes -> late Z when !T::bCanEarlyZ -> output merger per sample ->
/// Endtile bookkeeping (shift coverage masks, advance buffer pointers).
///
/// @tparam T backend traits; reads bCanEarlyZ, bForcedSampleCount, bIsCenterPattern,
///           InputCoverage and MultisampleT
/// @param pDC           draw context
/// @param workerId      index into the thread pool's per-thread data
/// @param x             x coordinate where the walk starts
/// @param y             y coordinate where the walk starts
/// @param work          triangle descriptor; its coverage masks (coverageMask[],
///                      innerCoverageMask, anyCoveredSamples) are shifted in place as the tile
///                      is consumed
/// @param renderBuffers hot tile buffers; color hot tiles are marked dirty if any SIMD tile still
///                      had active lanes after the pixel shader
template <typename T>
void BackendPixelRate(DRAW_CONTEXT*        pDC,
                      uint32_t             workerId,
                      uint32_t             x,
                      uint32_t             y,
                      SWR_TRIANGLE_DESC&   work,
                      RenderOutputBuffers& renderBuffers)
{
    ///@todo: Need to move locals off stack to prevent __chkstk's from being generated for the
    /// backend


    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelRateBackend, pDC->drawId);
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, BESetup, pDC->drawId);

    const API_STATE& state = GetApiState(pDC);

    BarycentricCoeffs coeffs;
    SetupBarycentricCoeffs(&coeffs, work);

    SWR_CONTEXT* pContext    = pDC->pContext;
    void*        pWorkerData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;

    SWR_PS_CONTEXT             psContext;
    const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
    SetupPixelShaderContext<T>(&psContext, samplePos, work);

    uint8_t *pDepthBuffer, *pStencilBuffer;
    SetupRenderBuffers(psContext.pColorBuffer,
                       &pDepthBuffer,
                       &pStencilBuffer,
                       state.colorHottileEnable,
                       renderBuffers);

    bool isTileDirty = false;

    RDTSC_END(pDC->pContext->pBucketMgr, BESetup, 0);

    // The functor stores references to pDepthBuffer/pStencilBuffer, so the pointer advances done
    // at Endtile below are visible to it on the next call.
    PixelRateZTestLoop<T> PixelRateZTest(pDC,
                                         workerId,
                                         work,
                                         coeffs,
                                         state,
                                         pDepthBuffer,
                                         pStencilBuffer,
                                         state.backendState.clipDistanceMask);

    psContext.vY.UL     = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
    psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));

    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));

    for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
    {
        // restart x at the left edge of the tile for every row of SIMD tiles
        psContext.vX.UL     = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
        psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));

        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));

        for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
        {
            // set when the SIMD_TILE_X_DIM bit of xx is set; OutputMerger8x2 then offsets the
            // color address by sizeof(simdscalar) and the color pointers are advanced at Endtile
            const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);


            // declared without an initializer ahead of the first goto (presumably so that no
            // jump to Endtile crosses an initialization)
            simdscalar activeLanes;
            if (!(work.anyCoveredSamples & MASK))
            {
                goto Endtile;
            };
            activeLanes = _simd_vmask_ps(work.anyCoveredSamples & MASK);

            if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
            {
                // inner conservative input coverage uses its own mask, otherwise sample 0's
                const uint64_t* pCoverageMask =
                    (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
                        ? &work.innerCoverageMask
                        : &work.coverageMask[0];

                generateInputCoverage<T, T::InputCoverage>(
                    pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
            }

            RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId);

            CalcPixelBarycentrics(coeffs, psContext);

            CalcCentroid<T, false>(
                &psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);

            RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 0);

            if (T::bForcedSampleCount)
            {
                // candidate pixels (that passed coverage) will cause shader invocation if any bits
                // in the samplemask are set
                // (the comparison is a signed 32-bit greater-than against zero)
                const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(
                    _simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si()));
                activeLanes                  = _simd_and_ps(activeLanes, vSampleMask);
            }

            // Early-Z?
            if (T::bCanEarlyZ && !T::bForcedSampleCount)
            {
                uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest);
                UPDATE_STAT_BE(DepthPassCount, depthPassCount);
                AR_EVENT(EarlyDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
            }

            // if we have no covered samples that passed depth at this point, go to next tile
            if (!_simd_movemask_ps(activeLanes))
            {
                goto Endtile;
            };

            if (state.psState.usesSourceDepth)
            {
                RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEBarycentric, pDC->drawId);
                // interpolate and quantize z
                psContext.vZ = vplaneps(
                    coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
                psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
                RDTSC_END(pDC->pContext->pBucketMgr, BEBarycentric, 0);
            }

            // pixels that are currently active
            psContext.activeMask = _simd_castps_si(activeLanes);
            psContext.oMask      = T::MultisampleT::FullSampleMask();

            // execute pixel shader
            RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelShader, pDC->drawId);
            state.psState.pfnPixelShader(GetPrivateState(pDC), pWorkerData, &psContext);
            RDTSC_END(pDC->pContext->pBucketMgr, BEPixelShader, 0);

            // update stats
            // (counts the lanes that were active going into the shader)
            UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
            AR_EVENT(PSStats((HANDLE)&psContext.stats));

            // update active lanes to remove any discarded or oMask'd pixels
            activeLanes = _simd_castsi_ps(_simd_and_si(
                psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
            if (!_simd_movemask_ps(activeLanes))
            {
                goto Endtile;
            };

            // the tile counts as dirty as soon as any lane survives the shader, even if late Z
            // rejects everything afterwards
            isTileDirty = true;

            // late-Z
            if (!T::bCanEarlyZ && !T::bForcedSampleCount)
            {
                uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest);
                UPDATE_STAT_BE(DepthPassCount, depthPassCount);
                AR_EVENT(LateDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
            }

            // if we have no covered samples that passed depth at this point, skip OM and go to next
            // tile
            if (!_simd_movemask_ps(activeLanes))
            {
                goto Endtile;
            };

            // output merger
            // loop over all samples, broadcasting the results of the PS to all passing pixels
            for (uint32_t sample = 0; sample < GetNumOMSamples<T>(state.blendState.sampleCount);
                 sample++)
            {
                RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEOutputMerger, pDC->drawId);
                // center pattern does a single coverage/depth/stencil test, standard pattern tests
                // all samples
                uint32_t   coverageSampleNum = (T::bIsCenterPattern) ? 0 : sample;
                simdscalar coverageMask, depthMask;
                if (T::bForcedSampleCount)
                {
                    // no Z test was run in forced sample count mode; use the active lanes
                    coverageMask = depthMask = activeLanes;
                }
                else
                {
                    coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum];
                    depthMask = PixelRateZTest.depthPassMask[coverageSampleNum];
                    if (!_simd_movemask_ps(depthMask))
                    {
                        // stencil should already have been written in early/lateZ tests
                        RDTSC_END(pDC->pContext->pBucketMgr, BEOutputMerger, 0);
                        continue;
                    }
                }

                // broadcast the results of the PS to all passing pixels

                OutputMerger8x2(pDC,
                                psContext,
                                psContext.pColorBuffer,
                                sample,
                                &state.blendState,
                                state.pfnBlendFunc,
                                coverageMask,
                                depthMask,
                                state.psState.renderTargetMask,
                                useAlternateOffset,
                                workerId);


                // when early Z is forced, the Z test functor has already written depth/stencil;
                // otherwise write it now, after the output merger ran
                if (!state.psState.forceEarlyZ && !T::bForcedSampleCount)
                {
                    uint8_t* pDepthSample   = pDepthBuffer + RasterTileDepthOffset(sample);
                    uint8_t* pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);

                    DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
                                      &state.depthStencilState,
                                      work.triFlags.frontFacing,
                                      PixelRateZTest.vZ[coverageSampleNum],
                                      pDepthSample,
                                      depthMask,
                                      coverageMask,
                                      pStencilSample,
                                      PixelRateZTest.stencilPassMask[coverageSampleNum]);
                }
                RDTSC_END(pDC->pContext->pBucketMgr, BEOutputMerger, 0);
            }
        Endtile:
            // reached for every SIMD tile, including the skipped ones: consume this tile's bits
            // of the coverage masks and step the buffer pointers
            RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEEndTile, pDC->drawId);

            for (uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
            {
                work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
            }

            if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
            {
                work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
            }
            work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);

            // color pointers only move after an alternate-offset tile, and then by two SIMD
            // widths of pixels
            if (useAlternateOffset)
            {
                unsigned long rt;
                uint32_t rtMask = state.colorHottileEnable;
                while (_BitScanForward(&rt, rtMask))
                {
                    rtMask &= ~(1 << rt);
                    psContext.pColorBuffer[rt] +=
                        (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
                }
            }

            // depth and stencil pointers move by one SIMD width of pixels on every SIMD tile
            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
            pStencilBuffer +=
                (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;

            RDTSC_END(pDC->pContext->pBucketMgr, BEEndTile, 0);

            psContext.vX.UL     = _simd_add_ps(psContext.vX.UL, dx);
            psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
        }

        psContext.vY.UL     = _simd_add_ps(psContext.vY.UL, dy);
        psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
    }

    if (isTileDirty)
    {
        SetRenderHotTilesDirty(pDC, renderBuffers);
    }

    RDTSC_END(pDC->pContext->pBucketMgr, BEPixelRateBackend, 0);
}
1284 
1285 template <uint32_t sampleCountT = SWR_MULTISAMPLE_1X,
1286           uint32_t isCenter     = 0,
1287           uint32_t coverage     = 0,
1288           uint32_t centroid     = 0,
1289           uint32_t forced       = 0,
1290           uint32_t canEarlyZ    = 0
1291           >
1292 struct SwrBackendTraits
1293 {
1294     static const bool     bIsCenterPattern   = (isCenter == 1);
1295     static const uint32_t InputCoverage      = coverage;
1296     static const bool     bCentroidPos       = (centroid == 1);
1297     static const bool     bForcedSampleCount = (forced == 1);
1298     static const bool     bCanEarlyZ         = (canEarlyZ == 1);
1299     typedef MultisampleTraits<(SWR_MULTISAMPLE_COUNT)sampleCountT, bIsCenterPattern> MultisampleT;
1300 };
1301