1 /****************************************************************************
2  * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * @file rasterizer.cpp
24  *
25  * @brief Implementation for the rasterizer.
26  *
27  ******************************************************************************/
28 
#include <algorithm>
#include <cstring>
#include <vector>

#include "rasterizer.h"
#include "rdtsc_core.h"
#include "backend.h"
#include "utils.h"
#include "frontend.h"
#include "tilemgr.h"
#include "memory/tilingtraits.h"
39 
// Table of rasterizer worker-function variants; declared here, populated
// elsewhere in the project.
extern PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT]
                                     [STATE_VALID_TRI_EDGE_COUNT][2];

// Forward declarations for hot-tile fetch and raster-tile stepping helpers.
template <uint32_t numSamples = 1>
void GetRenderHotTiles(DRAW_CONTEXT*        pDC,
                       uint32_t             workerId,
                       uint32_t             macroID,
                       uint32_t             x,
                       uint32_t             y,
                       RenderOutputBuffers& renderBuffers,
                       uint32_t             renderTargetArrayIndex);
template <typename RT>
void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers& buffers);
template <typename RT>
void StepRasterTileY(uint32_t             colorHotTileMask,
                     RenderOutputBuffers& buffers,
                     RenderOutputBuffers& startBufferRow);
57 
// Expands a 4-bit mask (i3..i0, each 0 or 1) into a 4-lane __m256d
// initializer: a '1' bit becomes -1.0 (sign bit set), a '0' bit becomes 0.0
// (sign bit clear). Consumers such as _mm256_blendv_pd only inspect the sign
// bit of each lane. Lane order: i0 is the lowest lane, matching the bit order
// produced by _mm_movemask_ps/_mm256_movemask_pd.
#define MASKTOVEC(i3, i2, i1, i0) \
    {                             \
        -i0, -i1, -i2, -i3        \
    }
// All 16 possible 4-bit masks, indexed by the movemask-style integer value.
static const __m256d gMaskToVecpd[] = {
    MASKTOVEC(0, 0, 0, 0),
    MASKTOVEC(0, 0, 0, 1),
    MASKTOVEC(0, 0, 1, 0),
    MASKTOVEC(0, 0, 1, 1),
    MASKTOVEC(0, 1, 0, 0),
    MASKTOVEC(0, 1, 0, 1),
    MASKTOVEC(0, 1, 1, 0),
    MASKTOVEC(0, 1, 1, 1),
    MASKTOVEC(1, 0, 0, 0),
    MASKTOVEC(1, 0, 0, 1),
    MASKTOVEC(1, 0, 1, 0),
    MASKTOVEC(1, 0, 1, 1),
    MASKTOVEC(1, 1, 0, 0),
    MASKTOVEC(1, 1, 0, 1),
    MASKTOVEC(1, 1, 1, 0),
    MASKTOVEC(1, 1, 1, 1),
};
80 
//////////////////////////////////////////////////////////////////////////
/// @struct POS
/// @brief Simple signed 2D integer position; used below to construct
/// scissor-rectangle edge equations from SWR_RECT corners.
struct POS
{
    int32_t x, y;
};
85 
//////////////////////////////////////////////////////////////////////////
/// @struct EDGE
/// @brief Precomputed data for a single edge equation: the A/B coefficients
/// plus the constant deltas used to step the evaluated edge value between
/// adjacent quads and raster tiles (see ComputeEdgeData).
struct EDGE
{
    double a, b;            // a, b edge coefficients in fix8
    double stepQuadX;       // step to adjacent horizontal quad in fix16
    double stepQuadY;       // step to adjacent vertical quad in fix16
    double stepRasterTileX; // step to adjacent horizontal raster tile in fix16
    double stepRasterTileY; // step to adjacent vertical raster tile in fix16

    __m256d vQuadOffsets;       // offsets for 4 samples of a quad
    __m256d vRasterTileOffsets; // offsets for the 4 corners of a raster tile
};
97 
//////////////////////////////////////////////////////////////////////////
/// @brief rasterize a raster tile partially covered by the triangle
/// @param pDC - draw context (part of the common rasterizer call signature)
/// @param startEdges - edge equations evaluated at the corner of the raster tile
/// @param pRastEdges - per-edge coefficients, quad/tile step deltas and sample
///        offsets (see EDGE / ComputeEdgeData)
/// @return 64-bit coverage mask for the tile; each EVAL/UPDATE_MASK pair below
///         contributes 4 bits (one 2x2 quad) at the bit position given to
///         UPDATE_MASK, following the serpentine traversal order diagrammed below.
/// @tparam NumEdges - number of edge equations to test (triangle edges plus
///         any scissor edges)
/// @tparam EdgeMaskT - compile-time mask of which triangle edges are valid;
///         invalid (degenerate) edges are skipped by the masked unroller.
template <uint32_t NumEdges, typename EdgeMaskT>
INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT* pDC,
                                     double        startEdges[NumEdges],
                                     EDGE*         pRastEdges)
{
    uint64_t coverageMask = 0;

    // current edge values at the 4 samples of the current quad, plus the
    // per-quad step deltas in x and y
    __m256d vEdges[NumEdges];
    __m256d vStepX[NumEdges];
    __m256d vStepY[NumEdges];

    for (uint32_t e = 0; e < NumEdges; ++e)
    {
        // Step to the pixel sample locations of the 1st quad
        vEdges[e] = _mm256_add_pd(_mm256_set1_pd(startEdges[e]), pRastEdges[e].vQuadOffsets);

        // compute step to next quad (mul by 2 in x and y direction)
        vStepX[e] = _mm256_set1_pd(pRastEdges[e].stepQuadX);
        vStepY[e] = _mm256_set1_pd(pRastEdges[e].stepQuadY);
    }

    // fast unrolled version for 8x8 tile
#if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8
    int      edgeMask[NumEdges];
    uint64_t mask;

    // eval: capture the per-lane sign bits of each edge's 4 sample values;
    // update: AND the per-edge masks into the quad's 4-bit coverage nibble
    auto eval_lambda   = [&](int e) { edgeMask[e] = _mm256_movemask_pd(vEdges[e]); };
    auto update_lambda = [&](int e) { mask &= edgeMask[e]; };
    auto incx_lambda   = [&](int e) { vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]); };
    auto incy_lambda   = [&](int e) { vEdges[e] = _mm256_add_pd(vEdges[e], vStepY[e]); };
    auto decx_lambda   = [&](int e) { vEdges[e] = _mm256_sub_pd(vEdges[e], vStepX[e]); };

// evaluate which pixels in the quad are covered
#define EVAL UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda);

    // update coverage mask
    // if edge 0 is degenerate and will be skipped; init the mask
#define UPDATE_MASK(bit)                                                  \
    if (std::is_same<EdgeMaskT, E1E2ValidT>::value ||                     \
        std::is_same<EdgeMaskT, NoEdgesValidT>::value)                    \
    {                                                                     \
        mask = 0xf;                                                       \
    }                                                                     \
    else                                                                  \
    {                                                                     \
        mask = edgeMask[0];                                               \
    }                                                                     \
    UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \
    coverageMask |= (mask << bit);

    // step in the +x direction to the next quad
#define INCX UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incx_lambda);

    // step in the +y direction to the next quad
#define INCY UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda);

    // step in the -x direction to the next quad
#define DECX UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda);

    // sweep 2x2 quad back and forth through the raster tile,
    // computing coverage masks for the entire tile

    // raster tile
    // 0  1  2  3  4  5  6  7
    // x  x
    // x  x ------------------>
    //                   x  x  |
    // <-----------------x  x  V
    // ..

    // row 0 (quad rows cover 2 pixel rows each; bits advance by 4 per quad)
    EVAL;
    UPDATE_MASK(0);
    INCX;
    EVAL;
    UPDATE_MASK(4);
    INCX;
    EVAL;
    UPDATE_MASK(8);
    INCX;
    EVAL;
    UPDATE_MASK(12);
    INCY;

    // row 1 — swept right-to-left, so bit positions descend
    EVAL;
    UPDATE_MASK(28);
    DECX;
    EVAL;
    UPDATE_MASK(24);
    DECX;
    EVAL;
    UPDATE_MASK(20);
    DECX;
    EVAL;
    UPDATE_MASK(16);
    INCY;

    // row 2
    EVAL;
    UPDATE_MASK(32);
    INCX;
    EVAL;
    UPDATE_MASK(36);
    INCX;
    EVAL;
    UPDATE_MASK(40);
    INCX;
    EVAL;
    UPDATE_MASK(44);
    INCY;

    // row 3
    EVAL;
    UPDATE_MASK(60);
    DECX;
    EVAL;
    UPDATE_MASK(56);
    DECX;
    EVAL;
    UPDATE_MASK(52);
    DECX;
    EVAL;
    UPDATE_MASK(48);
#else
    // generic path for non-8x8 tiles: left-to-right scan per quad row
    uint32_t bit = 0;
    for (uint32_t y = 0; y < KNOB_TILE_Y_DIM / 2; ++y)
    {
        // remember row start so we can step down from it after the x sweep
        __m256d vStartOfRowEdge[NumEdges];
        for (uint32_t e = 0; e < NumEdges; ++e)
        {
            vStartOfRowEdge[e] = vEdges[e];
        }

        for (uint32_t x = 0; x < KNOB_TILE_X_DIM / 2; ++x)
        {
            int edgeMask[NumEdges];
            for (uint32_t e = 0; e < NumEdges; ++e)
            {
                edgeMask[e] = _mm256_movemask_pd(vEdges[e]);
            }

            // a sample's coverage bit is the AND of all edge sign bits
            uint64_t mask = edgeMask[0];
            for (uint32_t e = 1; e < NumEdges; ++e)
            {
                mask &= edgeMask[e];
            }
            coverageMask |= (mask << bit);

            // step to the next pixel in the x
            for (uint32_t e = 0; e < NumEdges; ++e)
            {
                vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);
            }
            bit += 4;
        }

        // step to the next row
        for (uint32_t e = 0; e < NumEdges; ++e)
        {
            vEdges[e] = _mm256_add_pd(vStartOfRowEdge[e], vStepY[e]);
        }
    }
#endif
    return coverageMask;
}
// Top left rule:
// Top: if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top' edge
// Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it
// is a 'left' edge.
// Top left: a sample is 'in' if it lies on a top or left edge.
// Out: !(horizontal && above) = !horizontal && below
// Out: !(!horizontal && left) = horizontal && right
//////////////////////////////////////////////////////////////////////////
/// @brief Nudges evaluated edge values so the later sign test honors the
/// top-left fill rule for samples lying exactly on an edge.
/// @param vA, vB - packed 32-bit A/B edge coefficients; lane i corresponds
///        to lane i of vEdge
/// @param vEdge - [in/out] 4 evaluated edge values (integral fix16 values
///        held in doubles)
INLINE void adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, __m256d& vEdge)
{
    // if vA < 0, vC--
    // if vA == 0 && vB < 0, vC--

    __m256d vEdgeOut    = vEdge;
    __m256d vEdgeAdjust = _mm256_sub_pd(vEdge, _mm256_set1_pd(1.0));

    // if vA < 0 (line is not horizontal and below)
    int msk = _mm_movemask_ps(_mm_castsi128_ps(vA));

    // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri)
    __m128i vCmp = _mm_cmpeq_epi32(vA, _mm_setzero_si128());
    int     msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp));
    msk2 &= _mm_movemask_ps(_mm_castsi128_ps(vB));

    // if either of these are true and we're on the line (edge == 0), bump it outside the line.
    // Edge values are integral, so subtracting 1 changes the sign only for lanes
    // that are exactly 0; gMaskToVecpd converts the combined 4-bit mask into a
    // per-lane sign-bit mask for the blend.
    vEdge = _mm256_blendv_pd(vEdgeOut, vEdgeAdjust, gMaskToVecpd[msk | msk2]);
}
296 
//////////////////////////////////////////////////////////////////////////
/// @brief calculates difference in precision between the result of manh
/// calculation and the edge precision, based on compile time trait values.
/// The returned bit count is used (as a shift or equivalent scale) to bring
/// the manhattan-distance offset back into edge fixed-point precision.
/// @tparam RT: rasterizer traits supplying the three precision trait types
template <typename RT>
constexpr int64_t ManhToEdgePrecisionAdjust()
{
    static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >=
                      RT::EdgePrecisionT::BitsT::value,
                  "Inadequate precision of result of manh calculation ");
    return ((RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value) -
            RT::EdgePrecisionT::BitsT::value);
}
309 
//////////////////////////////////////////////////////////////////////////
/// @struct adjustEdgeConservative
/// @brief Primary template: performs the conservative edge offset when
/// ConservativeEdgeOffsetT is non-zero. The <RT, 0> specialization below is
/// a no-op for the case where no offset is needed.
/// @tparam RT: rasterizer traits
/// @tparam ConservativeEdgeOffsetT: does the edge need offsetting?
template <typename RT, typename ConservativeEdgeOffsetT>
struct adjustEdgeConservative
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs calculations to adjust each edge of a triangle away
    /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
    /// direction.
    ///
    /// Uncertainty regions arise from fixed point rounding, which
    /// can snap a vertex +/- by min fixed point value.
    /// Adding 1/2 pixel in x/y bumps the edge equation tests out towards the pixel corners.
    /// This allows the rasterizer to test for coverage only at the pixel center,
    /// instead of having to test individual pixel corners for conservative coverage
    /// @param vAi, vBi - packed 32-bit A/B edge coefficients
    /// @param vEdge - [in/out] evaluated edge values to be pushed outward
    INLINE adjustEdgeConservative(const __m128i& vAi, const __m128i& vBi, __m256d& vEdge)
    {
        // Assumes CCW winding order. Subtracting from the evaluated edge equation moves the edge
        // away from the pixel center (in the direction of the edge normal A/B)

        // edge = Ax + Bx + C - (manh/e)
        // manh = manhattan distance = abs(A) + abs(B)
        // e = absolute rounding error from snapping from float to fixed point precision

        // 'fixed point' multiply (in double to be avx1 friendly)
        // need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example
        __m256d vAai = _mm256_cvtepi32_pd(_mm_abs_epi32(vAi)),
                vBai = _mm256_cvtepi32_pd(_mm_abs_epi32(vBi));
        __m256d manh =
            _mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)),
                          _mm256_mul_pd(vBai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)));

        static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >=
                          RT::EdgePrecisionT::BitsT::value,
                      "Inadequate precision of result of manh calculation ");

        // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the
        // same precision since we're doing fixed math in double format, multiply by multiples of
        // 1/2 instead of a bit shift right
        // NOTE(review): N * 0.5 equals a right shift by N bits (i.e. 2^-N) only
        // when N == 1 — confirm the precision traits guarantee
        // ManhToEdgePrecisionAdjust<RT>() == 1 here.
        manh = _mm256_mul_pd(manh, _mm256_set1_pd(ManhToEdgePrecisionAdjust<RT>() * 0.5));

        // move the edge away from the pixel center by the required conservative precision + 1/2
        // pixel this allows the rasterizer to do a single conservative coverage test to see if the
        // primitive intersects the pixel at all
        vEdge = _mm256_sub_pd(vEdge, manh);
    };
};
362 
//////////////////////////////////////////////////////////////////////////
/// @brief adjustEdgeConservative specialization where no edge offset is
/// needed (ConservativeEdgeOffsetT == 0); deliberately a no-op so it inlines
/// away entirely.
template <typename RT>
struct adjustEdgeConservative<RT, std::integral_constant<int32_t, 0>>
{
    INLINE adjustEdgeConservative(const __m128i& vAi, const __m128i& vBi, __m256d& vEdge){};
};
370 
371 //////////////////////////////////////////////////////////////////////////
372 /// @brief calculates the distance a degenerate BBox needs to be adjusted
373 /// for conservative rast based on compile time trait values
374 template <typename RT>
375 constexpr int64_t ConservativeScissorOffset()
376 {
377     static_assert(RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value >= 0,
378                   "Rasterizer precision > conservative precision");
379     // if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox
380     // when calculating scissor edges
381     typedef std::integral_constant<int32_t, (RT::ValidEdgeMaskT::value == ALL_EDGES_VALID) ? 0 : 1>
382         DegenerateEdgeOffsetT;
383     // 1/2 pixel edge offset + conservative offset - degenerateTriangle
384     return RT::ConservativeEdgeOffsetT::value -
385            (DegenerateEdgeOffsetT::value
386             << (RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value));
387 }
388 
389 //////////////////////////////////////////////////////////////////////////
390 /// @brief Performs calculations to adjust each a vector of evaluated edges out
391 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
392 /// direction.
393 template <typename RT>
394 INLINE void adjustScissorEdge(const double a, const double b, __m256d& vEdge)
395 {
396     int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
397     int64_t manh =
398         ((aabs * ConservativeScissorOffset<RT>()) + (babs * ConservativeScissorOffset<RT>())) >>
399         ManhToEdgePrecisionAdjust<RT>();
400     vEdge = _mm256_sub_pd(vEdge, _mm256_set1_pd(manh));
401 };
402 
403 //////////////////////////////////////////////////////////////////////////
404 /// @brief Performs calculations to adjust each a scalar evaluated edge out
405 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
406 /// direction.
407 template <typename RT, typename OffsetT>
408 INLINE double adjustScalarEdge(const double a, const double b, const double Edge)
409 {
410     int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
411     int64_t manh =
412         ((aabs * OffsetT::value) + (babs * OffsetT::value)) >> ManhToEdgePrecisionAdjust<RT>();
413     return (Edge - manh);
414 };
415 
//////////////////////////////////////////////////////////////////////////
/// @brief Perform any needed adjustments to evaluated triangle edges:
/// primary template applies the conservative edge offset first, then the
/// top-left fill rule. Only valid when conservative rast is enabled and the
/// edge equations are in x.16 fixed point (enforced by the static_asserts).
/// @tparam RT: rasterizer traits
/// @tparam EdgeOffsetT: conservative edge offset (the <RT, 0> specialization
///         below skips the conservative adjustment entirely)
template <typename RT, typename EdgeOffsetT>
struct adjustEdgesFix16
{
    /// @param vAi, vBi - packed 32-bit A/B edge coefficients
    /// @param vEdge - [in/out] evaluated edge values
    INLINE adjustEdgesFix16(const __m128i& vAi, const __m128i& vBi, __m256d& vEdge)
    {
        static_assert(
            std::is_same<typename RT::EdgePrecisionT, FixedPointTraits<Fixed_X_16>>::value,
            "Edge equation expected to be in x.16 fixed point");

        static_assert(RT::IsConservativeT::value,
                      "Edge offset assumes conservative rasterization is enabled");

        // need to apply any edge offsets before applying the top-left rule
        adjustEdgeConservative<RT, EdgeOffsetT>(vAi, vBi, vEdge);

        adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
    }
};
436 
//////////////////////////////////////////////////////////////////////////
/// @brief adjustEdgesFix16 specialization for a zero edge offset: only the
/// top-left fill-rule adjustment is applied (no conservative offset).
template <typename RT>
struct adjustEdgesFix16<RT, std::integral_constant<int32_t, 0>>
{
    INLINE adjustEdgesFix16(const __m128i& vAi, const __m128i& vBi, __m256d& vEdge)
    {
        adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
    }
};
447 
448 // max(abs(dz/dx), abs(dz,dy)
449 INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc)
450 {
451     /*
452     // evaluate i,j at (0,0)
453     float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
454     float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
455 
456     // evaluate i,j at (1,0)
457     float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2];
458     float j10 = pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2];
459 
460     // compute dz/dx
461     float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2];
462     float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2];
463     float dzdx = abs(d10 - d00);
464 
465     // evaluate i,j at (0,1)
466     float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2];
467     float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2];
468 
469     float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2];
470     float dzdy = abs(d01 - d00);
471     */
472 
473     // optimized version of above
474     float dzdx = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[0] + pDesc->Z[1] * pDesc->J[0]));
475     float dzdy = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[1] + pDesc->Z[1] * pDesc->J[1]));
476 
477     return std::max(dzdx, dzdy);
478 }
479 
480 INLINE float
481 ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z)
482 {
483     if (pState->depthFormat == R24_UNORM_X8_TYPELESS)
484     {
485         return (1.0f / (1 << 24));
486     }
487     else if (pState->depthFormat == R16_UNORM)
488     {
489         return (1.0f / (1 << 16));
490     }
491     else
492     {
493         SWR_ASSERT(pState->depthFormat == R32_FLOAT);
494 
495         // for f32 depth, factor = 2^(exponent(max(abs(z) - 23)
496         float    zMax    = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2])));
497         uint32_t zMaxInt = *(uint32_t*)&zMax;
498         zMaxInt &= 0x7f800000;
499         zMax = *(float*)&zMaxInt;
500 
501         return zMax * (1.0f / (1 << 23));
502     }
503 }
504 
505 INLINE float
506 ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z)
507 {
508     if (pState->depthBias == 0 && pState->slopeScaledDepthBias == 0)
509     {
510         return 0.0f;
511     }
512 
513     float scale = pState->slopeScaledDepthBias;
514     if (scale != 0.0f)
515     {
516         scale *= ComputeMaxDepthSlope(pTri);
517     }
518 
519     float bias = pState->depthBias;
520     if (!pState->depthBiasPreAdjusted)
521     {
522         bias *= ComputeBiasFactor(pState, pTri, z);
523     }
524     bias += scale;
525 
526     if (pState->depthBiasClamp > 0.0f)
527     {
528         bias = std::min(bias, pState->depthBiasClamp);
529     }
530     else if (pState->depthBiasClamp < 0.0f)
531     {
532         bias = std::max(bias, pState->depthBiasClamp);
533     }
534 
535     return bias;
536 }
537 
// Prevent DCE by writing coverage mask from rasterizer to volatile
#if KNOB_ENABLE_TOSS_POINTS
// NOTE(review): __declspec(thread) is MSVC-specific — confirm this path is
// only compiled with MSVC when KNOB_ENABLE_TOSS_POINTS is set.
__declspec(thread) volatile uint64_t gToss;
#endif

static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4;
// try to avoid _chkstk insertions; make this thread local
// Per-thread scratch for perspective-divided vertex attributes:
// 3 verts x SWR_VTX_NUM_SLOTS slots x 4 components.
static THREAD
OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * SWR_VTX_NUM_SLOTS * componentsPerAttrib];
547 
//////////////////////////////////////////////////////////////////////////
/// @brief Fills an EDGE with everything the rasterizer needs to step this
/// edge equation across the tile: the raw coefficients, constant quad/tile
/// step deltas, and the per-sample / per-corner offset vectors.
/// @param a, b - edge equation A/B coefficients in fix8 (see EDGE)
/// @param edge - [out] computed edge data
INLINE
void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge)
{
    edge.a = a;
    edge.b = b;

    // compute constant steps to adjacent quads
    // (coefficient * 2-pixel step in fix8 -> fix16 product)
    edge.stepQuadX = (double)((int64_t)a * (int64_t)(2 * FIXED_POINT_SCALE));
    edge.stepQuadY = (double)((int64_t)b * (int64_t)(2 * FIXED_POINT_SCALE));

    // compute constant steps to adjacent raster tiles
    edge.stepRasterTileX = (double)((int64_t)a * (int64_t)(KNOB_TILE_X_DIM * FIXED_POINT_SCALE));
    edge.stepRasterTileY = (double)((int64_t)b * (int64_t)(KNOB_TILE_Y_DIM * FIXED_POINT_SCALE));

    // compute quad offsets: pixel offsets (0,0) (1,0) (0,1) (1,1) of a 2x2
    // quad, in fix8
    const __m256d vQuadOffsetsXIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, 0, FIXED_POINT_SCALE, 0);
    const __m256d vQuadOffsetsYIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, FIXED_POINT_SCALE, 0, 0);

    // edge delta contributed by each quad sample offset (A*dx + B*dy)
    __m256d vQuadStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vQuadOffsetsXIntFix8);
    __m256d vQuadStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vQuadOffsetsYIntFix8);
    edge.vQuadOffsets       = _mm256_add_pd(vQuadStepXFix16, vQuadStepYFix16);

    // compute raster tile offsets: the 4 corner pixels of a raster tile
    // ((dim-1) pixels apart in each direction), in fix8
    const __m256d vTileOffsetsXIntFix8 = _mm256_set_pd(
        (KNOB_TILE_X_DIM - 1) * FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM - 1) * FIXED_POINT_SCALE, 0);
    const __m256d vTileOffsetsYIntFix8 = _mm256_set_pd(
        (KNOB_TILE_Y_DIM - 1) * FIXED_POINT_SCALE, (KNOB_TILE_Y_DIM - 1) * FIXED_POINT_SCALE, 0, 0);

    __m256d vTileStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vTileOffsetsXIntFix8);
    __m256d vTileStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vTileOffsetsYIntFix8);
    edge.vRasterTileOffsets = _mm256_add_pd(vTileStepXFix16, vTileStepYFix16);
}
580 
581 INLINE
582 void ComputeEdgeData(const POS& p0, const POS& p1, EDGE& edge)
583 {
584     ComputeEdgeData(p0.y - p1.y, p1.x - p0.x, edge);
585 }
586 
587 //////////////////////////////////////////////////////////////////////////
588 /// @brief Primary template definition used for partially specializing
589 /// the UpdateEdgeMasks function. Offset evaluated edges from UL pixel
590 /// corner to sample position, and test for coverage
591 /// @tparam sampleCount: multisample count
592 template <typename NumSamplesT>
593 INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3],
594                             const __m256d* vEdgeFix16,
595                             int32_t&       mask0,
596                             int32_t&       mask1,
597                             int32_t&       mask2)
598 {
599     __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2;
600     // evaluate edge equations at the tile multisample bounding box
601     vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]);
602     vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]);
603     vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]);
604     mask0            = _mm256_movemask_pd(vSampleBboxTest0);
605     mask1            = _mm256_movemask_pd(vSampleBboxTest1);
606     mask2            = _mm256_movemask_pd(vSampleBboxTest2);
607 }
608 
//////////////////////////////////////////////////////////////////////////
/// @brief UpdateEdgeMasks<SingleSampleT> specialization, instantiated
/// when only rasterizing a single coverage test point: no bbox offset is
/// needed, so the sign bits of the evaluated edges are used directly.
template <>
INLINE void UpdateEdgeMasks<SingleSampleT>(
    const __m256d (&)[3], const __m256d* vEdgeFix16, int32_t& mask0, int32_t& mask1, int32_t& mask2)
{
    mask0 = _mm256_movemask_pd(vEdgeFix16[0]);
    mask1 = _mm256_movemask_pd(vEdgeFix16[1]);
    mask2 = _mm256_movemask_pd(vEdgeFix16[2]);
}
620 
//////////////////////////////////////////////////////////////////////////
/// @struct ComputeScissorEdges
/// @brief Primary template definition. Allows the function to be generically
/// called. When paired with below specializations, will result in an empty
/// inlined function if scissor is not enabled
/// @tparam RasterScissorEdgesT: is scissor enabled?
/// @tparam IsConservativeT: is conservative rast enabled?
/// @tparam RT: rasterizer traits
template <typename RasterScissorEdgesT, typename IsConservativeT, typename RT>
struct ComputeScissorEdges
{
    /// No-op: scissor edges are only built by the enabled specializations below.
    INLINE ComputeScissorEdges(const SWR_RECT& triBBox,
                               const SWR_RECT& scissorBBox,
                               const int32_t   x,
                               const int32_t   y,
                               EDGE (&rastEdges)[RT::NumEdgesT::value],
                               __m256d (&vEdgeFix16)[7]){};
};
639 
//////////////////////////////////////////////////////////////////////////
/// @brief ComputeScissorEdges<std::true_type, std::true_type, RT> partial
/// specialization. Instantiated when conservative rast and scissor are enabled
template <typename RT>
struct ComputeScissorEdges<std::true_type, std::true_type, RT>
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Intersect tri bbox with scissor, compute scissor edge vectors,
    /// evaluate edge equations and offset them away from pixel center.
    /// @param triBBox - triangle bounding box
    /// @param scissorBBox - scissor rectangle
    /// @param x, y - position at which the edges are evaluated
    /// @param rastEdges - [out] scissor edge data written to slots 3..6
    /// @param vEdgeFix16 - [out] evaluated scissor edge values, slots 3..6
    INLINE ComputeScissorEdges(const SWR_RECT& triBBox,
                               const SWR_RECT& scissorBBox,
                               const int32_t   x,
                               const int32_t   y,
                               EDGE (&rastEdges)[RT::NumEdgesT::value],
                               __m256d (&vEdgeFix16)[7])
    {
        // if conservative rasterizing, triangle bbox intersected with scissor bbox is used
        SWR_RECT scissor;
        scissor.xmin = std::max(triBBox.xmin, scissorBBox.xmin);
        scissor.xmax = std::min(triBBox.xmax, scissorBBox.xmax);
        scissor.ymin = std::max(triBBox.ymin, scissorBBox.ymin);
        scissor.ymax = std::min(triBBox.ymax, scissorBBox.ymax);

        POS topLeft{scissor.xmin, scissor.ymin};
        POS bottomLeft{scissor.xmin, scissor.ymax};
        POS topRight{scissor.xmax, scissor.ymin};
        POS bottomRight{scissor.xmax, scissor.ymax};

        // construct 4 scissor edges in ccw direction
        // (3 = left, 4 = bottom, 5 = right, 6 = top)
        ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
        ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
        ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
        ComputeEdgeData(topRight, topLeft, rastEdges[6]);

        // evaluate each edge at (x, y) relative to a corner lying on that edge
        vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) +
                                       (rastEdges[3].b * (y - scissor.ymin)));
        vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) +
                                       (rastEdges[4].b * (y - scissor.ymax)));
        vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) +
                                       (rastEdges[5].b * (y - scissor.ymax)));
        vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) +
                                       (rastEdges[6].b * (y - scissor.ymin)));

        // if conservative rasterizing, need to bump the scissor edges out by the conservative
        // uncertainty distance, else do nothing
        adjustScissorEdge<RT>(rastEdges[3].a, rastEdges[3].b, vEdgeFix16[3]);
        adjustScissorEdge<RT>(rastEdges[4].a, rastEdges[4].b, vEdgeFix16[4]);
        adjustScissorEdge<RT>(rastEdges[5].a, rastEdges[5].b, vEdgeFix16[5]);
        adjustScissorEdge<RT>(rastEdges[6].a, rastEdges[6].b, vEdgeFix16[6]);

        // Upper left rule for scissor: exclude samples exactly on the left (3)
        // and top (6) edges by nudging those edge values down by one
        vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
        vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
    }
};
695 
//////////////////////////////////////////////////////////////////////////
/// @brief ComputeScissorEdges<std::true_type, std::false_type, RT> partial
/// specialization. Instantiated when scissor is enabled and conservative rast
/// is disabled.
template <typename RT>
struct ComputeScissorEdges<std::true_type, std::false_type, RT>
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Compute scissor edge vectors and evaluate edge equations.
    /// Same as the conservative variant, but uses the scissor rect directly
    /// (no tri-bbox intersection) and applies no conservative offset.
    INLINE ComputeScissorEdges(const SWR_RECT&,
                               const SWR_RECT& scissorBBox,
                               const int32_t   x,
                               const int32_t   y,
                               EDGE (&rastEdges)[RT::NumEdgesT::value],
                               __m256d (&vEdgeFix16)[7])
    {
        const SWR_RECT& scissor = scissorBBox;
        POS             topLeft{scissor.xmin, scissor.ymin};
        POS             bottomLeft{scissor.xmin, scissor.ymax};
        POS             topRight{scissor.xmax, scissor.ymin};
        POS             bottomRight{scissor.xmax, scissor.ymax};

        // construct 4 scissor edges in ccw direction
        // (3 = left, 4 = bottom, 5 = right, 6 = top)
        ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
        ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]);
        ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
        ComputeEdgeData(topRight, topLeft, rastEdges[6]);

        // evaluate each edge at (x, y) relative to a corner lying on that edge
        vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) +
                                       (rastEdges[3].b * (y - scissor.ymin)));
        vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) +
                                       (rastEdges[4].b * (y - scissor.ymax)));
        vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) +
                                       (rastEdges[5].b * (y - scissor.ymax)));
        vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) +
                                       (rastEdges[6].b * (y - scissor.ymin)));

        // Upper left rule for scissor: exclude samples exactly on the left (3)
        // and top (6) edges by nudging those edge values down by one
        vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
        vEdgeFix16[6] = _mm256_sub_pd(vEdgeFix16[6], _mm256_set1_pd(1.0));
    }
};
738 
739 //////////////////////////////////////////////////////////////////////////
740 /// @brief Primary function template for TrivialRejectTest. Should
741 /// never be called, but TemplateUnroller instantiates a few unused values,
742 /// so it calls a runtime assert instead of a static_assert.
743 template <typename ValidEdgeMaskT>
744 INLINE bool TrivialRejectTest(const int, const int, const int)
745 {
746     SWR_INVALID("Primary templated function should never be called");
747     return false;
748 };
749 
750 //////////////////////////////////////////////////////////////////////////
751 /// @brief E0E1ValidT specialization of TrivialRejectTest. Tests edge 0
752 /// and edge 1 for trivial coverage reject
753 template <>
754 INLINE bool TrivialRejectTest<E0E1ValidT>(const int mask0, const int mask1, const int)
755 {
756     return (!(mask0 && mask1)) ? true : false;
757 };
758 
759 //////////////////////////////////////////////////////////////////////////
760 /// @brief E0E2ValidT specialization of TrivialRejectTest. Tests edge 0
761 /// and edge 2 for trivial coverage reject
762 template <>
763 INLINE bool TrivialRejectTest<E0E2ValidT>(const int mask0, const int, const int mask2)
764 {
765     return (!(mask0 && mask2)) ? true : false;
766 };
767 
768 //////////////////////////////////////////////////////////////////////////
769 /// @brief E1E2ValidT specialization of TrivialRejectTest. Tests edge 1
770 /// and edge 2 for trivial coverage reject
771 template <>
772 INLINE bool TrivialRejectTest<E1E2ValidT>(const int, const int mask1, const int mask2)
773 {
774     return (!(mask1 && mask2)) ? true : false;
775 };
776 
777 //////////////////////////////////////////////////////////////////////////
778 /// @brief AllEdgesValidT specialization of TrivialRejectTest. Tests all
779 /// primitive edges for trivial coverage reject
780 template <>
781 INLINE bool TrivialRejectTest<AllEdgesValidT>(const int mask0, const int mask1, const int mask2)
782 {
783     return (!(mask0 && mask1 && mask2)) ? true : false;
784     ;
785 };
786 
787 //////////////////////////////////////////////////////////////////////////
788 /// @brief NoEdgesValidT specialization of TrivialRejectTest. Degenerate
789 /// point, so return false and rasterize against conservative BBox
790 template <>
791 INLINE bool TrivialRejectTest<NoEdgesValidT>(const int, const int, const int)
792 {
793     return false;
794 };
795 
796 //////////////////////////////////////////////////////////////////////////
797 /// @brief Primary function template for TrivialAcceptTest. Always returns
798 /// false, since it will only be called for degenerate tris, and as such
799 /// will never cover the entire raster tile
800 template <typename ScissorEnableT>
801 INLINE bool TrivialAcceptTest(const int, const int, const int)
802 {
803     return false;
804 };
805 
806 //////////////////////////////////////////////////////////////////////////
807 /// @brief AllEdgesValidT specialization for TrivialAcceptTest. Test all
808 /// edge masks for a fully covered raster tile
809 template <>
810 INLINE bool TrivialAcceptTest<std::false_type>(const int mask0, const int mask1, const int mask2)
811 {
812     return ((mask0 & mask1 & mask2) == 0xf);
813 };
814 
815 //////////////////////////////////////////////////////////////////////////
816 /// @brief Primary function template for GenerateSVInnerCoverage. Results
817 /// in an empty function call if SVInnerCoverage isn't requested
818 template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
819 struct GenerateSVInnerCoverage
820 {
821     INLINE GenerateSVInnerCoverage(DRAW_CONTEXT*, uint32_t, EDGE*, double*, uint64_t&){};
822 };
823 
824 //////////////////////////////////////////////////////////////////////////
825 /// @brief Specialization of GenerateSVInnerCoverage where all edges
826 /// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated
827 /// edge values from OuterConservative to InnerConservative and rasterizes.
828 template <typename RT>
829 struct GenerateSVInnerCoverage<RT, AllEdgesValidT, InnerConservativeCoverageT>
830 {
831     INLINE GenerateSVInnerCoverage(DRAW_CONTEXT* pDC,
832                                    uint32_t      workerId,
833                                    EDGE*         pRastEdges,
834                                    double*       pStartQuadEdges,
835                                    uint64_t&     innerCoverageMask)
836     {
837         double startQuadEdgesAdj[RT::NumEdgesT::value];
838         for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
839         {
840             startQuadEdgesAdj[e] = adjustScalarEdge<RT, typename RT::InnerConservativeEdgeOffsetT>(
841                 pRastEdges[e].a, pRastEdges[e].b, pStartQuadEdges[e]);
842         }
843 
844         // not trivial accept or reject, must rasterize full tile
845         RDTSC_BEGIN(pDC->pContext->pBucketMgr, BERasterizePartial, pDC->drawId);
846         innerCoverageMask = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(
847             pDC, startQuadEdgesAdj, pRastEdges);
848         RDTSC_END(pDC->pContext->pBucketMgr, BERasterizePartial, 0);
849     }
850 };
851 
852 //////////////////////////////////////////////////////////////////////////
853 /// @brief Primary function template for UpdateEdgeMasksInnerConservative. Results
854 /// in an empty function call if SVInnerCoverage isn't requested
855 template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
856 struct UpdateEdgeMasksInnerConservative
857 {
858     INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3],
859                                             const __m256d*,
860                                             const __m128i,
861                                             const __m128i,
862                                             int32_t&,
863                                             int32_t&,
864                                             int32_t&){};
865 };
866 
867 //////////////////////////////////////////////////////////////////////////
868 /// @brief Specialization of UpdateEdgeMasksInnerConservative where all edges
869 /// are non-degenerate and SVInnerCoverage is requested. Offsets the edges
870 /// evaluated at raster tile corners to inner conservative position and
871 /// updates edge masks
872 template <typename RT>
873 struct UpdateEdgeMasksInnerConservative<RT, AllEdgesValidT, InnerConservativeCoverageT>
874 {
875     INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3],
876                                             const __m256d* vEdgeFix16,
877                                             const __m128i  vAi,
878                                             const __m128i  vBi,
879                                             int32_t&       mask0,
880                                             int32_t&       mask1,
881                                             int32_t&       mask2)
882     {
883         __m256d vTempEdge[3]{vEdgeFix16[0], vEdgeFix16[1], vEdgeFix16[2]};
884 
885         // instead of keeping 2 copies of evaluated edges around, just compensate for the outer
886         // conservative evaluated edge when adjusting the edge in for inner conservative tests
887         adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(
888             vAi, vBi, vTempEdge[0]);
889         adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(
890             vAi, vBi, vTempEdge[1]);
891         adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(
892             vAi, vBi, vTempEdge[2]);
893 
894         UpdateEdgeMasks<typename RT::NumCoverageSamplesT>(
895             vEdgeTileBbox, vTempEdge, mask0, mask1, mask2);
896     }
897 };
898 
899 //////////////////////////////////////////////////////////////////////////
900 /// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage
901 /// is requested but at least one edge is degenerate. Since a degenerate triangle cannot
902 /// cover an entire raster tile, set mask0 to 0 to force it down the
903 /// rastierizePartialTile path
904 template <typename RT, typename ValidEdgeMaskT>
905 struct UpdateEdgeMasksInnerConservative<RT, ValidEdgeMaskT, InnerConservativeCoverageT>
906 {
907     INLINE UpdateEdgeMasksInnerConservative(const __m256d (&)[3],
908                                             const __m256d*,
909                                             const __m128i,
910                                             const __m128i,
911                                             int32_t& mask0,
912                                             int32_t&,
913                                             int32_t&)
914     {
915         // set one mask to zero to force the triangle down the rastierizePartialTile path
916         mask0 = 0;
917     }
918 };
919 
920 template <typename RT>
921 void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc)
922 {
923     const TRIANGLE_WORK_DESC& workDesc = *((TRIANGLE_WORK_DESC*)pDesc);
924 #if KNOB_ENABLE_TOSS_POINTS
925     if (KNOB_TOSS_BIN_TRIS)
926     {
927         return;
928     }
929 #endif
930     RDTSC_BEGIN(pDC->pContext->pBucketMgr, BERasterizeTriangle, pDC->drawId);
931     RDTSC_BEGIN(pDC->pContext->pBucketMgr, BETriangleSetup, pDC->drawId);
932 
933     const API_STATE&     state        = GetApiState(pDC);
934     const SWR_RASTSTATE& rastState    = state.rastState;
935     const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
936 
937     OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
938     triDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
939 
940     __m128 vX, vY, vZ, vRecipW;
941 
942     // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care
943     // eg: vX = [x0 x1 x2 dc]
944     vX      = _mm_load_ps(workDesc.pTriBuffer);
945     vY      = _mm_load_ps(workDesc.pTriBuffer + 4);
946     vZ      = _mm_load_ps(workDesc.pTriBuffer + 8);
947     vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
948 
949     // convert to fixed point
950     static_assert(std::is_same<typename RT::PrecisionT, FixedPointTraits<Fixed_16_8>>::value,
951                   "Rasterizer expects 16.8 fixed point precision");
952     __m128i vXi = fpToFixedPoint(vX);
953     __m128i vYi = fpToFixedPoint(vY);
954 
955     // quantize floating point position to fixed point precision
956     // to prevent attribute creep around the triangle vertices
957     vX = _mm_mul_ps(_mm_cvtepi32_ps(vXi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
958     vY = _mm_mul_ps(_mm_cvtepi32_ps(vYi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE));
959 
960     // triangle setup - A and B edge equation coefs
961     __m128 vA, vB;
962     triangleSetupAB(vX, vY, vA, vB);
963 
964     __m128i vAi, vBi;
965     triangleSetupABInt(vXi, vYi, vAi, vBi);
966 
967     // determinant
968     float det = calcDeterminantInt(vAi, vBi);
969 
970     // Verts in Pixel Coordinate Space at this point
971     // Det > 0 = CW winding order
972     // Convert CW triangles to CCW
973     if (det > 0.0)
974     {
975         vA  = _mm_mul_ps(vA, _mm_set1_ps(-1));
976         vB  = _mm_mul_ps(vB, _mm_set1_ps(-1));
977         vAi = _mm_mullo_epi32(vAi, _mm_set1_epi32(-1));
978         vBi = _mm_mullo_epi32(vBi, _mm_set1_epi32(-1));
979         det = -det;
980     }
981 
982     __m128 vC;
983     // Finish triangle setup - C edge coef
984     triangleSetupC(vX, vY, vA, vB, vC);
985 
986     if (RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
987     {
988         // If we have degenerate edge(s) to rasterize, set I and J coefs
989         // to 0 for constant interpolation of attributes
990         triDesc.I[0] = 0.0f;
991         triDesc.I[1] = 0.0f;
992         triDesc.I[2] = 0.0f;
993         triDesc.J[0] = 0.0f;
994         triDesc.J[1] = 0.0f;
995         triDesc.J[2] = 0.0f;
996 
997         // Degenerate triangles have no area
998         triDesc.recipDet = 0.0f;
999     }
1000     else
1001     {
1002         // only extract coefs for 2 of the barycentrics; the 3rd can be
1003         // determined from the barycentric equation:
1004         // i + j + k = 1 <=> k = 1 - j - i
1005         _MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1);
1006         _MM_EXTRACT_FLOAT(triDesc.I[1], vB, 1);
1007         _MM_EXTRACT_FLOAT(triDesc.I[2], vC, 1);
1008         _MM_EXTRACT_FLOAT(triDesc.J[0], vA, 2);
1009         _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2);
1010         _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2);
1011 
1012         // compute recipDet, used to calculate barycentric i and j in the backend
1013         triDesc.recipDet = 1.0f / det;
1014     }
1015 
1016     OSALIGNSIMD(float) oneOverW[4];
1017     _mm_store_ps(oneOverW, vRecipW);
1018     triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2];
1019     triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2];
1020     triDesc.OneOverW[2] = oneOverW[2];
1021 
1022     // calculate perspective correct coefs per vertex attrib
1023     float* pPerspAttribs  = perspAttribsTLS;
1024     float* pAttribs       = workDesc.pAttribs;
1025     triDesc.pPerspAttribs = pPerspAttribs;
1026     triDesc.pAttribs      = pAttribs;
1027     float* pRecipW        = workDesc.pTriBuffer + 12;
1028     triDesc.pRecipW       = pRecipW;
1029     __m128 vOneOverWV0    = _mm_broadcast_ss(pRecipW);
1030     __m128 vOneOverWV1    = _mm_broadcast_ss(pRecipW += 1);
1031     __m128 vOneOverWV2    = _mm_broadcast_ss(pRecipW += 1);
1032     for (uint32_t i = 0; i < workDesc.numAttribs; i++)
1033     {
1034         __m128 attribA = _mm_load_ps(pAttribs);
1035         __m128 attribB = _mm_load_ps(pAttribs += 4);
1036         __m128 attribC = _mm_load_ps(pAttribs += 4);
1037         pAttribs += 4;
1038 
1039         attribA = _mm_mul_ps(attribA, vOneOverWV0);
1040         attribB = _mm_mul_ps(attribB, vOneOverWV1);
1041         attribC = _mm_mul_ps(attribC, vOneOverWV2);
1042 
1043         _mm_store_ps(pPerspAttribs, attribA);
1044         _mm_store_ps(pPerspAttribs += 4, attribB);
1045         _mm_store_ps(pPerspAttribs += 4, attribC);
1046         pPerspAttribs += 4;
1047     }
1048 
1049     // compute bary Z
1050     // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0)
1051     OSALIGNSIMD(float) a[4];
1052     _mm_store_ps(a, vZ);
1053     triDesc.Z[0] = a[0] - a[2];
1054     triDesc.Z[1] = a[1] - a[2];
1055     triDesc.Z[2] = a[2];
1056 
1057     // add depth bias
1058     triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8);
1059 
1060     // Calc bounding box of triangle
1061     OSALIGNSIMD(SWR_RECT) bbox;
1062     calcBoundingBoxInt(vXi, vYi, bbox);
1063 
1064     const SWR_RECT& scissorInFixedPoint =
1065         state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
1066 
1067     if (RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
1068     {
1069         // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is
1070         // valid
1071         bbox.xmin--;
1072         bbox.xmax++;
1073         bbox.ymin--;
1074         bbox.ymax++;
1075         SWR_ASSERT(scissorInFixedPoint.xmin >= 0 && scissorInFixedPoint.ymin >= 0,
1076                    "Conservative rast degenerate handling requires a valid scissor rect");
1077     }
1078 
1079     // Intersect with scissor/viewport
1080     OSALIGNSIMD(SWR_RECT) intersect;
1081     intersect.xmin = std::max(bbox.xmin, scissorInFixedPoint.xmin);
1082     intersect.xmax = std::min(bbox.xmax - 1, scissorInFixedPoint.xmax);
1083     intersect.ymin = std::max(bbox.ymin, scissorInFixedPoint.ymin);
1084     intersect.ymax = std::min(bbox.ymax - 1, scissorInFixedPoint.ymax);
1085 
1086     triDesc.triFlags = workDesc.triFlags;
1087 
1088     // further constrain backend to intersecting bounding box of macro tile and scissored triangle
1089     // bbox
1090     uint32_t macroX, macroY;
1091     MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
1092     int32_t macroBoxLeft   = macroX * KNOB_MACROTILE_X_DIM_FIXED;
1093     int32_t macroBoxRight  = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
1094     int32_t macroBoxTop    = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
1095     int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
1096 
1097     intersect.xmin = std::max(intersect.xmin, macroBoxLeft);
1098     intersect.ymin = std::max(intersect.ymin, macroBoxTop);
1099     intersect.xmax = std::min(intersect.xmax, macroBoxRight);
1100     intersect.ymax = std::min(intersect.ymax, macroBoxBottom);
1101 
1102     SWR_ASSERT(intersect.xmin <= intersect.xmax && intersect.ymin <= intersect.ymax &&
1103                intersect.xmin >= 0 && intersect.xmax >= 0 && intersect.ymin >= 0 &&
1104                intersect.ymax >= 0);
1105 
1106     RDTSC_END(pDC->pContext->pBucketMgr, BETriangleSetup, 0);
1107 
1108     // update triangle desc
1109     uint32_t minTileX  = intersect.xmin >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
1110     uint32_t minTileY  = intersect.ymin >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
1111     uint32_t maxTileX  = intersect.xmax >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
1112     uint32_t maxTileY  = intersect.ymax >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
1113     uint32_t numTilesX = maxTileX - minTileX + 1;
1114     uint32_t numTilesY = maxTileY - minTileY + 1;
1115 
1116     if (numTilesX == 0 || numTilesY == 0)
1117     {
1118         RDTSC_EVENT(pDC->pContext->pBucketMgr, BEEmptyTriangle, 1, 0);
1119         RDTSC_END(pDC->pContext->pBucketMgr, BERasterizeTriangle, 1);
1120         return;
1121     }
1122 
1123     RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEStepSetup, pDC->drawId);
1124 
1125     // Step to pixel center of top-left pixel of the triangle bbox
1126     // Align intersect bbox (top/left) to raster tile's (top/left).
1127     int32_t x = AlignDown(intersect.xmin, (FIXED_POINT_SCALE * KNOB_TILE_X_DIM));
1128     int32_t y = AlignDown(intersect.ymin, (FIXED_POINT_SCALE * KNOB_TILE_Y_DIM));
1129 
1130     // convenience typedef
1131     typedef typename RT::NumCoverageSamplesT NumCoverageSamplesT;
1132 
1133     // single sample rasterization evaluates edges at pixel center,
1134     // multisample evaluates edges UL pixel corner and steps to each sample position
1135     if (std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
1136     {
1137         // Add 0.5, in fixed point, to offset to pixel center
1138         x += (FIXED_POINT_SCALE / 2);
1139         y += (FIXED_POINT_SCALE / 2);
1140     }
1141 
1142     __m128i vTopLeftX = _mm_set1_epi32(x);
1143     __m128i vTopLeftY = _mm_set1_epi32(y);
1144 
1145     // evaluate edge equations at top-left pixel using 64bit math
1146     //
1147     // line = Ax + By + C
1148     // solving for C:
1149     // C = -Ax - By
1150     // we know x0 and y0 are on the line; plug them in:
1151     // C = -Ax0 - By0
1152     // plug C back into line equation:
1153     // line = Ax - By - Ax0 - By0
1154     // line = A(x - x0) + B(y - y0)
1155     // dX = (x-x0), dY = (y-y0)
1156     // so all this simplifies to
1157     // edge = A(dX) + B(dY), our first test at the top left of the bbox we're rasterizing within
1158 
1159     __m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi);
1160     __m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi);
1161 
1162     // evaluate A(dx) and B(dY) for all points
1163     __m256d vAipd     = _mm256_cvtepi32_pd(vAi);
1164     __m256d vBipd     = _mm256_cvtepi32_pd(vBi);
1165     __m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX);
1166     __m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY);
1167 
1168     __m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd);
1169     __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd);
1170     __m256d vEdge          = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16);
1171 
1172     // apply any edge adjustments(top-left, crast, etc)
1173     adjustEdgesFix16<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdge);
1174 
1175     // broadcast respective edge results to all lanes
1176     double* pEdge = (double*)&vEdge;
1177     __m256d vEdgeFix16[7];
1178     vEdgeFix16[0] = _mm256_set1_pd(pEdge[0]);
1179     vEdgeFix16[1] = _mm256_set1_pd(pEdge[1]);
1180     vEdgeFix16[2] = _mm256_set1_pd(pEdge[2]);
1181 
1182     OSALIGNSIMD(int32_t) aAi[4], aBi[4];
1183     _mm_store_si128((__m128i*)aAi, vAi);
1184     _mm_store_si128((__m128i*)aBi, vBi);
1185     EDGE rastEdges[RT::NumEdgesT::value];
1186 
1187     // Compute and store triangle edge data
1188     ComputeEdgeData(aAi[0], aBi[0], rastEdges[0]);
1189     ComputeEdgeData(aAi[1], aBi[1], rastEdges[1]);
1190     ComputeEdgeData(aAi[2], aBi[2], rastEdges[2]);
1191 
1192     // Compute and store triangle edge data if scissor needs to rasterized
1193     ComputeScissorEdges<typename RT::RasterizeScissorEdgesT, typename RT::IsConservativeT, RT>(
1194         bbox, scissorInFixedPoint, x, y, rastEdges, vEdgeFix16);
1195 
1196     // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
1197     // used to for testing if entire raster tile is inside a triangle
1198     for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1199     {
1200         vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets);
1201     }
1202 
1203     // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox
1204     // step sample positions to the raster tile bbox of multisample points
1205     // min(xSamples),min(ySamples)  ------  max(xSamples),min(ySamples)
1206     //                             |      |
1207     //                             |      |
1208     // min(xSamples),max(ySamples)  ------  max(xSamples),max(ySamples)
1209     __m256d vEdgeTileBbox[3];
1210     if (NumCoverageSamplesT::value > 1)
1211     {
1212         const SWR_MULTISAMPLE_POS& samplePos         = rastState.samplePositions;
1213         const __m128i              vTileSampleBBoxXh = samplePos.TileSampleOffsetsX();
1214         const __m128i              vTileSampleBBoxYh = samplePos.TileSampleOffsetsY();
1215 
1216         __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh);
1217         __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh);
1218 
1219         // step edge equation tests from Tile
1220         // used to for testing if entire raster tile is inside a triangle
1221         for (uint32_t e = 0; e < 3; ++e)
1222         {
1223             __m256d vResultAxFix16 =
1224                 _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8);
1225             __m256d vResultByFix16 =
1226                 _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8);
1227             vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
1228 
1229             // adjust for msaa tile bbox edges outward for conservative rast, if enabled
1230             adjustEdgeConservative<RT, typename RT::ConservativeEdgeOffsetT>(
1231                 vAi, vBi, vEdgeTileBbox[e]);
1232         }
1233     }
1234 
1235     RDTSC_END(pDC->pContext->pBucketMgr, BEStepSetup, 0);
1236 
1237     uint32_t tY   = minTileY;
1238     uint32_t tX   = minTileX;
1239     uint32_t maxY = maxTileY;
1240     uint32_t maxX = maxTileX;
1241 
1242     RenderOutputBuffers renderBuffers, currentRenderBufferRow;
1243     GetRenderHotTiles<RT::MT::numSamples>(pDC,
1244                                           workerId,
1245                                           macroTile,
1246                                           minTileX,
1247                                           minTileY,
1248                                           renderBuffers,
1249                                           triDesc.triFlags.renderTargetArrayIndex);
1250     currentRenderBufferRow = renderBuffers;
1251 
1252     // rasterize and generate coverage masks per sample
1253     for (uint32_t tileY = tY; tileY <= maxY; ++tileY)
1254     {
1255         __m256d vStartOfRowEdge[RT::NumEdgesT::value];
1256         for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1257         {
1258             vStartOfRowEdge[e] = vEdgeFix16[e];
1259         }
1260 
1261         for (uint32_t tileX = tX; tileX <= maxX; ++tileX)
1262         {
1263             triDesc.anyCoveredSamples = 0;
1264 
1265             // is the corner of the edge outside of the raster tile? (vEdge < 0)
1266             int mask0, mask1, mask2;
1267             UpdateEdgeMasks<NumCoverageSamplesT>(vEdgeTileBbox, vEdgeFix16, mask0, mask1, mask2);
1268 
1269             for (uint32_t sampleNum = 0; sampleNum < NumCoverageSamplesT::value; sampleNum++)
1270             {
1271                 // trivial reject, at least one edge has all 4 corners of raster tile outside
1272                 bool trivialReject =
1273                     TrivialRejectTest<typename RT::ValidEdgeMaskT>(mask0, mask1, mask2);
1274 
1275                 if (!trivialReject)
1276                 {
1277                     // trivial accept mask
1278                     triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL;
1279 
1280                     // Update the raster tile edge masks based on inner conservative edge offsets,
1281                     // if enabled
1282                     UpdateEdgeMasksInnerConservative<RT,
1283                                                      typename RT::ValidEdgeMaskT,
1284                                                      typename RT::InputCoverageT>(
1285                         vEdgeTileBbox, vEdgeFix16, vAi, vBi, mask0, mask1, mask2);
1286 
1287                     // @todo Make this a bit smarter to allow use of trivial accept when:
1288                     //   1) scissor/vp intersection rect is raster tile aligned
1289                     //   2) raster tile is entirely within scissor/vp intersection rect
1290                     if (TrivialAcceptTest<typename RT::RasterizeScissorEdgesT>(mask0, mask1, mask2))
1291                     {
1292                         // trivial accept, all 4 corners of all 3 edges are negative
1293                         // i.e. raster tile completely inside triangle
1294                         triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum];
1295                         if (std::is_same<typename RT::InputCoverageT,
1296                                          InnerConservativeCoverageT>::value)
1297                         {
1298                             triDesc.innerCoverageMask = 0xffffffffffffffffULL;
1299                         }
1300                         RDTSC_EVENT(pDC->pContext->pBucketMgr, BETrivialAccept, 1, 0);
1301                     }
1302                     else
1303                     {
1304                         __m256d vEdgeAtSample[RT::NumEdgesT::value];
1305                         if (std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
1306                         {
1307                             // should get optimized out for single sample case (global value
1308                             // numbering or copy propagation)
1309                             for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1310                             {
1311                                 vEdgeAtSample[e] = vEdgeFix16[e];
1312                             }
1313                         }
1314                         else
1315                         {
1316                             const SWR_MULTISAMPLE_POS& samplePos       = rastState.samplePositions;
1317                             __m128i                    vSampleOffsetXh = samplePos.vXi(sampleNum);
1318                             __m128i                    vSampleOffsetYh = samplePos.vYi(sampleNum);
1319                             __m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh);
1320                             __m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh);
1321 
1322                             // step edge equation tests from UL tile corner to pixel sample position
1323                             for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1324                             {
1325                                 __m256d vResultAxFix16 =
1326                                     _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX);
1327                                 __m256d vResultByFix16 =
1328                                     _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY);
1329                                 vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
1330                                 vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]);
1331                             }
1332                         }
1333 
1334                         double        startQuadEdges[RT::NumEdgesT::value];
1335                         const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
1336                         for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1337                         {
1338                             _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]);
1339                         }
1340 
1341                         // not trivial accept or reject, must rasterize full tile
1342                         RDTSC_BEGIN(pDC->pContext->pBucketMgr, BERasterizePartial, pDC->drawId);
1343                         triDesc.coverageMask[sampleNum] =
1344                             rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(
1345                                 pDC, startQuadEdges, rastEdges);
1346                         RDTSC_END(pDC->pContext->pBucketMgr, BERasterizePartial, 0);
1347 
1348                         triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum];
1349 
1350                         // Output SV InnerCoverage, if needed
1351                         GenerateSVInnerCoverage<RT,
1352                                                 typename RT::ValidEdgeMaskT,
1353                                                 typename RT::InputCoverageT>(
1354                             pDC, workerId, rastEdges, startQuadEdges, triDesc.innerCoverageMask);
1355                     }
1356                 }
1357                 else
1358                 {
1359                     // if we're calculating coverage per sample, need to store it off. otherwise no
1360                     // covered samples, don't need to do anything
1361                     if (NumCoverageSamplesT::value > 1)
1362                     {
1363                         triDesc.coverageMask[sampleNum] = 0;
1364                     }
1365                     RDTSC_EVENT(pDC->pContext->pBucketMgr, BETrivialReject, 1, 0);
1366                 }
1367             }
1368 
1369 #if KNOB_ENABLE_TOSS_POINTS
1370             if (KNOB_TOSS_RS)
1371             {
1372                 gToss = triDesc.coverageMask[0];
1373             }
1374             else
1375 #endif
1376                 if (triDesc.anyCoveredSamples)
1377             {
1378                 // if conservative rast and MSAA are enabled, conservative coverage for a pixel
1379                 // means all samples in that pixel are covered copy conservative coverage result to
1380                 // all samples
1381                 if (RT::IsConservativeT::value)
1382                 {
1383                     auto copyCoverage = [&](int sample) {
1384                         triDesc.coverageMask[sample] = triDesc.coverageMask[0];
1385                     };
1386                     UnrollerL<1, RT::MT::numSamples, 1>::step(copyCoverage);
1387                 }
1388 
1389                 // Track rasterized subspans
1390                 AR_EVENT(RasterTileCount(pDC->drawId, 1));
1391 
1392                 RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelBackend, pDC->drawId);
1393                 backendFuncs.pfnBackend(pDC,
1394                                         workerId,
1395                                         tileX << KNOB_TILE_X_DIM_SHIFT,
1396                                         tileY << KNOB_TILE_Y_DIM_SHIFT,
1397                                         triDesc,
1398                                         renderBuffers);
1399                 RDTSC_END(pDC->pContext->pBucketMgr, BEPixelBackend, 0);
1400             }
1401 
1402             // step to the next tile in X
1403             for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1404             {
1405                 vEdgeFix16[e] =
1406                     _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX));
1407             }
1408             StepRasterTileX<RT>(state.colorHottileEnable, renderBuffers);
1409         }
1410 
1411         // step to the next tile in Y
1412         for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
1413         {
1414             vEdgeFix16[e] =
1415                 _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY));
1416         }
1417         StepRasterTileY<RT>(state.colorHottileEnable, renderBuffers, currentRenderBufferRow);
1418     }
1419 
1420     RDTSC_END(pDC->pContext->pBucketMgr, BERasterizeTriangle, 1);
1421 }
1422 
1423 // Get pointers to hot tile memory for color RT, depth, stencil
1424 template <uint32_t numSamples>
1425 void GetRenderHotTiles(DRAW_CONTEXT*        pDC,
1426                        uint32_t             workerId,
1427                        uint32_t             macroID,
1428                        uint32_t             tileX,
1429                        uint32_t             tileY,
1430                        RenderOutputBuffers& renderBuffers,
1431                        uint32_t             renderTargetArrayIndex)
1432 {
1433     const API_STATE& state    = GetApiState(pDC);
1434     SWR_CONTEXT*     pContext = pDC->pContext;
1435     HANDLE hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
1436 
1437     uint32_t mx, my;
1438     MacroTileMgr::getTileIndices(macroID, mx, my);
1439     tileX -= KNOB_MACROTILE_X_DIM_IN_TILES * mx;
1440     tileY -= KNOB_MACROTILE_Y_DIM_IN_TILES * my;
1441 
1442     // compute tile offset for active hottile buffers
1443     const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;
1444     uint32_t       offset = ComputeTileOffset2D<
1445         TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp>>(
1446         pitch, tileX, tileY);
1447     offset *= numSamples;
1448 
1449     unsigned long rtSlot                 = 0;
1450     uint32_t      colorHottileEnableMask = state.colorHottileEnable;
1451     while (_BitScanForward(&rtSlot, colorHottileEnableMask))
1452     {
1453         HOTTILE* pColor = pContext->pHotTileMgr->GetHotTile(
1454             pContext,
1455             pDC,
1456             hWorkerPrivateData,
1457             macroID,
1458             (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot),
1459             true,
1460             numSamples,
1461             renderTargetArrayIndex);
1462         renderBuffers.pColor[rtSlot] = pColor->pBuffer + offset;
1463         renderBuffers.pColorHotTile[rtSlot] = pColor;
1464 
1465         colorHottileEnableMask &= ~(1 << rtSlot);
1466     }
1467     if (state.depthHottileEnable)
1468     {
1469         const uint32_t pitch =
1470             KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
1471         uint32_t offset = ComputeTileOffset2D<
1472             TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp>>(
1473             pitch, tileX, tileY);
1474         offset *= numSamples;
1475         HOTTILE* pDepth = pContext->pHotTileMgr->GetHotTile(pContext,
1476                                                             pDC,
1477                                                             hWorkerPrivateData,
1478                                                             macroID,
1479                                                             SWR_ATTACHMENT_DEPTH,
1480                                                             true,
1481                                                             numSamples,
1482                                                             renderTargetArrayIndex);
1483         pDepth->state   = HOTTILE_DIRTY;
1484         SWR_ASSERT(pDepth->pBuffer != nullptr);
1485         renderBuffers.pDepth = pDepth->pBuffer + offset;
1486         renderBuffers.pDepthHotTile = pDepth;
1487     }
1488     if (state.stencilHottileEnable)
1489     {
1490         const uint32_t pitch =
1491             KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
1492         uint32_t offset = ComputeTileOffset2D<
1493             TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp>>(
1494             pitch, tileX, tileY);
1495         offset *= numSamples;
1496         HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext,
1497                                                               pDC,
1498                                                               hWorkerPrivateData,
1499                                                               macroID,
1500                                                               SWR_ATTACHMENT_STENCIL,
1501                                                               true,
1502                                                               numSamples,
1503                                                               renderTargetArrayIndex);
1504         pStencil->state   = HOTTILE_DIRTY;
1505         SWR_ASSERT(pStencil->pBuffer != nullptr);
1506         renderBuffers.pStencil = pStencil->pBuffer + offset;
1507         renderBuffers.pStencilHotTile = pStencil;
1508     }
1509 }
1510 
1511 template <typename RT>
1512 INLINE void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers& buffers)
1513 {
1514     unsigned long rt = 0;
1515     while (_BitScanForward(&rt, colorHotTileMask))
1516     {
1517         colorHotTileMask &= ~(1 << rt);
1518         buffers.pColor[rt] += RT::colorRasterTileStep;
1519     }
1520 
1521     buffers.pDepth += RT::depthRasterTileStep;
1522     buffers.pStencil += RT::stencilRasterTileStep;
1523 }
1524 
1525 template <typename RT>
1526 INLINE void StepRasterTileY(uint32_t             colorHotTileMask,
1527                             RenderOutputBuffers& buffers,
1528                             RenderOutputBuffers& startBufferRow)
1529 {
1530     unsigned long rt = 0;
1531     while (_BitScanForward(&rt, colorHotTileMask))
1532     {
1533         colorHotTileMask &= ~(1 << rt);
1534         startBufferRow.pColor[rt] += RT::colorRasterTileRowStep;
1535         buffers.pColor[rt] = startBufferRow.pColor[rt];
1536     }
1537     startBufferRow.pDepth += RT::depthRasterTileRowStep;
1538     buffers.pDepth = startBufferRow.pDepth;
1539 
1540     startBufferRow.pStencil += RT::stencilRasterTileRowStep;
1541     buffers.pStencil = startBufferRow.pStencil;
1542 }
1543