1 /****************************************************************************
2  * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * @file binner.cpp
24  *
25  * @brief Implementation for the macrotile binner
26  *
27  ******************************************************************************/
28 
29 #include "binner.h"
30 #include "context.h"
31 #include "frontend.h"
32 #include "conservativeRast.h"
33 #include "pa.h"
34 #include "rasterizer.h"
35 #include "rdtsc_core.h"
36 #include "tilemgr.h"
37 
38 // Function Prototype
39 template <typename SIMD_T, uint32_t SIMD_WIDTH>
40 void BinPostSetupLinesImpl(DRAW_CONTEXT*          pDC,
41                            PA_STATE&              pa,
42                            uint32_t               workerId,
43                            Vec4<SIMD_T>           prim[],
44                            Float<SIMD_T>          recipW[],
45                            uint32_t               primMask,
46                            Integer<SIMD_T> const& primID,
47                            Integer<SIMD_T> const& viewportIdx,
48                            Integer<SIMD_T> const& rtIdx);
49 
50 template <typename SIMD_T, uint32_t SIMD_WIDTH>
51 void BinPostSetupPointsImpl(DRAW_CONTEXT*          pDC,
52                             PA_STATE&              pa,
53                             uint32_t               workerId,
54                             Vec4<SIMD_T>           prim[],
55                             uint32_t               primMask,
56                             Integer<SIMD_T> const& primID,
57                             Integer<SIMD_T> const& viewportIdx,
58                             Integer<SIMD_T> const& rtIdx);
59 
60 //////////////////////////////////////////////////////////////////////////
61 /// @brief Processes attributes for the backend based on linkage mask and
62 ///        linkage map.  Essentially just doing an SOA->AOS conversion and pack.
63 /// @param pDC - Draw context
64 /// @param pa - Primitive Assembly state
65 /// @param linkageMask - Specifies which VS outputs are routed to PS.
66 /// @param pLinkageMap - maps VS attribute slot to PS slot
67 /// @param triIndex - Triangle to process attributes for
68 /// @param pBuffer - Output result
69 template <typename NumVertsT,
70           typename IsSwizzledT,
71           typename HasConstantInterpT,
72           typename IsDegenerate>
ProcessAttributes(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t triIndex,uint32_t primId,float * pBuffer)73 INLINE void ProcessAttributes(
74     DRAW_CONTEXT* pDC, PA_STATE& pa, uint32_t triIndex, uint32_t primId, float* pBuffer)
75 {
76     static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
77     const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
78     // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
79     uint32_t constantInterpMask =
80         IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
81     const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
82     const PRIMITIVE_TOPOLOGY topo  = pa.binTopology;
83 
84     static const float constTable[3][4] = {
85         {0.0f, 0.0f, 0.0f, 0.0f}, {0.0f, 0.0f, 0.0f, 1.0f}, {1.0f, 1.0f, 1.0f, 1.0f}};
86 
87     for (uint32_t i = 0; i < backendState.numAttributes; ++i)
88     {
89         uint32_t inputSlot;
90         if (IsSwizzledT::value)
91         {
92             SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
93             inputSlot = backendState.vertexAttribOffset + attribSwizzle.sourceAttrib;
94         }
95         else
96         {
97             inputSlot = backendState.vertexAttribOffset + i;
98         }
99 
100         simd4scalar attrib[3]; // triangle attribs (always 4 wide)
101         float*      pAttribStart = pBuffer;
102 
103         if (HasConstantInterpT::value || IsDegenerate::value)
104         {
105             if (CheckBit(constantInterpMask, i))
106             {
107                 uint32_t              vid;
108                 uint32_t              adjustedTriIndex;
109                 static const uint32_t tristripProvokingVertex[]   = {0, 2, 1};
110                 static const int32_t  quadProvokingTri[2][4]      = {{0, 0, 0, 1}, {0, -1, 0, 0}};
111                 static const uint32_t quadProvokingVertex[2][4]   = {{0, 1, 2, 2}, {0, 1, 1, 2}};
112                 static const int32_t  qstripProvokingTri[2][4]    = {{0, 0, 0, 1}, {-1, 0, 0, 0}};
113                 static const uint32_t qstripProvokingVertex[2][4] = {{0, 1, 2, 1}, {0, 0, 2, 1}};
114 
115                 switch (topo)
116                 {
117                 case TOP_QUAD_LIST:
118                     adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
119                     vid              = quadProvokingVertex[triIndex & 1][provokingVertex];
120                     break;
121                 case TOP_QUAD_STRIP:
122                     adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
123                     vid              = qstripProvokingVertex[triIndex & 1][provokingVertex];
124                     break;
125                 case TOP_TRIANGLE_STRIP:
126                     adjustedTriIndex = triIndex;
127                     vid =
128                         (triIndex & 1) ? tristripProvokingVertex[provokingVertex] : provokingVertex;
129                     break;
130                 default:
131                     adjustedTriIndex = triIndex;
132                     vid              = provokingVertex;
133                     break;
134                 }
135 
136                 pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);
137 
138                 for (uint32_t i = 0; i < NumVertsT::value; ++i)
139                 {
140                     SIMD128::store_ps(pBuffer, attrib[vid]);
141                     pBuffer += 4;
142                 }
143             }
144             else
145             {
146                 pa.AssembleSingle(inputSlot, triIndex, attrib);
147 
148                 for (uint32_t i = 0; i < NumVertsT::value; ++i)
149                 {
150                     SIMD128::store_ps(pBuffer, attrib[i]);
151                     pBuffer += 4;
152                 }
153             }
154         }
155         else
156         {
157             pa.AssembleSingle(inputSlot, triIndex, attrib);
158 
159             for (uint32_t i = 0; i < NumVertsT::value; ++i)
160             {
161                 SIMD128::store_ps(pBuffer, attrib[i]);
162                 pBuffer += 4;
163             }
164         }
165 
166         // pad out the attrib buffer to 3 verts to ensure the triangle
167         // interpolation code in the pixel shader works correctly for the
168         // 3 topologies - point, line, tri.  This effectively zeros out the
169         // effect of the missing vertices in the triangle interpolation.
170         for (uint32_t v = NumVertsT::value; v < 3; ++v)
171         {
172             SIMD128::store_ps(pBuffer, attrib[NumVertsT::value - 1]);
173             pBuffer += 4;
174         }
175 
176         // check for constant source overrides
177         if (IsSwizzledT::value)
178         {
179             uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
180             if (mask)
181             {
182                 unsigned long comp;
183                 while (_BitScanForward(&comp, mask))
184                 {
185                     mask &= ~(1 << comp);
186 
187                     float constantValue = 0.0f;
188                     switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
189                     {
190                     case SWR_CONSTANT_SOURCE_CONST_0000:
191                     case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
192                     case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
193                         constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
194                         break;
195                     case SWR_CONSTANT_SOURCE_PRIM_ID:
196                         constantValue = *(float*)&primId;
197                         break;
198                     }
199 
200                     // apply constant value to all 3 vertices
201                     for (uint32_t v = 0; v < 3; ++v)
202                     {
203                         pAttribStart[comp + v * 4] = constantValue;
204                     }
205                 }
206             }
207         }
208     }
209 }
210 
211 typedef void (*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*);
212 
213 struct ProcessAttributesChooser
214 {
215     typedef PFN_PROCESS_ATTRIBUTES FuncType;
216 
217     template <typename... ArgsB>
GetFuncProcessAttributesChooser218     static FuncType GetFunc()
219     {
220         return ProcessAttributes<ArgsB...>;
221     }
222 };
223 
GetProcessAttributesFunc(uint32_t NumVerts,bool IsSwizzled,bool HasConstantInterp,bool IsDegenerate=false)224 PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts,
225                                                 bool     IsSwizzled,
226                                                 bool     HasConstantInterp,
227                                                 bool     IsDegenerate = false)
228 {
229     return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(
230         IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate);
231 }
232 
233 //////////////////////////////////////////////////////////////////////////
234 /// @brief Processes enabled user clip distances. Loads the active clip
235 ///        distances from the PA, sets up barycentric equations, and
236 ///        stores the results to the output buffer
237 /// @param pa - Primitive Assembly state
238 /// @param primIndex - primitive index to process
239 /// @param clipDistMask - mask of enabled clip distances
240 /// @param pUserClipBuffer - buffer to store results
241 template <uint32_t NumVerts>
ProcessUserClipDist(const SWR_BACKEND_STATE & state,PA_STATE & pa,uint32_t primIndex,float * pRecipW,float * pUserClipBuffer)242 void ProcessUserClipDist(const SWR_BACKEND_STATE& state,
243                          PA_STATE&                pa,
244                          uint32_t                 primIndex,
245                          float*                   pRecipW,
246                          float*                   pUserClipBuffer)
247 {
248     unsigned long clipDist;
249     uint32_t clipDistMask = state.clipDistanceMask;
250     while (_BitScanForward(&clipDist, clipDistMask))
251     {
252         clipDistMask &= ~(1 << clipDist);
253         uint32_t clipSlot = clipDist >> 2;
254         uint32_t clipComp = clipDist & 0x3;
255         uint32_t clipAttribSlot =
256             clipSlot == 0 ? state.vertexClipCullOffset : state.vertexClipCullOffset + 1;
257 
258         simd4scalar primClipDist[3];
259         pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
260 
261         float vertClipDist[NumVerts];
262         for (uint32_t e = 0; e < NumVerts; ++e)
263         {
264             OSALIGNSIMD(float) aVertClipDist[4];
265             SIMD128::store_ps(aVertClipDist, primClipDist[e]);
266             vertClipDist[e] = aVertClipDist[clipComp];
267         };
268 
269         // setup plane equations for barycentric interpolation in the backend
270         float baryCoeff[NumVerts];
271         float last = vertClipDist[NumVerts - 1] * pRecipW[NumVerts - 1];
272         for (uint32_t e = 0; e < NumVerts - 1; ++e)
273         {
274             baryCoeff[e] = vertClipDist[e] * pRecipW[e] - last;
275         }
276         baryCoeff[NumVerts - 1] = last;
277 
278         for (uint32_t e = 0; e < NumVerts; ++e)
279         {
280             *(pUserClipBuffer++) = baryCoeff[e];
281         }
282     }
283 }
284 
285 INLINE
TransposeVertices(simd4scalar (& dst)[8],const simdscalar & src0,const simdscalar & src1,const simdscalar & src2)286 void TransposeVertices(simd4scalar (&dst)[8],
287                        const simdscalar& src0,
288                        const simdscalar& src1,
289                        const simdscalar& src2)
290 {
291     vTranspose3x8(dst, src0, src1, src2);
292 }
293 
294 INLINE
TransposeVertices(simd4scalar (& dst)[16],const simd16scalar & src0,const simd16scalar & src1,const simd16scalar & src2)295 void TransposeVertices(simd4scalar (&dst)[16],
296                        const simd16scalar& src0,
297                        const simd16scalar& src1,
298                        const simd16scalar& src2)
299 {
300     vTranspose4x16(
301         reinterpret_cast<simd16scalar(&)[4]>(dst), src0, src1, src2, _simd16_setzero_ps());
302 }
303 
304 #if KNOB_ENABLE_EARLY_RAST
305 
// Width and height of an early-rasterizer (ER) tile, derived from the
// ER_SIMD_TILE_*_SHIFT constants.  Kept as macros because they are tested
// with #if further down.
#define ER_SIMD_TILE_X_DIM (1 << (ER_SIMD_TILE_X_SHIFT))
#define ER_SIMD_TILE_Y_DIM (1 << (ER_SIMD_TILE_Y_SHIFT))
308 
309 template <typename SIMD_T>
310 struct EarlyRastHelper
311 {
312 };
313 
314 template <>
315 struct EarlyRastHelper<SIMD256>
316 {
InitShiftCntrlEarlyRastHelper317     static SIMD256::Integer InitShiftCntrl()
318     {
319         return SIMD256::set_epi32(24, 25, 26, 27, 28, 29, 30, 31);
320     }
321 };
322 
323 #if USE_SIMD16_FRONTEND
324 template <>
325 struct EarlyRastHelper<SIMD512>
326 {
InitShiftCntrlEarlyRastHelper327     static SIMD512::Integer InitShiftCntrl()
328     {
329         return SIMD512::set_epi32(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
330     }
331 };
332 
333 #endif
334 //////////////////////////////////////////////////////////////////////////
335 /// @brief Early Rasterizer (ER); triangles that fit small (e.g. 4x4) tile
336 ///        (ER tile) can be rasterized as early as in binner to check if
337 ///        they cover any  pixels. If not - the triangles can be
338 ///        culled in binner.
339 ///
340 /// @param er_bbox - coordinates of ER tile for each triangle
341 /// @param vAi - A coefficients of triangle edges
342 /// @param vBi - B coefficients of triangle edges
343 /// @param vXi - X coordinates of triangle vertices
344 /// @param vYi - Y coordinates of triangle vertices
345 /// @param frontWindingTris - mask indicating CCW/CW triangles
346 /// @param triMask - mask for valid SIMD lanes (triangles)
347 /// @param oneTileMask - defines triangles for ER to work on
348 ///                      (tris that fit into ER tile)
349 template <typename SIMD_T, uint32_t SIMD_WIDTH, typename CT>
EarlyRasterizer(DRAW_CONTEXT * pDC,SIMDBBOX_T<SIMD_T> & er_bbox,Integer<SIMD_T> (& vAi)[3],Integer<SIMD_T> (& vBi)[3],Integer<SIMD_T> (& vXi)[3],Integer<SIMD_T> (& vYi)[3],uint32_t cwTrisMask,uint32_t triMask,uint32_t oneTileMask)350 uint32_t SIMDCALL EarlyRasterizer(DRAW_CONTEXT*       pDC,
351                                   SIMDBBOX_T<SIMD_T>& er_bbox,
352                                   Integer<SIMD_T> (&vAi)[3],
353                                   Integer<SIMD_T> (&vBi)[3],
354                                   Integer<SIMD_T> (&vXi)[3],
355                                   Integer<SIMD_T> (&vYi)[3],
356                                   uint32_t cwTrisMask,
357                                   uint32_t triMask,
358                                   uint32_t oneTileMask)
359 {
360     // step to pixel center of top-left pixel of the triangle bbox
361     Integer<SIMD_T> vTopLeftX =
362         SIMD_T::template slli_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(er_bbox.xmin);
363     vTopLeftX = SIMD_T::add_epi32(vTopLeftX, SIMD_T::set1_epi32(FIXED_POINT_SCALE / 2));
364 
365     Integer<SIMD_T> vTopLeftY =
366         SIMD_T::template slli_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(er_bbox.ymin);
367     vTopLeftY = SIMD_T::add_epi32(vTopLeftY, SIMD_T::set1_epi32(FIXED_POINT_SCALE / 2));
368 
369     // negate A and B for CW tris
370     Integer<SIMD_T> vNegA0 = SIMD_T::mullo_epi32(vAi[0], SIMD_T::set1_epi32(-1));
371     Integer<SIMD_T> vNegA1 = SIMD_T::mullo_epi32(vAi[1], SIMD_T::set1_epi32(-1));
372     Integer<SIMD_T> vNegA2 = SIMD_T::mullo_epi32(vAi[2], SIMD_T::set1_epi32(-1));
373     Integer<SIMD_T> vNegB0 = SIMD_T::mullo_epi32(vBi[0], SIMD_T::set1_epi32(-1));
374     Integer<SIMD_T> vNegB1 = SIMD_T::mullo_epi32(vBi[1], SIMD_T::set1_epi32(-1));
375     Integer<SIMD_T> vNegB2 = SIMD_T::mullo_epi32(vBi[2], SIMD_T::set1_epi32(-1));
376 
377     RDTSC_EVENT(pDC->pContext->pBucketMgr,
378                 FEEarlyRastEnter,
379                 _mm_popcnt_u32(oneTileMask & triMask),
380                 0);
381 
382     Integer<SIMD_T> vShiftCntrl = EarlyRastHelper<SIMD_T>::InitShiftCntrl();
383     Integer<SIMD_T> vCwTris     = SIMD_T::set1_epi32(cwTrisMask);
384     Integer<SIMD_T> vMask       = SIMD_T::sllv_epi32(vCwTris, vShiftCntrl);
385 
386     vAi[0] = SIMD_T::castps_si(SIMD_T::blendv_ps(
387         SIMD_T::castsi_ps(vAi[0]), SIMD_T::castsi_ps(vNegA0), SIMD_T::castsi_ps(vMask)));
388     vAi[1] = SIMD_T::castps_si(SIMD_T::blendv_ps(
389         SIMD_T::castsi_ps(vAi[1]), SIMD_T::castsi_ps(vNegA1), SIMD_T::castsi_ps(vMask)));
390     vAi[2] = SIMD_T::castps_si(SIMD_T::blendv_ps(
391         SIMD_T::castsi_ps(vAi[2]), SIMD_T::castsi_ps(vNegA2), SIMD_T::castsi_ps(vMask)));
392     vBi[0] = SIMD_T::castps_si(SIMD_T::blendv_ps(
393         SIMD_T::castsi_ps(vBi[0]), SIMD_T::castsi_ps(vNegB0), SIMD_T::castsi_ps(vMask)));
394     vBi[1] = SIMD_T::castps_si(SIMD_T::blendv_ps(
395         SIMD_T::castsi_ps(vBi[1]), SIMD_T::castsi_ps(vNegB1), SIMD_T::castsi_ps(vMask)));
396     vBi[2] = SIMD_T::castps_si(SIMD_T::blendv_ps(
397         SIMD_T::castsi_ps(vBi[2]), SIMD_T::castsi_ps(vNegB2), SIMD_T::castsi_ps(vMask)));
398 
399     // evaluate edge equations at top-left pixel
400     Integer<SIMD_T> vDeltaX0 = SIMD_T::sub_epi32(vTopLeftX, vXi[0]);
401     Integer<SIMD_T> vDeltaX1 = SIMD_T::sub_epi32(vTopLeftX, vXi[1]);
402     Integer<SIMD_T> vDeltaX2 = SIMD_T::sub_epi32(vTopLeftX, vXi[2]);
403 
404     Integer<SIMD_T> vDeltaY0 = SIMD_T::sub_epi32(vTopLeftY, vYi[0]);
405     Integer<SIMD_T> vDeltaY1 = SIMD_T::sub_epi32(vTopLeftY, vYi[1]);
406     Integer<SIMD_T> vDeltaY2 = SIMD_T::sub_epi32(vTopLeftY, vYi[2]);
407 
408     Integer<SIMD_T> vAX0 = SIMD_T::mullo_epi32(vAi[0], vDeltaX0);
409     Integer<SIMD_T> vAX1 = SIMD_T::mullo_epi32(vAi[1], vDeltaX1);
410     Integer<SIMD_T> vAX2 = SIMD_T::mullo_epi32(vAi[2], vDeltaX2);
411 
412     Integer<SIMD_T> vBY0 = SIMD_T::mullo_epi32(vBi[0], vDeltaY0);
413     Integer<SIMD_T> vBY1 = SIMD_T::mullo_epi32(vBi[1], vDeltaY1);
414     Integer<SIMD_T> vBY2 = SIMD_T::mullo_epi32(vBi[2], vDeltaY2);
415 
416     Integer<SIMD_T> vEdge0 = SIMD_T::add_epi32(vAX0, vBY0);
417     Integer<SIMD_T> vEdge1 = SIMD_T::add_epi32(vAX1, vBY1);
418     Integer<SIMD_T> vEdge2 = SIMD_T::add_epi32(vAX2, vBY2);
419 
420     vEdge0 = SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vEdge0);
421     vEdge1 = SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vEdge1);
422     vEdge2 = SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vEdge2);
423 
424     // top left rule
425     Integer<SIMD_T> vEdgeAdjust0 = SIMD_T::sub_epi32(vEdge0, SIMD_T::set1_epi32(1));
426     Integer<SIMD_T> vEdgeAdjust1 = SIMD_T::sub_epi32(vEdge1, SIMD_T::set1_epi32(1));
427     Integer<SIMD_T> vEdgeAdjust2 = SIMD_T::sub_epi32(vEdge2, SIMD_T::set1_epi32(1));
428 
429     // vA < 0
430     vEdge0 = SIMD_T::castps_si(SIMD_T::blendv_ps(
431         SIMD_T::castsi_ps(vEdge0), SIMD_T::castsi_ps(vEdgeAdjust0), SIMD_T::castsi_ps(vAi[0])));
432     vEdge1 = SIMD_T::castps_si(SIMD_T::blendv_ps(
433         SIMD_T::castsi_ps(vEdge1), SIMD_T::castsi_ps(vEdgeAdjust1), SIMD_T::castsi_ps(vAi[1])));
434     vEdge2 = SIMD_T::castps_si(SIMD_T::blendv_ps(
435         SIMD_T::castsi_ps(vEdge2), SIMD_T::castsi_ps(vEdgeAdjust2), SIMD_T::castsi_ps(vAi[2])));
436 
437     // vA == 0 && vB < 0
438     Integer<SIMD_T> vCmp0 = SIMD_T::cmpeq_epi32(vAi[0], SIMD_T::setzero_si());
439     Integer<SIMD_T> vCmp1 = SIMD_T::cmpeq_epi32(vAi[1], SIMD_T::setzero_si());
440     Integer<SIMD_T> vCmp2 = SIMD_T::cmpeq_epi32(vAi[2], SIMD_T::setzero_si());
441 
442     vCmp0 = SIMD_T::and_si(vCmp0, vBi[0]);
443     vCmp1 = SIMD_T::and_si(vCmp1, vBi[1]);
444     vCmp2 = SIMD_T::and_si(vCmp2, vBi[2]);
445 
446     vEdge0 = SIMD_T::castps_si(SIMD_T::blendv_ps(
447         SIMD_T::castsi_ps(vEdge0), SIMD_T::castsi_ps(vEdgeAdjust0), SIMD_T::castsi_ps(vCmp0)));
448     vEdge1 = SIMD_T::castps_si(SIMD_T::blendv_ps(
449         SIMD_T::castsi_ps(vEdge1), SIMD_T::castsi_ps(vEdgeAdjust1), SIMD_T::castsi_ps(vCmp1)));
450     vEdge2 = SIMD_T::castps_si(SIMD_T::blendv_ps(
451         SIMD_T::castsi_ps(vEdge2), SIMD_T::castsi_ps(vEdgeAdjust2), SIMD_T::castsi_ps(vCmp2)));
452 
453 #if ER_SIMD_TILE_X_DIM == 4 && ER_SIMD_TILE_Y_DIM == 4
454     // Go down
455     // coverage pixel 0
456     Integer<SIMD_T> vMask0 = SIMD_T::and_si(vEdge0, vEdge1);
457     vMask0                 = SIMD_T::and_si(vMask0, vEdge2);
458 
459     // coverage pixel 1
460     Integer<SIMD_T> vEdge0N = SIMD_T::add_epi32(vEdge0, vBi[0]);
461     Integer<SIMD_T> vEdge1N = SIMD_T::add_epi32(vEdge1, vBi[1]);
462     Integer<SIMD_T> vEdge2N = SIMD_T::add_epi32(vEdge2, vBi[2]);
463     Integer<SIMD_T> vMask1  = SIMD_T::and_si(vEdge0N, vEdge1N);
464     vMask1                  = SIMD_T::and_si(vMask1, vEdge2N);
465 
466     // coverage pixel 2
467     vEdge0N                = SIMD_T::add_epi32(vEdge0N, vBi[0]);
468     vEdge1N                = SIMD_T::add_epi32(vEdge1N, vBi[1]);
469     vEdge2N                = SIMD_T::add_epi32(vEdge2N, vBi[2]);
470     Integer<SIMD_T> vMask2 = SIMD_T::and_si(vEdge0N, vEdge1N);
471     vMask2                 = SIMD_T::and_si(vMask2, vEdge2N);
472 
473     // coverage pixel 3
474     vEdge0N                = SIMD_T::add_epi32(vEdge0N, vBi[0]);
475     vEdge1N                = SIMD_T::add_epi32(vEdge1N, vBi[1]);
476     vEdge2N                = SIMD_T::add_epi32(vEdge2N, vBi[2]);
477     Integer<SIMD_T> vMask3 = SIMD_T::and_si(vEdge0N, vEdge1N);
478     vMask3                 = SIMD_T::and_si(vMask3, vEdge2N);
479 
480     // One step to the right and then up
481 
482     // coverage pixel 4
483     vEdge0N                = SIMD_T::add_epi32(vEdge0N, vAi[0]);
484     vEdge1N                = SIMD_T::add_epi32(vEdge1N, vAi[1]);
485     vEdge2N                = SIMD_T::add_epi32(vEdge2N, vAi[2]);
486     Integer<SIMD_T> vMask4 = SIMD_T::and_si(vEdge0N, vEdge1N);
487     vMask4                 = SIMD_T::and_si(vMask4, vEdge2N);
488 
489     // coverage pixel 5
490     vEdge0N                = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
491     vEdge1N                = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
492     vEdge2N                = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
493     Integer<SIMD_T> vMask5 = SIMD_T::and_si(vEdge0N, vEdge1N);
494     vMask5                 = SIMD_T::and_si(vMask5, vEdge2N);
495 
496     // coverage pixel 6
497     vEdge0N                = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
498     vEdge1N                = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
499     vEdge2N                = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
500     Integer<SIMD_T> vMask6 = SIMD_T::and_si(vEdge0N, vEdge1N);
501     vMask6                 = SIMD_T::and_si(vMask6, vEdge2N);
502 
503     // coverage pixel 7
504     vEdge0N                = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
505     vEdge1N                = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
506     vEdge2N                = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
507     Integer<SIMD_T> vMask7 = SIMD_T::and_si(vEdge0N, vEdge1N);
508     vMask7                 = SIMD_T::and_si(vMask7, vEdge2N);
509 
510     Integer<SIMD_T> vLit1 = SIMD_T::or_si(vMask0, vMask1);
511     vLit1                 = SIMD_T::or_si(vLit1, vMask2);
512     vLit1                 = SIMD_T::or_si(vLit1, vMask3);
513     vLit1                 = SIMD_T::or_si(vLit1, vMask4);
514     vLit1                 = SIMD_T::or_si(vLit1, vMask5);
515     vLit1                 = SIMD_T::or_si(vLit1, vMask6);
516     vLit1                 = SIMD_T::or_si(vLit1, vMask7);
517 
518     // Step to the right and go down again
519 
520     // coverage pixel 0
521     vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
522     vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
523     vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
524     vMask0  = SIMD_T::and_si(vEdge0N, vEdge1N);
525     vMask0  = SIMD_T::and_si(vMask0, vEdge2N);
526 
527     // coverage pixel 1
528     vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
529     vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
530     vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
531     vMask1  = SIMD_T::and_si(vEdge0N, vEdge1N);
532     vMask1  = SIMD_T::and_si(vMask1, vEdge2N);
533 
534     // coverage pixel 2
535     vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
536     vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
537     vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
538     vMask2  = SIMD_T::and_si(vEdge0N, vEdge1N);
539     vMask2  = SIMD_T::and_si(vMask2, vEdge2N);
540 
541     // coverage pixel 3
542     vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
543     vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
544     vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
545     vMask3  = SIMD_T::and_si(vEdge0N, vEdge1N);
546     vMask3  = SIMD_T::and_si(vMask3, vEdge2N);
547 
548     // And for the last time - to the right and up
549 
550     // coverage pixel 4
551     vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
552     vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
553     vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
554     vMask4  = SIMD_T::and_si(vEdge0N, vEdge1N);
555     vMask4  = SIMD_T::and_si(vMask4, vEdge2N);
556 
557     // coverage pixel 5
558     vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
559     vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
560     vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
561     vMask5  = SIMD_T::and_si(vEdge0N, vEdge1N);
562     vMask5  = SIMD_T::and_si(vMask5, vEdge2N);
563 
564     // coverage pixel 6
565     vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
566     vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
567     vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
568     vMask6  = SIMD_T::and_si(vEdge0N, vEdge1N);
569     vMask6  = SIMD_T::and_si(vMask6, vEdge2N);
570 
571     // coverage pixel 7
572     vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
573     vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
574     vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
575     vMask7  = SIMD_T::and_si(vEdge0N, vEdge1N);
576     vMask7  = SIMD_T::and_si(vMask7, vEdge2N);
577 
578     Integer<SIMD_T> vLit2 = SIMD_T::or_si(vMask0, vMask1);
579     vLit2                 = SIMD_T::or_si(vLit2, vMask2);
580     vLit2                 = SIMD_T::or_si(vLit2, vMask3);
581     vLit2                 = SIMD_T::or_si(vLit2, vMask4);
582     vLit2                 = SIMD_T::or_si(vLit2, vMask5);
583     vLit2                 = SIMD_T::or_si(vLit2, vMask6);
584     vLit2                 = SIMD_T::or_si(vLit2, vMask7);
585 
586     Integer<SIMD_T> vLit = SIMD_T::or_si(vLit1, vLit2);
587 
588 #else
589     // Generic algorithm sweeping in row by row order
590     Integer<SIMD_T> vRowMask[ER_SIMD_TILE_Y_DIM];
591 
592     Integer<SIMD_T> vEdge0N = vEdge0;
593     Integer<SIMD_T> vEdge1N = vEdge1;
594     Integer<SIMD_T> vEdge2N = vEdge2;
595 
596     for (uint32_t row = 0; row < ER_SIMD_TILE_Y_DIM; row++)
597     {
598         // Store edge values at the beginning of the row
599         Integer<SIMD_T> vRowEdge0 = vEdge0N;
600         Integer<SIMD_T> vRowEdge1 = vEdge1N;
601         Integer<SIMD_T> vRowEdge2 = vEdge2N;
602 
603         Integer<SIMD_T> vColMask[ER_SIMD_TILE_X_DIM];
604 
605         for (uint32_t col = 0; col < ER_SIMD_TILE_X_DIM; col++)
606         {
607             vColMask[col] = SIMD_T::and_si(vEdge0N, vEdge1N);
608             vColMask[col] = SIMD_T::and_si(vColMask[col], vEdge2N);
609 
610             vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
611             vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
612             vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
613         }
614         vRowMask[row] = vColMask[0];
615         for (uint32_t col = 1; col < ER_SIMD_TILE_X_DIM; col++)
616         {
617             vRowMask[row] = SIMD_T::or_si(vRowMask[row], vColMask[col]);
618         }
619         // Restore values and go to the next row
620         vEdge0N = vRowEdge0;
621         vEdge1N = vRowEdge1;
622         vEdge2N = vRowEdge2;
623 
624         vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
625         vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
626         vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
627     }
628 
629     // compress all masks
630     Integer<SIMD_T> vLit = vRowMask[0];
631     for (uint32_t row = 1; row < ER_SIMD_TILE_Y_DIM; row++)
632     {
633         vLit = SIMD_T::or_si(vLit, vRowMask[row]);
634     }
635 
636 #endif
637     // Check which triangles has any pixel lit
638     uint32_t maskLit   = SIMD_T::movemask_ps(SIMD_T::castsi_ps(vLit));
639     uint32_t maskUnlit = ~maskLit & oneTileMask;
640 
641     uint32_t oldTriMask = triMask;
642     triMask &= ~maskUnlit;
643 
644     if (triMask ^ oldTriMask)
645     {
646         RDTSC_EVENT(pDC->pContext->pBucketMgr,
647                     FEEarlyRastExit,
648                     _mm_popcnt_u32(triMask & oneTileMask),
649                     0);
650     }
651     return triMask;
652 }
653 
654 #endif // Early rasterizer
655 
656 //////////////////////////////////////////////////////////////////////////
657 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
658 ///        culling, viewport transform, etc.
659 /// @param pDC - pointer to draw context.
660 /// @param pa - The primitive assembly object.
661 /// @param workerId - thread's worker id. Every thread has a unique id.
662 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
663 /// @param primID - Primitive ID for each triangle.
664 /// @param viewportIdx - viewport array index for each triangle.
665 /// @tparam CT - ConservativeRastFETraits
666 template <typename SIMD_T, uint32_t SIMD_WIDTH, typename CT>
BinTrianglesImpl(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,Vec4<SIMD_T> tri[3],uint32_t triMask,Integer<SIMD_T> const & primID,Integer<SIMD_T> const & viewportIdx,Integer<SIMD_T> const & rtIdx)667 void SIMDCALL BinTrianglesImpl(DRAW_CONTEXT*          pDC,
668                                PA_STATE&              pa,
669                                uint32_t               workerId,
670                                Vec4<SIMD_T>           tri[3],
671                                uint32_t               triMask,
672                                Integer<SIMD_T> const& primID,
673                                Integer<SIMD_T> const& viewportIdx,
674                                Integer<SIMD_T> const& rtIdx)
675 {
676     const uint32_t* aRTAI = reinterpret_cast<const uint32_t*>(&rtIdx);
677 
678     RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEBinTriangles, pDC->drawId);
679 
680     const API_STATE&          state     = GetApiState(pDC);
681     const SWR_RASTSTATE&      rastState = state.rastState;
682     const SWR_FRONTEND_STATE& feState   = state.frontendState;
683 
684     MacroTileMgr* pTileMgr = pDC->pTileMgr;
685 
686     Float<SIMD_T> vRecipW0 = SIMD_T::set1_ps(1.0f);
687     Float<SIMD_T> vRecipW1 = SIMD_T::set1_ps(1.0f);
688     Float<SIMD_T> vRecipW2 = SIMD_T::set1_ps(1.0f);
689 
690     if (feState.vpTransformDisable)
691     {
692         // RHW is passed in directly when VP transform is disabled
693         vRecipW0 = tri[0].v[3];
694         vRecipW1 = tri[1].v[3];
695         vRecipW2 = tri[2].v[3];
696     }
697     else
698     {
699         // Perspective divide
700         vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[0].w);
701         vRecipW1 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[1].w);
702         vRecipW2 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[2].w);
703 
704         tri[0].v[0] = SIMD_T::mul_ps(tri[0].v[0], vRecipW0);
705         tri[1].v[0] = SIMD_T::mul_ps(tri[1].v[0], vRecipW1);
706         tri[2].v[0] = SIMD_T::mul_ps(tri[2].v[0], vRecipW2);
707 
708         tri[0].v[1] = SIMD_T::mul_ps(tri[0].v[1], vRecipW0);
709         tri[1].v[1] = SIMD_T::mul_ps(tri[1].v[1], vRecipW1);
710         tri[2].v[1] = SIMD_T::mul_ps(tri[2].v[1], vRecipW2);
711 
712         tri[0].v[2] = SIMD_T::mul_ps(tri[0].v[2], vRecipW0);
713         tri[1].v[2] = SIMD_T::mul_ps(tri[1].v[2], vRecipW1);
714         tri[2].v[2] = SIMD_T::mul_ps(tri[2].v[2], vRecipW2);
715 
716         // Viewport transform to screen space coords
717         if (pa.viewportArrayActive)
718         {
719             viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
720         }
721         else
722         {
723             viewportTransform<3>(tri, state.vpMatrices);
724         }
725     }
726 
727     // Adjust for pixel center location
728     Float<SIMD_T> offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
729 
730     tri[0].x = SIMD_T::add_ps(tri[0].x, offset);
731     tri[0].y = SIMD_T::add_ps(tri[0].y, offset);
732 
733     tri[1].x = SIMD_T::add_ps(tri[1].x, offset);
734     tri[1].y = SIMD_T::add_ps(tri[1].y, offset);
735 
736     tri[2].x = SIMD_T::add_ps(tri[2].x, offset);
737     tri[2].y = SIMD_T::add_ps(tri[2].y, offset);
738 
739     // Set vXi, vYi to required fixed point precision
740     Integer<SIMD_T> vXi[3], vYi[3];
741     FPToFixedPoint<SIMD_T>(tri, vXi, vYi);
742 
743     // triangle setup
744     Integer<SIMD_T> vAi[3], vBi[3];
745     triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
746 
747     // determinant
748     Integer<SIMD_T> vDet[2];
749     calcDeterminantIntVertical(vAi, vBi, vDet);
750 
751     // cull zero area
752     uint32_t maskLo =
753         SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[0], SIMD_T::setzero_si())));
754     uint32_t maskHi =
755         SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[1], SIMD_T::setzero_si())));
756 
757     uint32_t cullZeroAreaMask = maskLo | (maskHi << (SIMD_WIDTH / 2));
758 
759     // don't cull degenerate triangles if we're conservatively rasterizing
760     uint32_t origTriMask = triMask;
761     if (rastState.fillMode == SWR_FILLMODE_SOLID && !CT::IsConservativeT::value)
762     {
763         triMask &= ~cullZeroAreaMask;
764     }
765 
766     // determine front winding tris
767     // CW  +det
768     // CCW det < 0;
769     // 0 area triangles are marked as backfacing regardless of winding order,
770     // which is required behavior for conservative rast and wireframe rendering
771     uint32_t frontWindingTris;
772     if (rastState.frontWinding == SWR_FRONTWINDING_CW)
773     {
774         maskLo = SIMD_T::movemask_pd(
775             SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[0], SIMD_T::setzero_si())));
776         maskHi = SIMD_T::movemask_pd(
777             SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[1], SIMD_T::setzero_si())));
778     }
779     else
780     {
781         maskLo = SIMD_T::movemask_pd(
782             SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[0])));
783         maskHi = SIMD_T::movemask_pd(
784             SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[1])));
785     }
786     frontWindingTris = maskLo | (maskHi << (SIMD_WIDTH / 2));
787 
788     // cull
789     uint32_t cullTris;
790     switch ((SWR_CULLMODE)rastState.cullMode)
791     {
792     case SWR_CULLMODE_BOTH:
793         cullTris = 0xffffffff;
794         break;
795     case SWR_CULLMODE_NONE:
796         cullTris = 0x0;
797         break;
798     case SWR_CULLMODE_FRONT:
799         cullTris = frontWindingTris;
800         break;
801         // 0 area triangles are marked as backfacing, which is required behavior for conservative
802         // rast
803     case SWR_CULLMODE_BACK:
804         cullTris = ~frontWindingTris;
805         break;
806     default:
807         SWR_INVALID("Invalid cull mode: %d", rastState.cullMode);
808         cullTris = 0x0;
809         break;
810     }
811 
812     triMask &= ~cullTris;
813 
814     if (origTriMask ^ triMask)
815     {
816         RDTSC_EVENT(pDC->pContext->pBucketMgr,
817                     FECullZeroAreaAndBackface,
818                     _mm_popcnt_u32(origTriMask ^ triMask),
819                     0);
820     }
821 
822     AR_EVENT(CullInfoEvent(pDC->drawId, cullZeroAreaMask, cullTris, origTriMask));
823 
824     /// Note: these variable initializations must stay above any 'goto endBenTriangles'
825     // compute per tri backface
826     uint32_t        frontFaceMask  = frontWindingTris;
827     uint32_t*       pPrimID        = (uint32_t*)&primID;
828     const uint32_t* pViewportIndex = (uint32_t*)&viewportIdx;
829     uint32_t        triIndex       = 0;
830 
831     uint32_t      edgeEnable;
832     PFN_WORK_FUNC pfnWork;
833     if (CT::IsConservativeT::value)
834     {
835         // determine which edges of the degenerate tri, if any, are valid to rasterize.
836         // used to call the appropriate templated rasterizer function
837         if (cullZeroAreaMask > 0)
838         {
839             // e0 = v1-v0
840             const Integer<SIMD_T> x0x1Mask = SIMD_T::cmpeq_epi32(vXi[0], vXi[1]);
841             const Integer<SIMD_T> y0y1Mask = SIMD_T::cmpeq_epi32(vYi[0], vYi[1]);
842 
843             uint32_t e0Mask =
844                 SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x0x1Mask, y0y1Mask)));
845 
846             // e1 = v2-v1
847             const Integer<SIMD_T> x1x2Mask = SIMD_T::cmpeq_epi32(vXi[1], vXi[2]);
848             const Integer<SIMD_T> y1y2Mask = SIMD_T::cmpeq_epi32(vYi[1], vYi[2]);
849 
850             uint32_t e1Mask =
851                 SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x1x2Mask, y1y2Mask)));
852 
853             // e2 = v0-v2
854             // if v0 == v1 & v1 == v2, v0 == v2
855             uint32_t e2Mask = e0Mask & e1Mask;
856             SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");
857 
858             // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
859             // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
860             e0Mask = pdep_u32(e0Mask, 0x00249249);
861 
862             // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
863             e1Mask = pdep_u32(e1Mask, 0x00492492);
864 
865             // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
866             e2Mask = pdep_u32(e2Mask, 0x00924924);
867 
868             edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
869         }
870         else
871         {
872             edgeEnable = 0x00FFFFFF;
873         }
874     }
875     else
876     {
877         // degenerate triangles won't be sent to rasterizer; just enable all edges
878         pfnWork = GetRasterizerFunc(rastState.sampleCount,
879                                     rastState.bIsCenterPattern,
880                                     (rastState.conservativeRast > 0),
881                                     (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage,
882                                     EdgeValToEdgeState(ALL_EDGES_VALID),
883                                     (state.scissorsTileAligned == false));
884     }
885 
886     SIMDBBOX_T<SIMD_T> bbox;
887 
888     if (!triMask)
889     {
890         goto endBinTriangles;
891     }
892 
893     // Calc bounding box of triangles
894     calcBoundingBoxIntVertical<SIMD_T, CT>(vXi, vYi, bbox);
895 
896     // determine if triangle falls between pixel centers and discard
897     // only discard for non-MSAA case and when conservative rast is disabled
898     // (xmin + 127) & ~255
899     // (xmax + 128) & ~255
900     if ((rastState.sampleCount == SWR_MULTISAMPLE_1X || rastState.bIsCenterPattern) &&
901         (!CT::IsConservativeT::value))
902     {
903         origTriMask = triMask;
904 
905         int cullCenterMask;
906 
907         {
908             Integer<SIMD_T> xmin = SIMD_T::add_epi32(bbox.xmin, SIMD_T::set1_epi32(127));
909             xmin                 = SIMD_T::and_si(xmin, SIMD_T::set1_epi32(~255));
910             Integer<SIMD_T> xmax = SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(128));
911             xmax                 = SIMD_T::and_si(xmax, SIMD_T::set1_epi32(~255));
912 
913             Integer<SIMD_T> vMaskH = SIMD_T::cmpeq_epi32(xmin, xmax);
914 
915             Integer<SIMD_T> ymin = SIMD_T::add_epi32(bbox.ymin, SIMD_T::set1_epi32(127));
916             ymin                 = SIMD_T::and_si(ymin, SIMD_T::set1_epi32(~255));
917             Integer<SIMD_T> ymax = SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(128));
918             ymax                 = SIMD_T::and_si(ymax, SIMD_T::set1_epi32(~255));
919 
920             Integer<SIMD_T> vMaskV = SIMD_T::cmpeq_epi32(ymin, ymax);
921 
922             vMaskV         = SIMD_T::or_si(vMaskH, vMaskV);
923             cullCenterMask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(vMaskV));
924         }
925 
926         triMask &= ~cullCenterMask;
927 
928         if (origTriMask ^ triMask)
929         {
930             RDTSC_EVENT(pDC->pContext->pBucketMgr,
931                         FECullBetweenCenters,
932                         _mm_popcnt_u32(origTriMask ^ triMask),
933                         0);
934         }
935     }
936 
937     // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is
938     // exclusive. Gather the AOS effective scissor rects based on the per-prim VP index.
939     /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
940     {
941         Integer<SIMD_T> scisXmin, scisYmin, scisXmax, scisYmax;
942         if (pa.viewportArrayActive)
943 
944         {
945             GatherScissors(&state.scissorsInFixedPoint[0],
946                            pViewportIndex,
947                            scisXmin,
948                            scisYmin,
949                            scisXmax,
950                            scisYmax);
951         }
952         else // broadcast fast path for non-VPAI case.
953         {
954             scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
955             scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
956             scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
957             scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
958         }
959 
960         // Make triangle bbox inclusive
961         bbox.xmax = SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1));
962         bbox.ymax = SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1));
963 
964         bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
965         bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
966         bbox.xmax = SIMD_T::min_epi32(bbox.xmax, scisXmax);
967         bbox.ymax = SIMD_T::min_epi32(bbox.ymax, scisYmax);
968     }
969 
970     if (CT::IsConservativeT::value)
971     {
972         // in the case where a degenerate triangle is on a scissor edge, we need to make sure the
973         // primitive bbox has some area. Bump the xmax/ymax edges out
974 
975         Integer<SIMD_T> topEqualsBottom = SIMD_T::cmpeq_epi32(bbox.ymin, bbox.ymax);
976         bbox.ymax                       = SIMD_T::blendv_epi32(
977             bbox.ymax, SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), topEqualsBottom);
978 
979         Integer<SIMD_T> leftEqualsRight = SIMD_T::cmpeq_epi32(bbox.xmin, bbox.xmax);
980         bbox.xmax                       = SIMD_T::blendv_epi32(
981             bbox.xmax, SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), leftEqualsRight);
982     }
983 
984     // Cull tris completely outside scissor
985     {
986         Integer<SIMD_T> maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
987         Integer<SIMD_T> maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
988         Integer<SIMD_T> maskOutsideScissorXY =
989             SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
990         uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
991         triMask                     = triMask & ~maskOutsideScissor;
992     }
993 
994 #if KNOB_ENABLE_EARLY_RAST
995     if (rastState.sampleCount == SWR_MULTISAMPLE_1X && !CT::IsConservativeT::value)
996     {
997         // Try early rasterization - culling small triangles which do not cover any pixels
998 
999         // convert to ER tiles
1000         SIMDBBOX_T<SIMD_T> er_bbox;
1001 
1002         er_bbox.xmin =
1003             SIMD_T::template srai_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(bbox.xmin);
1004         er_bbox.xmax =
1005             SIMD_T::template srai_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(bbox.xmax);
1006         er_bbox.ymin =
1007             SIMD_T::template srai_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(bbox.ymin);
1008         er_bbox.ymax =
1009             SIMD_T::template srai_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(bbox.ymax);
1010 
1011         Integer<SIMD_T> vTileX = SIMD_T::cmpeq_epi32(er_bbox.xmin, er_bbox.xmax);
1012         Integer<SIMD_T> vTileY = SIMD_T::cmpeq_epi32(er_bbox.ymin, er_bbox.ymax);
1013 
1014         // Take only triangles that fit into ER tile
1015         uint32_t oneTileMask =
1016             triMask & SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(vTileX, vTileY)));
1017 
1018         if (oneTileMask)
1019         {
1020             // determine CW tris (det > 0)
1021             uint32_t maskCwLo = SIMD_T::movemask_pd(
1022                 SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[0], SIMD_T::setzero_si())));
1023             uint32_t maskCwHi = SIMD_T::movemask_pd(
1024                 SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[1], SIMD_T::setzero_si())));
1025             uint32_t cwTrisMask = maskCwLo | (maskCwHi << (SIMD_WIDTH / 2));
1026 
1027             // Try early rasterization
1028             triMask = EarlyRasterizer<SIMD_T, SIMD_WIDTH, CT>(
1029                 pDC, er_bbox, vAi, vBi, vXi, vYi, cwTrisMask, triMask, oneTileMask);
1030 
1031             if (!triMask)
1032             {
1033                 RDTSC_END(pDC->pContext->pBucketMgr, FEBinTriangles, 1);
1034                 return;
1035             }
1036         }
1037     }
1038 #endif
1039 
1040 endBinTriangles:
1041 
1042 
1043     if (!triMask)
1044     {
1045         RDTSC_END(pDC->pContext->pBucketMgr, FEBinTriangles, 1);
1046         return;
1047     }
1048 
1049     // Send surviving triangles to the line or point binner based on fill mode
1050     if (rastState.fillMode == SWR_FILLMODE_WIREFRAME)
1051     {
1052         // Simple non-conformant wireframe mode, useful for debugging
1053         // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
1054         Vec4<SIMD_T>  line[2];
1055         Float<SIMD_T> recipW[2];
1056 
1057         line[0]   = tri[0];
1058         line[1]   = tri[1];
1059         recipW[0] = vRecipW0;
1060         recipW[1] = vRecipW1;
1061 
1062         BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(
1063             pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx);
1064 
1065         line[0]   = tri[1];
1066         line[1]   = tri[2];
1067         recipW[0] = vRecipW1;
1068         recipW[1] = vRecipW2;
1069 
1070         BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(
1071             pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx);
1072 
1073         line[0]   = tri[2];
1074         line[1]   = tri[0];
1075         recipW[0] = vRecipW2;
1076         recipW[1] = vRecipW0;
1077 
1078         BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(
1079             pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx);
1080 
1081         RDTSC_END(pDC->pContext->pBucketMgr, FEBinTriangles, 1);
1082         return;
1083     }
1084     else if (rastState.fillMode == SWR_FILLMODE_POINT)
1085     {
1086         // Bin 3 points
1087         BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(
1088             pDC, pa, workerId, &tri[0], triMask, primID, viewportIdx, rtIdx);
1089         BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(
1090             pDC, pa, workerId, &tri[1], triMask, primID, viewportIdx, rtIdx);
1091         BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(
1092             pDC, pa, workerId, &tri[2], triMask, primID, viewportIdx, rtIdx);
1093 
1094         RDTSC_END(pDC->pContext->pBucketMgr, FEBinTriangles, 1);
1095         return;
1096     }
1097 
1098     // Convert triangle bbox to macrotile units.
1099     bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
1100     bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
1101     bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
1102     bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
1103 
1104     OSALIGNSIMD16(uint32_t)
1105     aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
1106 
1107     SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTLeft), bbox.xmin);
1108     SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTRight), bbox.xmax);
1109     SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTTop), bbox.ymin);
1110     SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTBottom), bbox.ymax);
1111 
1112     // transpose verts needed for backend
1113     /// @todo modify BE to take non-transformed verts
1114     OSALIGNSIMD16(simd4scalar) vHorizX[SIMD_WIDTH];
1115     OSALIGNSIMD16(simd4scalar) vHorizY[SIMD_WIDTH];
1116     OSALIGNSIMD16(simd4scalar) vHorizZ[SIMD_WIDTH];
1117     OSALIGNSIMD16(simd4scalar) vHorizW[SIMD_WIDTH];
1118 
1119     TransposeVertices(vHorizX, tri[0].x, tri[1].x, tri[2].x);
1120     TransposeVertices(vHorizY, tri[0].y, tri[1].y, tri[2].y);
1121     TransposeVertices(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
1122     TransposeVertices(vHorizW, vRecipW0, vRecipW1, vRecipW2);
1123 
1124     // scan remaining valid triangles and bin each separately
1125     while (_BitScanForward((unsigned long*)&triIndex, triMask))
1126     {
1127         uint32_t linkageCount     = state.backendState.numAttributes;
1128         uint32_t numScalarAttribs = linkageCount * 4;
1129 
1130         BE_WORK work;
1131         work.type = DRAW;
1132 
1133         bool isDegenerate;
1134         if (CT::IsConservativeT::value)
1135         {
1136             // only rasterize valid edges if we have a degenerate primitive
1137             int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
1138             work.pfnWork =
1139                 GetRasterizerFunc(rastState.sampleCount,
1140                                   rastState.bIsCenterPattern,
1141                                   (rastState.conservativeRast > 0),
1142                                   (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage,
1143                                   EdgeValToEdgeState(triEdgeEnable),
1144                                   (state.scissorsTileAligned == false));
1145 
1146             // Degenerate triangles are required to be constant interpolated
1147             isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
1148         }
1149         else
1150         {
1151             isDegenerate = false;
1152             work.pfnWork = pfnWork;
1153         }
1154 
1155         // Select attribute processor
1156         PFN_PROCESS_ATTRIBUTES pfnProcessAttribs =
1157             GetProcessAttributesFunc(3,
1158                                      state.backendState.swizzleEnable,
1159                                      state.backendState.constantInterpolationMask,
1160                                      isDegenerate);
1161 
1162         TRIANGLE_WORK_DESC& desc = work.desc.tri;
1163 
1164         desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
1165         desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
1166         desc.triFlags.viewportIndex          = pViewportIndex[triIndex];
1167 
1168         auto pArena = pDC->pArena;
1169         SWR_ASSERT(pArena != nullptr);
1170 
1171         // store active attribs
1172         float* pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1173         desc.pAttribs   = pAttribs;
1174         desc.numAttribs = linkageCount;
1175         pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);
1176 
1177         // store triangle vertex data
1178         desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
1179 
1180         SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
1181         SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
1182         SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
1183         SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
1184 
1185         // store user clip distances
1186         if (state.backendState.clipDistanceMask)
1187         {
1188             uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
1189             desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
1190             ProcessUserClipDist<3>(
1191                 state.backendState, pa, triIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer);
1192         }
1193 
1194         for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
1195         {
1196             for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
1197             {
1198 #if KNOB_ENABLE_TOSS_POINTS
1199                 if (!KNOB_TOSS_SETUP_TRIS)
1200 #endif
1201                 {
1202                     pTileMgr->enqueue(x, y, &work);
1203                 }
1204             }
1205         }
1206 
1207         triMask &= ~(1 << triIndex);
1208     }
1209 
1210     RDTSC_END(pDC->pContext->pBucketMgr, FEBinTriangles, 1);
1211 }
1212 
1213 template <typename CT>
BinTriangles(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,simdvector tri[3],uint32_t triMask,simdscalari const & primID,simdscalari const & viewportIdx,simdscalari const & rtIdx)1214 void BinTriangles(DRAW_CONTEXT*      pDC,
1215                   PA_STATE&          pa,
1216                   uint32_t           workerId,
1217                   simdvector         tri[3],
1218                   uint32_t           triMask,
1219                   simdscalari const& primID,
1220                   simdscalari const& viewportIdx,
1221                   simdscalari const& rtIdx)
1222 {
1223     BinTrianglesImpl<SIMD256, KNOB_SIMD_WIDTH, CT>(
1224         pDC, pa, workerId, tri, triMask, primID, viewportIdx, rtIdx);
1225 }
1226 
1227 #if USE_SIMD16_FRONTEND
1228 template <typename CT>
BinTriangles_simd16(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,simd16vector tri[3],uint32_t triMask,simd16scalari const & primID,simd16scalari const & viewportIdx,simd16scalari const & rtIdx)1229 void SIMDCALL BinTriangles_simd16(DRAW_CONTEXT*        pDC,
1230                                   PA_STATE&            pa,
1231                                   uint32_t             workerId,
1232                                   simd16vector         tri[3],
1233                                   uint32_t             triMask,
1234                                   simd16scalari const& primID,
1235                                   simd16scalari const& viewportIdx,
1236                                   simd16scalari const& rtIdx)
1237 {
1238     BinTrianglesImpl<SIMD512, KNOB_SIMD16_WIDTH, CT>(
1239         pDC, pa, workerId, tri, triMask, primID, viewportIdx, rtIdx);
1240 }
1241 
1242 #endif
1243 struct FEBinTrianglesChooser
1244 {
1245     typedef PFN_PROCESS_PRIMS FuncType;
1246 
1247     template <typename... ArgsB>
GetFuncFEBinTrianglesChooser1248     static FuncType GetFunc()
1249     {
1250         return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
1251     }
1252 };
1253 
1254 // Selector for correct templated BinTrinagles function
GetBinTrianglesFunc(bool IsConservative)1255 PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
1256 {
1257     return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
1258 }
1259 
1260 #if USE_SIMD16_FRONTEND
1261 struct FEBinTrianglesChooser_simd16
1262 {
1263     typedef PFN_PROCESS_PRIMS_SIMD16 FuncType;
1264 
1265     template <typename... ArgsB>
GetFuncFEBinTrianglesChooser_simd161266     static FuncType GetFunc()
1267     {
1268         return BinTriangles_simd16<ConservativeRastFETraits<ArgsB...>>;
1269     }
1270 };
1271 
1272 // Selector for correct templated BinTrinagles function
GetBinTrianglesFunc_simd16(bool IsConservative)1273 PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative)
1274 {
1275     return TemplateArgUnroller<FEBinTrianglesChooser_simd16>::GetFunc(IsConservative);
1276 }
1277 
1278 #endif
1279 
1280 template <typename SIMD_T, uint32_t SIMD_WIDTH>
BinPostSetupPointsImpl(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,Vec4<SIMD_T> prim[],uint32_t primMask,Integer<SIMD_T> const & primID,Integer<SIMD_T> const & viewportIdx,Integer<SIMD_T> const & rtIdx)1281 void BinPostSetupPointsImpl(DRAW_CONTEXT*          pDC,
1282                             PA_STATE&              pa,
1283                             uint32_t               workerId,
1284                             Vec4<SIMD_T>           prim[],
1285                             uint32_t               primMask,
1286                             Integer<SIMD_T> const& primID,
1287                             Integer<SIMD_T> const& viewportIdx,
1288                             Integer<SIMD_T> const& rtIdx)
1289 {
1290     RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEBinPoints, pDC->drawId);
1291 
1292     Vec4<SIMD_T>& primVerts = prim[0];
1293 
1294     const API_STATE&     state          = GetApiState(pDC);
1295     const SWR_RASTSTATE& rastState      = state.rastState;
1296     const uint32_t*      pViewportIndex = (uint32_t*)&viewportIdx;
1297 
1298     // Select attribute processor
1299     PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(
1300         1, state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
1301 
1302     // convert to fixed point
1303     Integer<SIMD_T> vXi, vYi;
1304 
1305     vXi = fpToFixedPointVertical<SIMD_T>(primVerts.x);
1306     vYi = fpToFixedPointVertical<SIMD_T>(primVerts.y);
1307 
1308     if (CanUseSimplePoints(pDC))
1309     {
1310         // adjust for ymin-xmin rule
1311         vXi = SIMD_T::sub_epi32(vXi, SIMD_T::set1_epi32(1));
1312         vYi = SIMD_T::sub_epi32(vYi, SIMD_T::set1_epi32(1));
1313 
1314         // cull points off the ymin-xmin edge of the viewport
1315         primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vXi));
1316         primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vYi));
1317 
1318         // compute macro tile coordinates
1319         Integer<SIMD_T> macroX = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(vXi);
1320         Integer<SIMD_T> macroY = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(vYi);
1321 
1322         OSALIGNSIMD16(uint32_t) aMacroX[SIMD_WIDTH], aMacroY[SIMD_WIDTH];
1323 
1324         SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMacroX), macroX);
1325         SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMacroY), macroY);
1326 
1327         // compute raster tile coordinates
1328         Integer<SIMD_T> rasterX =
1329             SIMD_T::template srai_epi32<KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT>(vXi);
1330         Integer<SIMD_T> rasterY =
1331             SIMD_T::template srai_epi32<KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT>(vYi);
1332 
1333         // compute raster tile relative x,y for coverage mask
1334         Integer<SIMD_T> tileAlignedX = SIMD_T::template slli_epi32<KNOB_TILE_X_DIM_SHIFT>(rasterX);
1335         Integer<SIMD_T> tileAlignedY = SIMD_T::template slli_epi32<KNOB_TILE_Y_DIM_SHIFT>(rasterY);
1336 
1337         Integer<SIMD_T> tileRelativeX =
1338             SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vXi), tileAlignedX);
1339         Integer<SIMD_T> tileRelativeY =
1340             SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vYi), tileAlignedY);
1341 
1342         OSALIGNSIMD16(uint32_t) aTileRelativeX[SIMD_WIDTH];
1343         OSALIGNSIMD16(uint32_t) aTileRelativeY[SIMD_WIDTH];
1344 
1345         SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aTileRelativeX), tileRelativeX);
1346         SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aTileRelativeY), tileRelativeY);
1347 
1348         OSALIGNSIMD16(uint32_t) aTileAlignedX[SIMD_WIDTH];
1349         OSALIGNSIMD16(uint32_t) aTileAlignedY[SIMD_WIDTH];
1350 
1351         SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aTileAlignedX), tileAlignedX);
1352         SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aTileAlignedY), tileAlignedY);
1353 
1354         OSALIGNSIMD16(float) aZ[SIMD_WIDTH];
1355         SIMD_T::store_ps(reinterpret_cast<float*>(aZ), primVerts.z);
1356 
1357         // store render target array index
1358         const uint32_t* aRTAI = reinterpret_cast<const uint32_t*>(&rtIdx);
1359 
1360         uint32_t* pPrimID   = (uint32_t*)&primID;
1361         uint32_t  primIndex = 0;
1362 
1363         const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
1364 
1365         // scan remaining valid triangles and bin each separately
1366         while (_BitScanForward((unsigned long*)&primIndex, primMask))
1367         {
1368             uint32_t linkageCount     = backendState.numAttributes;
1369             uint32_t numScalarAttribs = linkageCount * 4;
1370 
1371             BE_WORK work;
1372             work.type = DRAW;
1373 
1374             TRIANGLE_WORK_DESC& desc = work.desc.tri;
1375 
1376             // points are always front facing
1377             desc.triFlags.frontFacing            = 1;
1378             desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1379             desc.triFlags.viewportIndex          = pViewportIndex[primIndex];
1380 
1381             work.pfnWork = RasterizeSimplePoint;
1382 
1383             auto pArena = pDC->pArena;
1384             SWR_ASSERT(pArena != nullptr);
1385 
1386             // store attributes
1387             float* pAttribs =
1388                 (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
1389             desc.pAttribs   = pAttribs;
1390             desc.numAttribs = linkageCount;
1391 
1392             pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);
1393 
1394             // store raster tile aligned x, y, perspective correct z
1395             float* pTriBuffer        = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1396             desc.pTriBuffer          = pTriBuffer;
1397             *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
1398             *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
1399             *pTriBuffer              = aZ[primIndex];
1400 
1401             uint32_t tX = aTileRelativeX[primIndex];
1402             uint32_t tY = aTileRelativeY[primIndex];
1403 
1404             // pack the relative x,y into the coverageMask, the rasterizer will
1405             // generate the true coverage mask from it
1406             work.desc.tri.triFlags.coverageMask = tX | (tY << 4);
1407 
1408             // bin it
1409             MacroTileMgr* pTileMgr = pDC->pTileMgr;
1410 #if KNOB_ENABLE_TOSS_POINTS
1411             if (!KNOB_TOSS_SETUP_TRIS)
1412 #endif
1413             {
1414                 pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
1415             }
1416 
1417             primMask &= ~(1 << primIndex);
1418         }
1419     }
1420     else
1421     {
1422         // non simple points need to be potentially binned to multiple macro tiles
1423         Float<SIMD_T> vPointSize;
1424 
1425         if (rastState.pointParam)
1426         {
1427             Vec4<SIMD_T> size[3];
1428             pa.Assemble(VERTEX_SGV_SLOT, size);
1429             vPointSize = size[0][VERTEX_SGV_POINT_SIZE_COMP];
1430         }
1431         else
1432         {
1433             vPointSize = SIMD_T::set1_ps(rastState.pointSize);
1434         }
1435 
1436         // bloat point to bbox
1437         SIMDBBOX_T<SIMD_T> bbox;
1438 
1439         bbox.xmin = bbox.xmax = vXi;
1440         bbox.ymin = bbox.ymax = vYi;
1441 
1442         Float<SIMD_T>   vHalfWidth  = SIMD_T::mul_ps(vPointSize, SIMD_T::set1_ps(0.5f));
1443         Integer<SIMD_T> vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
1444 
1445         bbox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi);
1446         bbox.xmax = SIMD_T::add_epi32(bbox.xmax, vHalfWidthi);
1447         bbox.ymin = SIMD_T::sub_epi32(bbox.ymin, vHalfWidthi);
1448         bbox.ymax = SIMD_T::add_epi32(bbox.ymax, vHalfWidthi);
1449 
1450         // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge
1451         // is exclusive. Gather the AOS effective scissor rects based on the per-prim VP index.
1452         /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
1453         {
1454             Integer<SIMD_T> scisXmin, scisYmin, scisXmax, scisYmax;
1455 
1456             if (pa.viewportArrayActive)
1457             {
1458                 GatherScissors(&state.scissorsInFixedPoint[0],
1459                                pViewportIndex,
1460                                scisXmin,
1461                                scisYmin,
1462                                scisXmax,
1463                                scisYmax);
1464             }
1465             else // broadcast fast path for non-VPAI case.
1466             {
1467                 scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
1468                 scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
1469                 scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
1470                 scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
1471             }
1472 
1473             bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
1474             bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
1475             bbox.xmax =
1476                 SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax);
1477             bbox.ymax =
1478                 SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax);
1479         }
1480 
1481         // Cull bloated points completely outside scissor
1482         Integer<SIMD_T> maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
1483         Integer<SIMD_T> maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
1484         Integer<SIMD_T> maskOutsideScissorXY =
1485             SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
1486         uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
1487         primMask                    = primMask & ~maskOutsideScissor;
1488 
1489         // Convert bbox to macrotile units.
1490         bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
1491         bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
1492         bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
1493         bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
1494 
1495         OSALIGNSIMD16(uint32_t)
1496         aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
1497 
1498         SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTLeft), bbox.xmin);
1499         SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTRight), bbox.xmax);
1500         SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTTop), bbox.ymin);
1501         SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTBottom), bbox.ymax);
1502 
1503         // store render target array index
1504         const uint32_t* aRTAI = reinterpret_cast<const uint32_t*>(&rtIdx);
1505 
1506         OSALIGNSIMD16(float) aPointSize[SIMD_WIDTH];
1507         SIMD_T::store_ps(reinterpret_cast<float*>(aPointSize), vPointSize);
1508 
1509         uint32_t* pPrimID = (uint32_t*)&primID;
1510 
1511         OSALIGNSIMD16(float) aPrimVertsX[SIMD_WIDTH];
1512         OSALIGNSIMD16(float) aPrimVertsY[SIMD_WIDTH];
1513         OSALIGNSIMD16(float) aPrimVertsZ[SIMD_WIDTH];
1514 
1515         SIMD_T::store_ps(reinterpret_cast<float*>(aPrimVertsX), primVerts.x);
1516         SIMD_T::store_ps(reinterpret_cast<float*>(aPrimVertsY), primVerts.y);
1517         SIMD_T::store_ps(reinterpret_cast<float*>(aPrimVertsZ), primVerts.z);
1518 
1519         // scan remaining valid prims and bin each separately
1520         const SWR_BACKEND_STATE& backendState = state.backendState;
1521         uint32_t                 primIndex;
1522         while (_BitScanForward((unsigned long*)&primIndex, primMask))
1523         {
1524             uint32_t linkageCount     = backendState.numAttributes;
1525             uint32_t numScalarAttribs = linkageCount * 4;
1526 
1527             BE_WORK work;
1528             work.type = DRAW;
1529 
1530             TRIANGLE_WORK_DESC& desc = work.desc.tri;
1531 
1532             desc.triFlags.frontFacing            = 1;
1533             desc.triFlags.pointSize              = aPointSize[primIndex];
1534             desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1535             desc.triFlags.viewportIndex          = pViewportIndex[primIndex];
1536 
1537             work.pfnWork = RasterizeTriPoint;
1538 
1539             auto pArena = pDC->pArena;
1540             SWR_ASSERT(pArena != nullptr);
1541 
1542             // store active attribs
1543             desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1544             desc.numAttribs = linkageCount;
1545             pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
1546 
1547             // store point vertex data
1548             float* pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1549             desc.pTriBuffer   = pTriBuffer;
1550             *pTriBuffer++     = aPrimVertsX[primIndex];
1551             *pTriBuffer++     = aPrimVertsY[primIndex];
1552             *pTriBuffer       = aPrimVertsZ[primIndex];
1553 
1554             // store user clip distances
1555             if (backendState.clipDistanceMask)
1556             {
1557                 uint32_t numClipDist = _mm_popcnt_u32(backendState.clipDistanceMask);
1558                 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
1559                 float dists[8];
1560                 float one = 1.0f;
1561                 ProcessUserClipDist<1>(backendState, pa, primIndex, &one, dists);
1562                 for (uint32_t i = 0; i < numClipDist; i++)
1563                 {
1564                     desc.pUserClipBuffer[3 * i + 0] = 0.0f;
1565                     desc.pUserClipBuffer[3 * i + 1] = 0.0f;
1566                     desc.pUserClipBuffer[3 * i + 2] = dists[i];
1567                 }
1568             }
1569 
1570             MacroTileMgr* pTileMgr = pDC->pTileMgr;
1571             for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
1572             {
1573                 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
1574                 {
1575 #if KNOB_ENABLE_TOSS_POINTS
1576                     if (!KNOB_TOSS_SETUP_TRIS)
1577 #endif
1578                     {
1579                         pTileMgr->enqueue(x, y, &work);
1580                     }
1581                 }
1582             }
1583 
1584             primMask &= ~(1 << primIndex);
1585         }
1586     }
1587 
1588     RDTSC_END(pDC->pContext->pBucketMgr, FEBinPoints, 1);
1589 }
1590 
1591 //////////////////////////////////////////////////////////////////////////
1592 /// @brief Bin SIMD points to the backend.  Only supports point size of 1
1593 /// @param pDC - pointer to draw context.
1594 /// @param pa - The primitive assembly object.
1595 /// @param workerId - thread's worker id. Even thread has a unique id.
1596 /// @param tri - Contains point position data for SIMDs worth of points.
1597 /// @param primID - Primitive ID for each point.
1598 template <typename SIMD_T, uint32_t SIMD_WIDTH>
BinPointsImpl(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,Vec4<SIMD_T> prim[3],uint32_t primMask,Integer<SIMD_T> const & primID,Integer<SIMD_T> const & viewportIdx,Integer<SIMD_T> const & rtIdx)1599 void BinPointsImpl(DRAW_CONTEXT*          pDC,
1600                    PA_STATE&              pa,
1601                    uint32_t               workerId,
1602                    Vec4<SIMD_T>           prim[3],
1603                    uint32_t               primMask,
1604                    Integer<SIMD_T> const& primID,
1605                    Integer<SIMD_T> const& viewportIdx,
1606                    Integer<SIMD_T> const& rtIdx)
1607 {
1608     const API_STATE&          state     = GetApiState(pDC);
1609     const SWR_FRONTEND_STATE& feState   = state.frontendState;
1610     const SWR_RASTSTATE&      rastState = state.rastState;
1611 
1612     if (!feState.vpTransformDisable)
1613     {
1614         // perspective divide
1615         Float<SIMD_T> vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w);
1616 
1617         prim[0].x = SIMD_T::mul_ps(prim[0].x, vRecipW0);
1618         prim[0].y = SIMD_T::mul_ps(prim[0].y, vRecipW0);
1619         prim[0].z = SIMD_T::mul_ps(prim[0].z, vRecipW0);
1620 
1621         // viewport transform to screen coords
1622         if (pa.viewportArrayActive)
1623         {
1624             viewportTransform<1>(prim, state.vpMatrices, viewportIdx);
1625         }
1626         else
1627         {
1628             viewportTransform<1>(prim, state.vpMatrices);
1629         }
1630     }
1631 
1632     Float<SIMD_T> offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
1633 
1634     prim[0].x = SIMD_T::add_ps(prim[0].x, offset);
1635     prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
1636 
1637     BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(
1638         pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx);
1639 }
1640 
BinPoints(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,simdvector prim[3],uint32_t primMask,simdscalari const & primID,simdscalari const & viewportIdx,simdscalari const & rtIdx)1641 void BinPoints(DRAW_CONTEXT*      pDC,
1642                PA_STATE&          pa,
1643                uint32_t           workerId,
1644                simdvector         prim[3],
1645                uint32_t           primMask,
1646                simdscalari const& primID,
1647                simdscalari const& viewportIdx,
1648                simdscalari const& rtIdx)
1649 {
1650     BinPointsImpl<SIMD256, KNOB_SIMD_WIDTH>(
1651         pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx);
1652 }
1653 
#if USE_SIMD16_FRONTEND
//////////////////////////////////////////////////////////////////////////
/// @brief Bin SIMD16 points to the backend. Only compiled when
///        USE_SIMD16_FRONTEND is set; forwards every argument unchanged to
///        the SIMD512 instantiation of BinPointsImpl.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id.
/// @param prim - Contains point position data for SIMDs worth of points.
/// @param primMask - Mask of the points to bin.
/// @param primID - Primitive ID for each point.
/// @param viewportIdx - Viewport Array Index for each point.
/// @param rtIdx - Render Target Array Index for each point.
void SIMDCALL BinPoints_simd16(DRAW_CONTEXT*        pDC,
                               PA_STATE&            pa,
                               uint32_t             workerId,
                               simd16vector         prim[3],
                               uint32_t             primMask,
                               simd16scalari const& primID,
                               simd16scalari const& viewportIdx,
                               simd16scalari const& rtIdx)
{
    BinPointsImpl<SIMD512, KNOB_SIMD16_WIDTH>(pDC,
                                              pa,
                                              workerId,
                                              prim,
                                              primMask,
                                              primID,
                                              viewportIdx,
                                              rtIdx);
}

#endif
1669 //////////////////////////////////////////////////////////////////////////
1670 /// @brief Bin SIMD lines to the backend.
1671 /// @param pDC - pointer to draw context.
1672 /// @param pa - The primitive assembly object.
1673 /// @param workerId - thread's worker id. Even thread has a unique id.
1674 /// @param tri - Contains line position data for SIMDs worth of points.
1675 /// @param primID - Primitive ID for each line.
1676 /// @param viewportIdx - Viewport Array Index for each line.
1677 template <typename SIMD_T, uint32_t SIMD_WIDTH>
BinPostSetupLinesImpl(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,Vec4<SIMD_T> prim[],Float<SIMD_T> recipW[],uint32_t primMask,Integer<SIMD_T> const & primID,Integer<SIMD_T> const & viewportIdx,Integer<SIMD_T> const & rtIdx)1678 void BinPostSetupLinesImpl(DRAW_CONTEXT*          pDC,
1679                            PA_STATE&              pa,
1680                            uint32_t               workerId,
1681                            Vec4<SIMD_T>           prim[],
1682                            Float<SIMD_T>          recipW[],
1683                            uint32_t               primMask,
1684                            Integer<SIMD_T> const& primID,
1685                            Integer<SIMD_T> const& viewportIdx,
1686                            Integer<SIMD_T> const& rtIdx)
1687 {
1688     const uint32_t* aRTAI = reinterpret_cast<const uint32_t*>(&rtIdx);
1689 
1690     RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEBinLines, pDC->drawId);
1691 
1692     const API_STATE&     state     = GetApiState(pDC);
1693     const SWR_RASTSTATE& rastState = state.rastState;
1694 
1695     // Select attribute processor
1696     PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(
1697         2, state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
1698 
1699     Float<SIMD_T>& vRecipW0 = recipW[0];
1700     Float<SIMD_T>& vRecipW1 = recipW[1];
1701 
1702     // convert to fixed point
1703     Integer<SIMD_T> vXi[2], vYi[2];
1704 
1705     vXi[0] = fpToFixedPointVertical<SIMD_T>(prim[0].x);
1706     vYi[0] = fpToFixedPointVertical<SIMD_T>(prim[0].y);
1707     vXi[1] = fpToFixedPointVertical<SIMD_T>(prim[1].x);
1708     vYi[1] = fpToFixedPointVertical<SIMD_T>(prim[1].y);
1709 
1710     // compute x-major vs y-major mask
1711     Integer<SIMD_T> xLength     = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vXi[0], vXi[1]));
1712     Integer<SIMD_T> yLength     = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vYi[0], vYi[1]));
1713     Float<SIMD_T>   vYmajorMask = SIMD_T::castsi_ps(SIMD_T::cmpgt_epi32(yLength, xLength));
1714     uint32_t        yMajorMask  = SIMD_T::movemask_ps(vYmajorMask);
1715 
1716     // cull zero-length lines
1717     Integer<SIMD_T> vZeroLengthMask = SIMD_T::cmpeq_epi32(xLength, SIMD_T::setzero_si());
1718     vZeroLengthMask =
1719         SIMD_T::and_si(vZeroLengthMask, SIMD_T::cmpeq_epi32(yLength, SIMD_T::setzero_si()));
1720 
1721     primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vZeroLengthMask));
1722 
1723     uint32_t*       pPrimID        = (uint32_t*)&primID;
1724     const uint32_t* pViewportIndex = (uint32_t*)&viewportIdx;
1725 
1726     // Calc bounding box of lines
1727     SIMDBBOX_T<SIMD_T> bbox;
1728     bbox.xmin = SIMD_T::min_epi32(vXi[0], vXi[1]);
1729     bbox.xmax = SIMD_T::max_epi32(vXi[0], vXi[1]);
1730     bbox.ymin = SIMD_T::min_epi32(vYi[0], vYi[1]);
1731     bbox.ymax = SIMD_T::max_epi32(vYi[0], vYi[1]);
1732 
1733     // bloat bbox by line width along minor axis
1734     Float<SIMD_T>   vHalfWidth  = SIMD_T::set1_ps(rastState.lineWidth / 2.0f);
1735     Integer<SIMD_T> vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
1736 
1737     SIMDBBOX_T<SIMD_T> bloatBox;
1738 
1739     bloatBox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi);
1740     bloatBox.xmax = SIMD_T::add_epi32(bbox.xmax, vHalfWidthi);
1741     bloatBox.ymin = SIMD_T::sub_epi32(bbox.ymin, vHalfWidthi);
1742     bloatBox.ymax = SIMD_T::add_epi32(bbox.ymax, vHalfWidthi);
1743 
1744     bbox.xmin = SIMD_T::blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask);
1745     bbox.xmax = SIMD_T::blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask);
1746     bbox.ymin = SIMD_T::blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask);
1747     bbox.ymax = SIMD_T::blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
1748 
1749     // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is
1750     // exclusive.
1751     {
1752         Integer<SIMD_T> scisXmin, scisYmin, scisXmax, scisYmax;
1753 
1754         if (pa.viewportArrayActive)
1755         {
1756             GatherScissors(&state.scissorsInFixedPoint[0],
1757                            pViewportIndex,
1758                            scisXmin,
1759                            scisYmin,
1760                            scisXmax,
1761                            scisYmax);
1762         }
1763         else // broadcast fast path for non-VPAI case.
1764         {
1765             scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
1766             scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
1767             scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
1768             scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
1769         }
1770 
1771         bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
1772         bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
1773         bbox.xmax =
1774             SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax);
1775         bbox.ymax =
1776             SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax);
1777     }
1778 
1779     // Cull prims completely outside scissor
1780     {
1781         Integer<SIMD_T> maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
1782         Integer<SIMD_T> maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
1783         Integer<SIMD_T> maskOutsideScissorXY =
1784             SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
1785         uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
1786         primMask                    = primMask & ~maskOutsideScissor;
1787     }
1788 
1789     // transpose verts needed for backend
1790     /// @todo modify BE to take non-transformed verts
1791     OSALIGNSIMD16(simd4scalar) vHorizX[SIMD_WIDTH];
1792     OSALIGNSIMD16(simd4scalar) vHorizY[SIMD_WIDTH];
1793     OSALIGNSIMD16(simd4scalar) vHorizZ[SIMD_WIDTH];
1794     OSALIGNSIMD16(simd4scalar) vHorizW[SIMD_WIDTH];
1795 
1796     if (!primMask)
1797     {
1798         goto endBinLines;
1799     }
1800 
1801     // Convert triangle bbox to macrotile units.
1802     bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
1803     bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
1804     bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
1805     bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
1806 
1807     OSALIGNSIMD16(uint32_t)
1808     aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
1809 
1810     SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTLeft), bbox.xmin);
1811     SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTRight), bbox.xmax);
1812     SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTTop), bbox.ymin);
1813     SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTBottom), bbox.ymax);
1814 
1815     TransposeVertices(vHorizX, prim[0].x, prim[1].x, SIMD_T::setzero_ps());
1816     TransposeVertices(vHorizY, prim[0].y, prim[1].y, SIMD_T::setzero_ps());
1817     TransposeVertices(vHorizZ, prim[0].z, prim[1].z, SIMD_T::setzero_ps());
1818     TransposeVertices(vHorizW, vRecipW0, vRecipW1, SIMD_T::setzero_ps());
1819 
1820     // scan remaining valid prims and bin each separately
1821     unsigned long primIndex;
1822     while (_BitScanForward(&primIndex, primMask))
1823     {
1824         uint32_t linkageCount     = state.backendState.numAttributes;
1825         uint32_t numScalarAttribs = linkageCount * 4;
1826 
1827         BE_WORK work;
1828         work.type = DRAW;
1829 
1830         TRIANGLE_WORK_DESC& desc = work.desc.tri;
1831 
1832         desc.triFlags.frontFacing            = 1;
1833         desc.triFlags.yMajor                 = (yMajorMask >> primIndex) & 1;
1834         desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1835         desc.triFlags.viewportIndex          = pViewportIndex[primIndex];
1836 
1837         work.pfnWork = RasterizeLine;
1838 
1839         auto pArena = pDC->pArena;
1840         SWR_ASSERT(pArena != nullptr);
1841 
1842         // store active attribs
1843         desc.pAttribs   = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1844         desc.numAttribs = linkageCount;
1845         pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
1846 
1847         // store line vertex data
1848         desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
1849 
1850         _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
1851         _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
1852         _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
1853         _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
1854 
1855         // store user clip distances
1856         if (state.backendState.clipDistanceMask)
1857         {
1858             uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
1859             desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
1860             ProcessUserClipDist<2>(
1861                 state.backendState, pa, primIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer);
1862         }
1863 
1864         MacroTileMgr* pTileMgr = pDC->pTileMgr;
1865         for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
1866         {
1867             for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
1868             {
1869 #if KNOB_ENABLE_TOSS_POINTS
1870                 if (!KNOB_TOSS_SETUP_TRIS)
1871 #endif
1872                 {
1873                     pTileMgr->enqueue(x, y, &work);
1874                 }
1875             }
1876         }
1877 
1878         primMask &= ~(1 << primIndex);
1879     }
1880 
1881 endBinLines:
1882 
1883     RDTSC_END(pDC->pContext->pBucketMgr, FEBinLines, 1);
1884 }
1885 
1886 //////////////////////////////////////////////////////////////////////////
1887 /// @brief Bin SIMD lines to the backend.
1888 /// @param pDC - pointer to draw context.
1889 /// @param pa - The primitive assembly object.
1890 /// @param workerId - thread's worker id. Even thread has a unique id.
1891 /// @param tri - Contains line position data for SIMDs worth of points.
1892 /// @param primID - Primitive ID for each line.
1893 /// @param viewportIdx - Viewport Array Index for each line.
1894 template <typename SIMD_T, uint32_t SIMD_WIDTH>
BinLinesImpl(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,Vec4<SIMD_T> prim[3],uint32_t primMask,Integer<SIMD_T> const & primID,Integer<SIMD_T> const & viewportIdx,Integer<SIMD_T> const & rtIdx)1895 void SIMDCALL BinLinesImpl(DRAW_CONTEXT*          pDC,
1896                            PA_STATE&              pa,
1897                            uint32_t               workerId,
1898                            Vec4<SIMD_T>           prim[3],
1899                            uint32_t               primMask,
1900                            Integer<SIMD_T> const& primID,
1901                            Integer<SIMD_T> const& viewportIdx,
1902                            Integer<SIMD_T> const& rtIdx)
1903 {
1904     const API_STATE&          state     = GetApiState(pDC);
1905     const SWR_RASTSTATE&      rastState = state.rastState;
1906     const SWR_FRONTEND_STATE& feState   = state.frontendState;
1907 
1908     Float<SIMD_T> vRecipW[2] = {SIMD_T::set1_ps(1.0f), SIMD_T::set1_ps(1.0f)};
1909 
1910     if (!feState.vpTransformDisable)
1911     {
1912         // perspective divide
1913         vRecipW[0] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w);
1914         vRecipW[1] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[1].w);
1915 
1916         prim[0].v[0] = SIMD_T::mul_ps(prim[0].v[0], vRecipW[0]);
1917         prim[1].v[0] = SIMD_T::mul_ps(prim[1].v[0], vRecipW[1]);
1918 
1919         prim[0].v[1] = SIMD_T::mul_ps(prim[0].v[1], vRecipW[0]);
1920         prim[1].v[1] = SIMD_T::mul_ps(prim[1].v[1], vRecipW[1]);
1921 
1922         prim[0].v[2] = SIMD_T::mul_ps(prim[0].v[2], vRecipW[0]);
1923         prim[1].v[2] = SIMD_T::mul_ps(prim[1].v[2], vRecipW[1]);
1924 
1925         // viewport transform to screen coords
1926         if (pa.viewportArrayActive)
1927         {
1928             viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
1929         }
1930         else
1931         {
1932             viewportTransform<2>(prim, state.vpMatrices);
1933         }
1934     }
1935 
1936     // adjust for pixel center location
1937     Float<SIMD_T> offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
1938 
1939     prim[0].x = SIMD_T::add_ps(prim[0].x, offset);
1940     prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
1941 
1942     prim[1].x = SIMD_T::add_ps(prim[1].x, offset);
1943     prim[1].y = SIMD_T::add_ps(prim[1].y, offset);
1944 
1945     BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(
1946         pDC, pa, workerId, prim, vRecipW, primMask, primID, viewportIdx, rtIdx);
1947 }
1948 
BinLines(DRAW_CONTEXT * pDC,PA_STATE & pa,uint32_t workerId,simdvector prim[],uint32_t primMask,simdscalari const & primID,simdscalari const & viewportIdx,simdscalari const & rtIdx)1949 void BinLines(DRAW_CONTEXT*      pDC,
1950               PA_STATE&          pa,
1951               uint32_t           workerId,
1952               simdvector         prim[],
1953               uint32_t           primMask,
1954               simdscalari const& primID,
1955               simdscalari const& viewportIdx,
1956               simdscalari const& rtIdx)
1957 {
1958     BinLinesImpl<SIMD256, KNOB_SIMD_WIDTH>(
1959         pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx);
1960 }
1961 
#if USE_SIMD16_FRONTEND
//////////////////////////////////////////////////////////////////////////
/// @brief Bin SIMD16 lines to the backend. Only compiled when
///        USE_SIMD16_FRONTEND is set; forwards every argument unchanged to
///        the SIMD512 instantiation of BinLinesImpl.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id.
/// @param prim - Contains line position data for SIMDs worth of lines.
/// @param primMask - Mask of the lines to bin.
/// @param primID - Primitive ID for each line.
/// @param viewportIdx - Viewport Array Index for each line.
/// @param rtIdx - Render Target Array Index for each line.
void SIMDCALL BinLines_simd16(DRAW_CONTEXT*        pDC,
                              PA_STATE&            pa,
                              uint32_t             workerId,
                              simd16vector         prim[3],
                              uint32_t             primMask,
                              simd16scalari const& primID,
                              simd16scalari const& viewportIdx,
                              simd16scalari const& rtIdx)
{
    BinLinesImpl<SIMD512, KNOB_SIMD16_WIDTH>(pDC,
                                             pa,
                                             workerId,
                                             prim,
                                             primMask,
                                             primID,
                                             viewportIdx,
                                             rtIdx);
}

#endif
1977