1 /****************************************************************************
2  * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * @file clip.h
24  *
25  * @brief Definitions for clipping
26  *
27  ******************************************************************************/
28 #pragma once
29 
30 #include "common/simdintrin.h"
31 #include "core/context.h"
32 #include "core/pa.h"
33 #include "rdtsc_core.h"
34 
// Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
#define CLIPCODE_SHIFT 23

/// @brief Per-vertex clip code bits.
///
/// Every frustum plane and the NEGW condition own a distinct bit at or above CLIPCODE_SHIFT.
/// The guardband codes share a single high bit and are told apart by four separate LSBs; that
/// is sufficient because guardband codes are combined as a union, rather than an intersection,
/// of clipcodes.
enum SWR_CLIPCODES
{
    // view frustum side planes
    FRUSTUM_LEFT   = (1 << (CLIPCODE_SHIFT + 0)),
    FRUSTUM_TOP    = (1 << (CLIPCODE_SHIFT + 1)),
    FRUSTUM_RIGHT  = (1 << (CLIPCODE_SHIFT + 2)),
    FRUSTUM_BOTTOM = (1 << (CLIPCODE_SHIFT + 3)),

    // view frustum depth planes
    FRUSTUM_NEAR = (1 << (CLIPCODE_SHIFT + 4)),
    FRUSTUM_FAR  = (1 << (CLIPCODE_SHIFT + 5)),

    // w <= 0
    NEGW = (1 << (CLIPCODE_SHIFT + 6)),

    // guardband planes: common high bit, one low bit per plane
    GUARDBAND_LEFT   = ((1 << (CLIPCODE_SHIFT + 7)) | 0x1),
    GUARDBAND_TOP    = ((1 << (CLIPCODE_SHIFT + 7)) | 0x2),
    GUARDBAND_RIGHT  = ((1 << (CLIPCODE_SHIFT + 7)) | 0x4),
    GUARDBAND_BOTTOM = ((1 << (CLIPCODE_SHIFT + 7)) | 0x8)
};
56 
// Clip codes tested by Clipper::ComputeClipMask(): the four guardband planes, the two depth
// planes and NEGW.
#define GUARDBAND_CLIP_MASK                                                        \
    (GUARDBAND_LEFT | GUARDBAND_TOP | GUARDBAND_RIGHT | GUARDBAND_BOTTOM | NEGW | \
     FRUSTUM_NEAR | FRUSTUM_FAR)

// Clip codes tested by Clipper::ExecuteStage() when culling against the view frustum:
// the six frustum planes only.
#define FRUSTUM_CLIP_MASK \
    (FRUSTUM_LEFT | FRUSTUM_TOP | FRUSTUM_RIGHT | FRUSTUM_BOTTOM | FRUSTUM_NEAR | FRUSTUM_FAR)
62 
63 template <typename SIMD_T>
ComputeClipCodes(const API_STATE & state,const Vec4<SIMD_T> & vertex,Float<SIMD_T> & clipCodes,Integer<SIMD_T> const & viewportIndexes)64 void ComputeClipCodes(const API_STATE&       state,
65                       const Vec4<SIMD_T>&    vertex,
66                       Float<SIMD_T>&         clipCodes,
67                       Integer<SIMD_T> const& viewportIndexes)
68 {
69     clipCodes = SIMD_T::setzero_ps();
70 
71     // -w
72     Float<SIMD_T> vNegW = SIMD_T::mul_ps(vertex.w, SIMD_T::set1_ps(-1.0f));
73 
74     // FRUSTUM_LEFT
75     Float<SIMD_T> vRes = SIMD_T::cmplt_ps(vertex.x, vNegW);
76     clipCodes          = SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_LEFT)));
77 
78     // FRUSTUM_TOP
79     vRes      = SIMD_T::cmplt_ps(vertex.y, vNegW);
80     clipCodes = SIMD_T::or_ps(
81         clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_TOP))));
82 
83     // FRUSTUM_RIGHT
84     vRes      = SIMD_T::cmpgt_ps(vertex.x, vertex.w);
85     clipCodes = SIMD_T::or_ps(
86         clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_RIGHT))));
87 
88     // FRUSTUM_BOTTOM
89     vRes      = SIMD_T::cmpgt_ps(vertex.y, vertex.w);
90     clipCodes = SIMD_T::or_ps(
91         clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_BOTTOM))));
92 
93     if (state.rastState.depthClipEnable)
94     {
95         // FRUSTUM_NEAR
96         // DX clips depth [0..w], GL clips [-w..w]
97         if (state.rastState.clipHalfZ)
98         {
99             vRes = SIMD_T::cmplt_ps(vertex.z, SIMD_T::setzero_ps());
100         }
101         else
102         {
103             vRes = SIMD_T::cmplt_ps(vertex.z, vNegW);
104         }
105         clipCodes = SIMD_T::or_ps(
106             clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_NEAR))));
107 
108         // FRUSTUM_FAR
109         vRes      = SIMD_T::cmpgt_ps(vertex.z, vertex.w);
110         clipCodes = SIMD_T::or_ps(
111             clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_FAR))));
112     }
113 
114     // NEGW
115     vRes = SIMD_T::cmple_ps(vertex.w, SIMD_T::setzero_ps());
116     clipCodes =
117         SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(NEGW))));
118 
119     // GUARDBAND_LEFT
120     Float<SIMD_T> gbMult = SIMD_T::mul_ps(vNegW,
121                                           SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(
122                                               &state.gbState.left[0], viewportIndexes));
123     vRes                 = SIMD_T::cmplt_ps(vertex.x, gbMult);
124     clipCodes            = SIMD_T::or_ps(
125         clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_LEFT))));
126 
127     // GUARDBAND_TOP
128     gbMult    = SIMD_T::mul_ps(vNegW,
129                             SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(
130                                 &state.gbState.top[0], viewportIndexes));
131     vRes      = SIMD_T::cmplt_ps(vertex.y, gbMult);
132     clipCodes = SIMD_T::or_ps(
133         clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_TOP))));
134 
135     // GUARDBAND_RIGHT
136     gbMult    = SIMD_T::mul_ps(vertex.w,
137                             SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(
138                                 &state.gbState.right[0], viewportIndexes));
139     vRes      = SIMD_T::cmpgt_ps(vertex.x, gbMult);
140     clipCodes = SIMD_T::or_ps(
141         clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_RIGHT))));
142 
143     // GUARDBAND_BOTTOM
144     gbMult    = SIMD_T::mul_ps(vertex.w,
145                             SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(
146                                 &state.gbState.bottom[0], viewportIndexes));
147     vRes      = SIMD_T::cmpgt_ps(vertex.y, gbMult);
148     clipCodes = SIMD_T::or_ps(
149         clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_BOTTOM))));
150 }
151 
/// @brief Primary template, intentionally empty. The members live in the explicit
///        specializations for the concrete SIMD types.
template <typename SIMD_T>
struct BinnerChooser {};
156 
157 template <>
158 struct BinnerChooser<SIMD256>
159 {
160     PFN_PROCESS_PRIMS pfnBinFunc;
161 
162     BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
163         :
164         pfnBinFunc(nullptr)
165     {
166         if (numVertsPerPrim == 3)
167         {
168             pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0);
169 
170         }
171         else if (numVertsPerPrim == 2)
172         {
173             pfnBinFunc = BinLines;
174         }
175         else
176         {
177             SWR_ASSERT(0 && "Unexpected points in clipper.");
178         }
179     }
180 
181     BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
182         :
183         pfnBinFunc(nullptr)
184     {
185         switch (topology)
186         {
187         case TOP_POINT_LIST:
188             pfnBinFunc = BinPoints;
189             break;
190         case TOP_LINE_LIST:
191         case TOP_LINE_STRIP:
192         case TOP_LINE_LOOP:
193         case TOP_LINE_LIST_ADJ:
194         case TOP_LISTSTRIP_ADJ:
195             pfnBinFunc = BinLines;
196             break;
197         default:
198             pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0);
199             break;
200         };
201     }
202 
203     void BinFunc(DRAW_CONTEXT*           pDC,
204                  PA_STATE&               pa,
205                  uint32_t                workerId,
206                  SIMD256::Vec4           prims[],
207                  uint32_t                primMask,
208                  SIMD256::Integer const& primID,
209                  SIMD256::Integer&       viewportIdx,
210                  SIMD256::Integer&       rtIdx)
211     {
212         SWR_ASSERT(pfnBinFunc != nullptr);
213 
214         pfnBinFunc(pDC, pa, workerId, prims, primMask, primID, viewportIdx, rtIdx);
215     }
216 };
217 
#if USE_SIMD16_FRONTEND
/// @brief Picks the binning function used for SIMD512 primitives and forwards calls to it.
template <>
struct BinnerChooser<SIMD512>
{
    PFN_PROCESS_PRIMS_SIMD16 pfnBinFunc;

    /// @brief Selects the binner from the number of vertices per primitive.
    /// @param numVertsPerPrim - 3 selects the triangle binner, 2 the line binner; any other
    ///                          value asserts and leaves pfnBinFunc null.
    /// @param conservativeRast - non-zero requests the conservative triangle binner.
    BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast) : pfnBinFunc(nullptr)
    {
        switch (numVertsPerPrim)
        {
        case 3:
            pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0);
            break;
        case 2:
            pfnBinFunc = BinLines_simd16;
            break;
        default:
            SWR_ASSERT(0 && "Unexpected points in clipper.");
            break;
        }
    }

    /// @brief Selects the binner from the primitive topology.
    /// @param topology - point list selects the point binner, the line topologies select the
    ///                   line binner, everything else selects the triangle binner.
    /// @param conservativeRast - non-zero requests the conservative triangle binner.
    BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast) : pfnBinFunc(nullptr)
    {
        const bool isLineTopology =
            (topology == TOP_LINE_LIST) || (topology == TOP_LINE_STRIP) ||
            (topology == TOP_LINE_LOOP) || (topology == TOP_LINE_LIST_ADJ) ||
            (topology == TOP_LISTSTRIP_ADJ);

        if (topology == TOP_POINT_LIST)
        {
            pfnBinFunc = BinPoints_simd16;
        }
        else if (isLineTopology)
        {
            pfnBinFunc = BinLines_simd16;
        }
        else
        {
            pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0);
        }
    }

    /// @brief Invokes the selected binning function with the given arguments, unchanged.
    void BinFunc(DRAW_CONTEXT*           pDC,
                 PA_STATE&               pa,
                 uint32_t                workerId,
                 SIMD512::Vec4           prims[],
                 uint32_t                primMask,
                 SIMD512::Integer const& primID,
                 SIMD512::Integer&       viewportIdx,
                 SIMD512::Integer&       rtIdx)
    {
        // a null pointer means the constructor hit its "unexpected" branch
        SWR_ASSERT(pfnBinFunc != nullptr);

        pfnBinFunc(pDC, pa, workerId, prims, primMask, primID, viewportIdx, rtIdx);
    }
};

#endif
/// @brief Primary template, intentionally empty. The helpers live in the explicit
///        specializations for the concrete SIMD types.
template <typename SIMD_T>
struct SimdHelper {};
285 
286 template <>
287 struct SimdHelper<SIMD256>
288 {
289     static SIMD256::Float insert_lo_ps(SIMD256::Float a) { return a; }
290 
291     static SIMD256::Mask cmpeq_ps_mask(SIMD256::Float a, SIMD256::Float b)
292     {
293         return SIMD256::movemask_ps(SIMD256::cmpeq_ps(a, b));
294     }
295 };
296 
#if USE_SIMD16_FRONTEND
/// @brief SIMD512 flavour of the width-specific helpers used by the clipper.
template <>
struct SimdHelper<SIMD512>
{
    /// @brief Places a SIMD256 value at insert position 0 of an otherwise zeroed SIMD512 value.
    static SIMD512::Float insert_lo_ps(SIMD256::Float a)
    {
        const SIMD512::Float vZero = SIMD512::setzero_ps();
        return SIMD512::insert_ps<0>(vZero, a);
    }

    /// @brief Compares a and b with the EQ_OQ predicate and returns the result as a lane mask.
    static SIMD512::Mask cmpeq_ps_mask(SIMD512::Float a, SIMD512::Float b)
    {
        return SIMD512::cmp_ps_mask<SIMD16::CompareType::EQ_OQ>(a, b);
    }
};
#endif
312 
313 template <typename SIMD_T, uint32_t NumVertsPerPrimT>
314 class Clipper
315 {
316 public:
317     INLINE Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) :
318         workerId(in_workerId), pDC(in_pDC), state(GetApiState(in_pDC))
319     {
320         static_assert(NumVertsPerPrimT >= 1 && NumVertsPerPrimT <= 3, "Invalid NumVertsPerPrim");
321         THREAD_DATA &thread_data = in_pDC->pContext->threadPool.pThreadData[workerId];
322 
323         if (thread_data.clipperData == nullptr)
324         {
325             // 7 vertex temp data
326             // 7 post-clipped vertices
327             // 2 transposed verts for binning
328             size_t alloc_size = sizeof(SIMDVERTEX_T<SIMD_T>) * (7 + 7 + 2);
329             thread_data.clipperData = AlignedMalloc(alloc_size, KNOB_SIMD16_BYTES);
330         }
331         SWR_ASSERT(thread_data.clipperData);
332 
333         this->clippedVerts = (SIMDVERTEX_T<SIMD_T>*)thread_data.clipperData;
334         this->tmpVerts = this->clippedVerts + 7;
335         this->transposedVerts = this->tmpVerts + 7;
336     }
337 
338     void ComputeClipCodes(Vec4<SIMD_T> vertex[], const Integer<SIMD_T>& viewportIndexes)
339     {
340         for (uint32_t i = 0; i < NumVertsPerPrimT; ++i)
341         {
342             ::ComputeClipCodes<SIMD_T>(state, vertex[i], clipCodes[i], viewportIndexes);
343         }
344     }
345 
346     Float<SIMD_T> ComputeClipCodeIntersection()
347     {
348         Float<SIMD_T> result = clipCodes[0];
349 
350         for (uint32_t i = 1; i < NumVertsPerPrimT; ++i)
351         {
352             result = SIMD_T::and_ps(result, clipCodes[i]);
353         }
354 
355         return result;
356     }
357 
358     Float<SIMD_T> ComputeClipCodeUnion()
359     {
360         Float<SIMD_T> result = clipCodes[0];
361 
362         for (uint32_t i = 1; i < NumVertsPerPrimT; ++i)
363         {
364             result = SIMD_T::or_ps(result, clipCodes[i]);
365         }
366 
367         return result;
368     }
369 
370     int ComputeClipMask()
371     {
372         Float<SIMD_T> clipUnion = ComputeClipCodeUnion();
373 
374         clipUnion =
375             SIMD_T::and_ps(clipUnion, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_CLIP_MASK)));
376 
377         return SIMD_T::movemask_ps(SIMD_T::cmpneq_ps(clipUnion, SIMD_T::setzero_ps()));
378     }
379 
380     // clipper is responsible for culling any prims with NAN coordinates
381     int ComputeNaNMask(Vec4<SIMD_T> prim[])
382     {
383         Float<SIMD_T> vNanMask = SIMD_T::setzero_ps();
384 
385         for (uint32_t e = 0; e < NumVertsPerPrimT; ++e)
386         {
387             Float<SIMD_T> vNan01 =
388                 SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[0], prim[e].v[1]);
389             vNanMask = SIMD_T::or_ps(vNanMask, vNan01);
390 
391             Float<SIMD_T> vNan23 =
392                 SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[2], prim[e].v[3]);
393             vNanMask = SIMD_T::or_ps(vNanMask, vNan23);
394         }
395 
396         return SIMD_T::movemask_ps(vNanMask);
397     }
398 
399     int ComputeUserClipCullMask(PA_STATE& pa, Vec4<SIMD_T> prim[])
400     {
401         uint8_t  cullMask             = state.backendState.cullDistanceMask;
402         uint32_t vertexClipCullOffset = state.backendState.vertexClipCullOffset;
403 
404         Float<SIMD_T> vClipCullMask = SIMD_T::setzero_ps();
405 
406         Vec4<SIMD_T> vClipCullDistLo[3];
407         Vec4<SIMD_T> vClipCullDistHi[3];
408 
409         pa.Assemble(vertexClipCullOffset, vClipCullDistLo);
410         pa.Assemble(vertexClipCullOffset + 1, vClipCullDistHi);
411 
412         unsigned long index;
413         while (_BitScanForward(&index, cullMask))
414         {
415             cullMask &= ~(1 << index);
416             uint32_t slot      = index >> 2;
417             uint32_t component = index & 0x3;
418 
419             Float<SIMD_T> vCullMaskElem = SIMD_T::set1_ps(-1.0f);
420             for (uint32_t e = 0; e < NumVertsPerPrimT; ++e)
421             {
422                 Float<SIMD_T> vCullComp;
423                 if (slot == 0)
424                 {
425                     vCullComp = vClipCullDistLo[e][component];
426                 }
427                 else
428                 {
429                     vCullComp = vClipCullDistHi[e][component];
430                 }
431 
432                 // cull if cull distance < 0 || NAN
433                 Float<SIMD_T> vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(
434                     SIMD_T::setzero_ps(), vCullComp);
435                 vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull);
436             }
437             vClipCullMask = SIMD_T::or_ps(vClipCullMask, vCullMaskElem);
438         }
439 
440         // clipper should also discard any primitive with NAN clip distance
441         uint8_t clipMask = state.backendState.clipDistanceMask;
442         while (_BitScanForward(&index, clipMask))
443         {
444             clipMask &= ~(1 << index);
445             uint32_t slot      = index >> 2;
446             uint32_t component = index & 0x3;
447 
448             Float<SIMD_T> vCullMaskElem = SIMD_T::set1_ps(-1.0f);
449             for (uint32_t e = 0; e < NumVertsPerPrimT; ++e)
450             {
451                 Float<SIMD_T> vClipComp;
452                 if (slot == 0)
453                 {
454                     vClipComp = vClipCullDistLo[e][component];
455                 }
456                 else
457                 {
458                     vClipComp = vClipCullDistHi[e][component];
459                 }
460 
461                 Float<SIMD_T> vClip =
462                     SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(vClipComp, vClipComp);
463                 Float<SIMD_T> vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(
464                     SIMD_T::setzero_ps(), vClipComp);
465                 vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull);
466                 vClipCullMask = SIMD_T::or_ps(vClipCullMask, vClip);
467             }
468             vClipCullMask = SIMD_T::or_ps(vClipCullMask, vCullMaskElem);
469         }
470 
471         return SIMD_T::movemask_ps(vClipCullMask);
472     }
473 
474     void ClipSimd(const Vec4<SIMD_T>     prim[],
475                   const Float<SIMD_T>&   vPrimMask,
476                   const Float<SIMD_T>&   vClipMask,
477                   PA_STATE&              pa,
478                   const Integer<SIMD_T>& vPrimId,
479                   const Integer<SIMD_T>& vViewportIdx,
480                   const Integer<SIMD_T>& vRtIdx)
481     {
482         // input/output vertex store for clipper
483         SIMDVERTEX_T<SIMD_T>* vertices = this->clippedVerts;
484 
485         uint32_t constantInterpMask = state.backendState.constantInterpolationMask;
486         uint32_t provokingVertex    = 0;
487         if (pa.binTopology == TOP_TRIANGLE_FAN)
488         {
489             provokingVertex = state.frontendState.provokingVertex.triFan;
490         }
491         ///@todo: line topology for wireframe?
492 
493         // assemble pos
494         Vec4<SIMD_T> tmpVector[NumVertsPerPrimT];
495         for (uint32_t i = 0; i < NumVertsPerPrimT; ++i)
496         {
497             vertices[i].attrib[VERTEX_POSITION_SLOT] = prim[i];
498         }
499 
500         // assemble attribs
501         const SWR_BACKEND_STATE& backendState = state.backendState;
502 
503         int32_t maxSlot = -1;
504         for (uint32_t slot = 0; slot < backendState.numAttributes; ++slot)
505         {
506             // Compute absolute attrib slot in vertex array
507             uint32_t mapSlot =
508                 backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot;
509             maxSlot            = std::max<int32_t>(maxSlot, mapSlot);
510             uint32_t inputSlot = backendState.vertexAttribOffset + mapSlot;
511 
512             pa.Assemble(inputSlot, tmpVector);
513 
514             // if constant interpolation enabled for this attribute, assign the provoking
515             // vertex values to all edges
516             if (CheckBit(constantInterpMask, slot))
517             {
518                 for (uint32_t i = 0; i < NumVertsPerPrimT; ++i)
519                 {
520                     vertices[i].attrib[inputSlot] = tmpVector[provokingVertex];
521                 }
522             }
523             else
524             {
525                 for (uint32_t i = 0; i < NumVertsPerPrimT; ++i)
526                 {
527                     vertices[i].attrib[inputSlot] = tmpVector[i];
528                 }
529             }
530         }
531 
532         // assemble user clip distances if enabled
533         uint32_t vertexClipCullSlot = state.backendState.vertexClipCullOffset;
534         if (state.backendState.clipDistanceMask & 0xf)
535         {
536             pa.Assemble(vertexClipCullSlot, tmpVector);
537             for (uint32_t i = 0; i < NumVertsPerPrimT; ++i)
538             {
539                 vertices[i].attrib[vertexClipCullSlot] = tmpVector[i];
540             }
541         }
542 
543         if (state.backendState.clipDistanceMask & 0xf0)
544         {
545             pa.Assemble(vertexClipCullSlot + 1, tmpVector);
546             for (uint32_t i = 0; i < NumVertsPerPrimT; ++i)
547             {
548                 vertices[i].attrib[vertexClipCullSlot + 1] = tmpVector[i];
549             }
550         }
551 
552         uint32_t numAttribs = maxSlot + 1;
553 
554         Integer<SIMD_T> vNumClippedVerts =
555             ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
556 
557         BinnerChooser<SIMD_T> binner(NumVertsPerPrimT,
558                                      pa.pDC->pState->state.rastState.conservativeRast);
559 
560         // set up new PA for binning clipped primitives
561         PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN;
562         if (NumVertsPerPrimT == 3)
563         {
564             clipTopology = TOP_TRIANGLE_FAN;
565 
566             // so that the binner knows to bloat wide points later
567             if (pa.binTopology == TOP_POINT_LIST)
568             {
569                 clipTopology = TOP_POINT_LIST;
570             }
571             else if (pa.binTopology == TOP_RECT_LIST)
572             {
573                 clipTopology = TOP_RECT_LIST;
574             }
575         }
576         else if (NumVertsPerPrimT == 2)
577         {
578             clipTopology = TOP_LINE_LIST;
579         }
580         else
581         {
582             SWR_ASSERT(0 && "Unexpected points in clipper.");
583         }
584 
585         const uint32_t* pVertexCount = reinterpret_cast<const uint32_t*>(&vNumClippedVerts);
586         const uint32_t* pPrimitiveId = reinterpret_cast<const uint32_t*>(&vPrimId);
587         const uint32_t* pViewportIdx = reinterpret_cast<const uint32_t*>(&vViewportIdx);
588         const uint32_t* pRtIdx       = reinterpret_cast<const uint32_t*>(&vRtIdx);
589 
590         const SIMD256::Integer vOffsets =
591             SIMD256::set_epi32(0 * sizeof(SIMDVERTEX_T<SIMD_T>), // unused lane
592                                6 * sizeof(SIMDVERTEX_T<SIMD_T>),
593                                5 * sizeof(SIMDVERTEX_T<SIMD_T>),
594                                4 * sizeof(SIMDVERTEX_T<SIMD_T>),
595                                3 * sizeof(SIMDVERTEX_T<SIMD_T>),
596                                2 * sizeof(SIMDVERTEX_T<SIMD_T>),
597                                1 * sizeof(SIMDVERTEX_T<SIMD_T>),
598                                0 * sizeof(SIMDVERTEX_T<SIMD_T>));
599 
600         // only need to gather 7 verts
601         // @todo dynamic mask based on actual # of verts generated per lane
602         const SIMD256::Float vMask = SIMD256::set_ps(0, -1, -1, -1, -1, -1, -1, -1);
603 
604         uint32_t numClippedPrims = 0;
605 
606         // transpose clipper output so that each lane's vertices are in SIMD order
607         // set aside space for 2 vertices, as the PA will try to read up to 16 verts
608         // for triangle fan
609         SIMDVERTEX_T<SIMD_T>*  transposedPrims = this->transposedVerts;
610 
611         uint32_t              numInputPrims = pa.NumPrims();
612         for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
613         {
614             uint32_t numEmittedVerts = pVertexCount[inputPrim];
615             if (numEmittedVerts < NumVertsPerPrimT)
616             {
617                 continue;
618             }
619             SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper.");
620 
621             uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts);
622             SWR_ASSERT(numEmittedPrims <= 7, "Unexpected primitive count from clipper.");
623 
624             numClippedPrims += numEmittedPrims;
625 
626             // tranpose clipper output so that each lane's vertices are in SIMD order
627             // set aside space for 2 vertices, as the PA will try to read up to 16 verts
628             // for triangle fan
629 
630             // transpose pos
631             float const* pBase =
632                 reinterpret_cast<float const*>(&vertices[0].attrib[VERTEX_POSITION_SLOT]) +
633                 inputPrim;
634 
635             for (uint32_t c = 0; c < 4; ++c)
636             {
637                 SIMD256::Float temp =
638                     SIMD256::mask_i32gather_ps(SIMD256::setzero_ps(), pBase, vOffsets, vMask);
639                 transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] =
640                     SimdHelper<SIMD_T>::insert_lo_ps(temp);
641                 pBase = PtrAdd(pBase, sizeof(Float<SIMD_T>));
642             }
643 
644             // transpose attribs
645             pBase = reinterpret_cast<float const*>(
646                         &vertices[0].attrib[backendState.vertexAttribOffset]) +
647                     inputPrim;
648 
649             for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
650             {
651                 uint32_t attribSlot = backendState.vertexAttribOffset + attrib;
652 
653                 for (uint32_t c = 0; c < 4; ++c)
654                 {
655                     SIMD256::Float temp =
656                         SIMD256::mask_i32gather_ps(SIMD256::setzero_ps(), pBase, vOffsets, vMask);
657                     transposedPrims[0].attrib[attribSlot][c] =
658                         SimdHelper<SIMD_T>::insert_lo_ps(temp);
659                     pBase = PtrAdd(pBase, sizeof(Float<SIMD_T>));
660                 }
661             }
662 
663             // transpose user clip distances if enabled
664             uint32_t vertexClipCullSlot = backendState.vertexClipCullOffset;
665             if (state.backendState.clipDistanceMask & 0x0f)
666             {
667                 pBase = reinterpret_cast<float const*>(&vertices[0].attrib[vertexClipCullSlot]) +
668                         inputPrim;
669 
670                 for (uint32_t c = 0; c < 4; ++c)
671                 {
672                     SIMD256::Float temp =
673                         SIMD256::mask_i32gather_ps(SIMD256::setzero_ps(), pBase, vOffsets, vMask);
674                     transposedPrims[0].attrib[vertexClipCullSlot][c] =
675                         SimdHelper<SIMD_T>::insert_lo_ps(temp);
676                     pBase = PtrAdd(pBase, sizeof(Float<SIMD_T>));
677                 }
678             }
679 
680             if (state.backendState.clipDistanceMask & 0xf0)
681             {
682                 pBase =
683                     reinterpret_cast<float const*>(&vertices[0].attrib[vertexClipCullSlot + 1]) +
684                     inputPrim;
685 
686                 for (uint32_t c = 0; c < 4; ++c)
687                 {
688                     SIMD256::Float temp =
689                         SIMD256::mask_i32gather_ps(SIMD256::setzero_ps(), pBase, vOffsets, vMask);
690                     transposedPrims[0].attrib[vertexClipCullSlot + 1][c] =
691                         SimdHelper<SIMD_T>::insert_lo_ps(temp);
692                     pBase = PtrAdd(pBase, sizeof(Float<SIMD_T>));
693                 }
694             }
695 
696             PA_STATE_OPT clipPA(pDC,
697                                 numEmittedPrims,
698                                 reinterpret_cast<uint8_t*>(&transposedPrims[0]),
699                                 numEmittedVerts,
700                                 SWR_VTX_NUM_SLOTS,
701                                 true,
702                                 NumVertsPerPrimT,
703                                 clipTopology);
704             clipPA.viewportArrayActive = pa.viewportArrayActive;
705             clipPA.rtArrayActive       = pa.rtArrayActive;
706 
707             static const uint32_t primMaskMap[] = {0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f};
708 
709             const uint32_t primMask = primMaskMap[numEmittedPrims];
710 
711             const Integer<SIMD_T> primID      = SIMD_T::set1_epi32(pPrimitiveId[inputPrim]);
712             const Integer<SIMD_T> viewportIdx = SIMD_T::set1_epi32(pViewportIdx[inputPrim]);
713             const Integer<SIMD_T> rtIdx       = SIMD_T::set1_epi32(pRtIdx[inputPrim]);
714 
715             while (clipPA.GetNextStreamOutput())
716             {
717                 do
718                 {
719                     Vec4<SIMD_T> attrib[NumVertsPerPrimT];
720 
721                     bool assemble = clipPA.Assemble(VERTEX_POSITION_SLOT, attrib);
722 
723                     if (assemble)
724                     {
725                         binner.pfnBinFunc(
726                             pDC, clipPA, workerId, attrib, primMask, primID, viewportIdx, rtIdx);
727                     }
728 
729                 } while (clipPA.NextPrim());
730             }
731         }
732 
733         // update global pipeline stat
734         UPDATE_STAT_FE(CPrimitives, numClippedPrims);
735     }
736 
737     void ExecuteStage(PA_STATE&              pa,
738                       Vec4<SIMD_T>           prim[],
739                       uint32_t               primMask,
740                       Integer<SIMD_T> const& primId,
741                       Integer<SIMD_T> const& viewportIdx,
742                       Integer<SIMD_T> const& rtIdx)
743     {
744         SWR_ASSERT(pa.pDC != nullptr);
745 
746         BinnerChooser<SIMD_T> binner(pa.binTopology,
747                                      pa.pDC->pState->state.rastState.conservativeRast);
748 
749         // update clipper invocations pipeline stat
750         uint32_t numInvoc = _mm_popcnt_u32(primMask);
751         UPDATE_STAT_FE(CInvocations, numInvoc);
752 
753         ComputeClipCodes(prim, viewportIdx);
754 
755         // cull prims with NAN coords
756         primMask &= ~ComputeNaNMask(prim);
757 
758         // user cull distance cull
759         if (state.backendState.cullDistanceMask | state.backendState.clipDistanceMask)
760         {
761             primMask &= ~ComputeUserClipCullMask(pa, prim);
762         }
763 
764         Float<SIMD_T> clipIntersection = ComputeClipCodeIntersection();
765         // Mask out non-frustum codes
766         clipIntersection = SIMD_T::and_ps(clipIntersection,
767                                           SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_CLIP_MASK)));
768 
769         // cull prims outside view frustum
770         int validMask =
771             primMask & SimdHelper<SIMD_T>::cmpeq_ps_mask(clipIntersection, SIMD_T::setzero_ps());
772 
773         // skip clipping for points
774         uint32_t clipMask = 0;
775         if (NumVertsPerPrimT != 1)
776         {
777             clipMask = validMask & ComputeClipMask();
778         }
779 
780         AR_EVENT(ClipInfoEvent(numInvoc, validMask, clipMask));
781 
782         if (clipMask)
783         {
784             RDTSC_BEGIN(pa.pDC->pContext->pBucketMgr, FEGuardbandClip, pa.pDC->drawId);
785             // we have to clip tris, execute the clipper, which will also
786             // call the binner
787             ClipSimd(prim,
788                      SIMD_T::vmask_ps(validMask),
789                      SIMD_T::vmask_ps(clipMask),
790                      pa,
791                      primId,
792                      viewportIdx,
793                      rtIdx);
794             RDTSC_END(pa.pDC->pContext->pBucketMgr, FEGuardbandClip, 1);
795         }
796         else if (validMask)
797         {
798             // update CPrimitives pipeline state
799             UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));
800 
801             // forward valid prims directly to binner
802             binner.pfnBinFunc(
803                 this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx, rtIdx);
804         }
805     }
806 
807 private:
808     Float<SIMD_T> ComputeInterpFactor(Float<SIMD_T> const& boundaryCoord0,
809                                       Float<SIMD_T> const& boundaryCoord1)
810     {
811         return SIMD_T::div_ps(boundaryCoord0, SIMD_T::sub_ps(boundaryCoord0, boundaryCoord1));
812     }
813 
814     Integer<SIMD_T>
815     ComputeOffsets(uint32_t attrib, Integer<SIMD_T> const& vIndices, uint32_t component)
816     {
817         const uint32_t simdVertexStride = sizeof(SIMDVERTEX_T<SIMD_T>);
818         const uint32_t componentStride  = sizeof(Float<SIMD_T>);
819         const uint32_t attribStride     = sizeof(Vec4<SIMD_T>);
820 
821         static const OSALIGNSIMD16(uint32_t) elemOffset[16] = {
822             0 * sizeof(float),
823             1 * sizeof(float),
824             2 * sizeof(float),
825             3 * sizeof(float),
826             4 * sizeof(float),
827             5 * sizeof(float),
828             6 * sizeof(float),
829             7 * sizeof(float),
830             8 * sizeof(float),
831             9 * sizeof(float),
832             10 * sizeof(float),
833             11 * sizeof(float),
834             12 * sizeof(float),
835             13 * sizeof(float),
836             14 * sizeof(float),
837             15 * sizeof(float),
838         };
839 
840         static_assert(sizeof(Integer<SIMD_T>) <= sizeof(elemOffset),
841                       "Clipper::ComputeOffsets, Increase number of element offsets.");
842 
843         Integer<SIMD_T> vElemOffset =
844             SIMD_T::loadu_si(reinterpret_cast<const Integer<SIMD_T>*>(elemOffset));
845 
846         // step to the simdvertex
847         Integer<SIMD_T> vOffsets =
848             SIMD_T::mullo_epi32(vIndices, SIMD_T::set1_epi32(simdVertexStride));
849 
850         // step to the attribute and component
851         vOffsets = SIMD_T::add_epi32(
852             vOffsets, SIMD_T::set1_epi32(attribStride * attrib + componentStride * component));
853 
854         // step to the lane
855         vOffsets = SIMD_T::add_epi32(vOffsets, vElemOffset);
856 
857         return vOffsets;
858     }
859 
860     Float<SIMD_T> GatherComponent(const float*           pBuffer,
861                                   uint32_t               attrib,
862                                   Float<SIMD_T> const&   vMask,
863                                   Integer<SIMD_T> const& vIndices,
864                                   uint32_t               component)
865     {
866         Integer<SIMD_T> vOffsets = ComputeOffsets(attrib, vIndices, component);
867         Float<SIMD_T>   vSrc     = SIMD_T::setzero_ps();
868 
869         return SIMD_T::mask_i32gather_ps(vSrc, pBuffer, vOffsets, vMask);
870     }
871 
872     void ScatterComponent(const float*           pBuffer,
873                           uint32_t               attrib,
874                           Float<SIMD_T> const&   vMask,
875                           Integer<SIMD_T> const& vIndices,
876                           uint32_t               component,
877                           Float<SIMD_T> const&   vSrc)
878     {
879         Integer<SIMD_T> vOffsets = ComputeOffsets(attrib, vIndices, component);
880 
881         const uint32_t* pOffsets = reinterpret_cast<const uint32_t*>(&vOffsets);
882         const float*    pSrc     = reinterpret_cast<const float*>(&vSrc);
883         uint32_t        mask     = SIMD_T::movemask_ps(vMask);
884         unsigned long  lane;
885         while (_BitScanForward(&lane, mask))
886         {
887             mask &= ~(1 << lane);
888             const uint8_t* pBuf = reinterpret_cast<const uint8_t*>(pBuffer) + pOffsets[lane];
889             *(float*)pBuf       = pSrc[lane];
890         }
891     }
892 
893     template <SWR_CLIPCODES ClippingPlane>
894     void intersect(const Float<SIMD_T>&   vActiveMask,  // active lanes to operate on
895                    const Integer<SIMD_T>& s,            // index to first edge vertex v0 in pInPts.
896                    const Integer<SIMD_T>& p,            // index to second edge vertex v1 in pInPts.
897                    const Vec4<SIMD_T>&    v1,           // vertex 0 position
898                    const Vec4<SIMD_T>&    v2,           // vertex 1 position
899                    Integer<SIMD_T>&       outIndex,     // output index.
900                    const float*           pInVerts,     // array of all the input positions.
901                    uint32_t               numInAttribs, // number of attributes per vertex.
902                    float* pOutVerts) // array of output positions. We'll write our new intersection
903                                      // point at i*4.
904     {
905         uint32_t vertexAttribOffset   = this->state.backendState.vertexAttribOffset;
906         uint32_t vertexClipCullOffset = this->state.backendState.vertexClipCullOffset;
907 
908         // compute interpolation factor
909         Float<SIMD_T> t;
910         switch (ClippingPlane)
911         {
912         case FRUSTUM_LEFT:
913             t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[0]), SIMD_T::add_ps(v2[3], v2[0]));
914             break;
915         case FRUSTUM_RIGHT:
916             t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[0]), SIMD_T::sub_ps(v2[3], v2[0]));
917             break;
918         case FRUSTUM_TOP:
919             t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[1]), SIMD_T::add_ps(v2[3], v2[1]));
920             break;
921         case FRUSTUM_BOTTOM:
922             t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[1]), SIMD_T::sub_ps(v2[3], v2[1]));
923             break;
924         case FRUSTUM_NEAR:
925             // DX Znear plane is 0, GL is -w
926             if (this->state.rastState.clipHalfZ)
927             {
928                 t = ComputeInterpFactor(v1[2], v2[2]);
929             }
930             else
931             {
932                 t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[2]), SIMD_T::add_ps(v2[3], v2[2]));
933             }
934             break;
935         case FRUSTUM_FAR:
936             t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[2]), SIMD_T::sub_ps(v2[3], v2[2]));
937             break;
938         default:
939             SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
940         };
941 
942         // interpolate position and store
943         for (uint32_t c = 0; c < 4; ++c)
944         {
945             Float<SIMD_T> vOutPos = SIMD_T::fmadd_ps(SIMD_T::sub_ps(v2[c], v1[c]), t, v1[c]);
946             ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos);
947         }
948 
949         // interpolate attributes and store
950         for (uint32_t a = 0; a < numInAttribs; ++a)
951         {
952             uint32_t attribSlot = vertexAttribOffset + a;
953             for (uint32_t c = 0; c < 4; ++c)
954             {
955                 Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
956                 Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
957                 Float<SIMD_T> vOutAttrib =
958                     SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
959                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
960             }
961         }
962 
963         // interpolate clip distance if enabled
964         if (this->state.backendState.clipDistanceMask & 0xf)
965         {
966             uint32_t attribSlot = vertexClipCullOffset;
967             for (uint32_t c = 0; c < 4; ++c)
968             {
969                 Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
970                 Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
971                 Float<SIMD_T> vOutAttrib =
972                     SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
973                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
974             }
975         }
976 
977         if (this->state.backendState.clipDistanceMask & 0xf0)
978         {
979             uint32_t attribSlot = vertexClipCullOffset + 1;
980             for (uint32_t c = 0; c < 4; ++c)
981             {
982                 Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
983                 Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
984                 Float<SIMD_T> vOutAttrib =
985                     SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
986                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
987             }
988         }
989     }
990 
991     template <SWR_CLIPCODES ClippingPlane>
992     Float<SIMD_T> inside(const Vec4<SIMD_T>& v)
993     {
994         switch (ClippingPlane)
995         {
996         case FRUSTUM_LEFT:
997             return SIMD_T::cmpge_ps(v[0], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
998         case FRUSTUM_RIGHT:
999             return SIMD_T::cmple_ps(v[0], v[3]);
1000         case FRUSTUM_TOP:
1001             return SIMD_T::cmpge_ps(v[1], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
1002         case FRUSTUM_BOTTOM:
1003             return SIMD_T::cmple_ps(v[1], v[3]);
1004         case FRUSTUM_NEAR:
1005             return SIMD_T::cmpge_ps(v[2],
1006                                     this->state.rastState.clipHalfZ
1007                                         ? SIMD_T::setzero_ps()
1008                                         : SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
1009         case FRUSTUM_FAR:
1010             return SIMD_T::cmple_ps(v[2], v[3]);
1011         default:
1012             SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
1013             return SIMD_T::setzero_ps();
1014         }
1015     }
1016 
1017     template <SWR_CLIPCODES ClippingPlane>
1018     Integer<SIMD_T> ClipTriToPlane(const float*           pInVerts,
1019                                    const Integer<SIMD_T>& vNumInPts,
1020                                    uint32_t               numInAttribs,
1021                                    float*                 pOutVerts)
1022     {
1023         uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
1024 
1025         Integer<SIMD_T> vCurIndex   = SIMD_T::setzero_si();
1026         Integer<SIMD_T> vOutIndex   = SIMD_T::setzero_si();
1027         Float<SIMD_T>   vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
1028 
1029         while (!SIMD_T::testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
1030         {
1031             Integer<SIMD_T> s             = vCurIndex;
1032             Integer<SIMD_T> p             = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
1033             Integer<SIMD_T> underFlowMask = SIMD_T::cmpgt_epi32(vNumInPts, p);
1034             p                             = SIMD_T::castps_si(SIMD_T::blendv_ps(
1035                 SIMD_T::setzero_ps(), SIMD_T::castsi_ps(p), SIMD_T::castsi_ps(underFlowMask)));
1036 
1037             // gather position
1038             Vec4<SIMD_T> vInPos0, vInPos1;
1039             for (uint32_t c = 0; c < 4; ++c)
1040             {
1041                 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
1042                 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
1043             }
1044 
1045             // compute inside mask
1046             Float<SIMD_T> s_in = inside<ClippingPlane>(vInPos0);
1047             Float<SIMD_T> p_in = inside<ClippingPlane>(vInPos1);
1048 
1049             // compute intersection mask (s_in != p_in)
1050             Float<SIMD_T> intersectMask = SIMD_T::xor_ps(s_in, p_in);
1051             intersectMask               = SIMD_T::and_ps(intersectMask, vActiveMask);
1052 
1053             // store s if inside
1054             s_in = SIMD_T::and_ps(s_in, vActiveMask);
1055             if (!SIMD_T::testz_ps(s_in, s_in))
1056             {
1057                 // store position
1058                 for (uint32_t c = 0; c < 4; ++c)
1059                 {
1060                     ScatterComponent(
1061                         pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
1062                 }
1063 
1064                 // store attribs
1065                 for (uint32_t a = 0; a < numInAttribs; ++a)
1066                 {
1067                     uint32_t attribSlot = vertexAttribOffset + a;
1068                     for (uint32_t c = 0; c < 4; ++c)
1069                     {
1070                         Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
1071                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
1072                     }
1073                 }
1074 
1075                 // store clip distance if enabled
1076                 uint32_t vertexClipCullSlot = this->state.backendState.vertexClipCullOffset;
1077                 if (this->state.backendState.clipDistanceMask & 0xf)
1078                 {
1079                     uint32_t attribSlot = vertexClipCullSlot;
1080                     for (uint32_t c = 0; c < 4; ++c)
1081                     {
1082                         Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
1083                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
1084                     }
1085                 }
1086 
1087                 if (this->state.backendState.clipDistanceMask & 0xf0)
1088                 {
1089                     uint32_t attribSlot = vertexClipCullSlot + 1;
1090                     for (uint32_t c = 0; c < 4; ++c)
1091                     {
1092                         Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
1093                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
1094                     }
1095                 }
1096 
1097                 // increment outIndex
1098                 vOutIndex = SIMD_T::blendv_epi32(
1099                     vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
1100             }
1101 
1102             // compute and store intersection
1103             if (!SIMD_T::testz_ps(intersectMask, intersectMask))
1104             {
1105                 intersect<ClippingPlane>(intersectMask,
1106                                          s,
1107                                          p,
1108                                          vInPos0,
1109                                          vInPos1,
1110                                          vOutIndex,
1111                                          pInVerts,
1112                                          numInAttribs,
1113                                          pOutVerts);
1114 
1115                 // increment outIndex for active lanes
1116                 vOutIndex = SIMD_T::blendv_epi32(
1117                     vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
1118             }
1119 
1120             // increment loop index and update active mask
1121             vCurIndex   = SIMD_T::add_epi32(vCurIndex, SIMD_T::set1_epi32(1));
1122             vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
1123         }
1124 
1125         return vOutIndex;
1126     }
1127 
1128     template <SWR_CLIPCODES ClippingPlane>
1129     Integer<SIMD_T> ClipLineToPlane(const float*           pInVerts,
1130                                     const Integer<SIMD_T>& vNumInPts,
1131                                     uint32_t               numInAttribs,
1132                                     float*                 pOutVerts)
1133     {
1134         uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
1135 
1136         Integer<SIMD_T> vCurIndex   = SIMD_T::setzero_si();
1137         Integer<SIMD_T> vOutIndex   = SIMD_T::setzero_si();
1138         Float<SIMD_T>   vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
1139 
1140         if (!SIMD_T::testz_ps(vActiveMask, vActiveMask))
1141         {
1142             Integer<SIMD_T> s = vCurIndex;
1143             Integer<SIMD_T> p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
1144 
1145             // gather position
1146             Vec4<SIMD_T> vInPos0, vInPos1;
1147             for (uint32_t c = 0; c < 4; ++c)
1148             {
1149                 vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
1150                 vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
1151             }
1152 
1153             // compute inside mask
1154             Float<SIMD_T> s_in = inside<ClippingPlane>(vInPos0);
1155             Float<SIMD_T> p_in = inside<ClippingPlane>(vInPos1);
1156 
1157             // compute intersection mask (s_in != p_in)
1158             Float<SIMD_T> intersectMask = SIMD_T::xor_ps(s_in, p_in);
1159             intersectMask               = SIMD_T::and_ps(intersectMask, vActiveMask);
1160 
1161             // store s if inside
1162             s_in = SIMD_T::and_ps(s_in, vActiveMask);
1163             if (!SIMD_T::testz_ps(s_in, s_in))
1164             {
1165                 for (uint32_t c = 0; c < 4; ++c)
1166                 {
1167                     ScatterComponent(
1168                         pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
1169                 }
1170 
1171                 // interpolate attributes and store
1172                 for (uint32_t a = 0; a < numInAttribs; ++a)
1173                 {
1174                     uint32_t attribSlot = vertexAttribOffset + a;
1175                     for (uint32_t c = 0; c < 4; ++c)
1176                     {
1177                         Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
1178                         ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
1179                     }
1180                 }
1181 
1182                 // increment outIndex
1183                 vOutIndex = SIMD_T::blendv_epi32(
1184                     vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
1185             }
1186 
1187             // compute and store intersection
1188             if (!SIMD_T::testz_ps(intersectMask, intersectMask))
1189             {
1190                 intersect<ClippingPlane>(intersectMask,
1191                                          s,
1192                                          p,
1193                                          vInPos0,
1194                                          vInPos1,
1195                                          vOutIndex,
1196                                          pInVerts,
1197                                          numInAttribs,
1198                                          pOutVerts);
1199 
1200                 // increment outIndex for active lanes
1201                 vOutIndex = SIMD_T::blendv_epi32(
1202                     vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
1203             }
1204 
1205             // store p if inside
1206             p_in = SIMD_T::and_ps(p_in, vActiveMask);
1207             if (!SIMD_T::testz_ps(p_in, p_in))
1208             {
1209                 for (uint32_t c = 0; c < 4; ++c)
1210                 {
1211                     ScatterComponent(
1212                         pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]);
1213                 }
1214 
1215                 // interpolate attributes and store
1216                 for (uint32_t a = 0; a < numInAttribs; ++a)
1217                 {
1218                     uint32_t attribSlot = vertexAttribOffset + a;
1219                     for (uint32_t c = 0; c < 4; ++c)
1220                     {
1221                         Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
1222                         ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib);
1223                     }
1224                 }
1225 
1226                 // increment outIndex
1227                 vOutIndex = SIMD_T::blendv_epi32(
1228                     vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), p_in);
1229             }
1230         }
1231 
1232         return vOutIndex;
1233     }
1234 
1235     Integer<SIMD_T> ClipPrims(float*               pVertices,
1236                               const Float<SIMD_T>& vPrimMask,
1237                               const Float<SIMD_T>& vClipMask,
1238                               int                  numAttribs)
1239     {
1240         // temp storage
1241         float* pTempVerts = reinterpret_cast<float*>(this->tmpVerts);
1242 
1243         // zero out num input verts for non-active lanes
1244         Integer<SIMD_T> vNumInPts = SIMD_T::set1_epi32(NumVertsPerPrimT);
1245         vNumInPts = SIMD_T::blendv_epi32(SIMD_T::setzero_si(), vNumInPts, vClipMask);
1246 
1247         // clip prims to frustum
1248         Integer<SIMD_T> vNumOutPts;
1249         if (NumVertsPerPrimT == 3)
1250         {
1251             vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
1252             vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1253             vNumOutPts =
1254                 ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1255             vNumOutPts =
1256                 ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1257             vNumOutPts =
1258                 ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1259             vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1260         }
1261         else
1262         {
1263             SWR_ASSERT(NumVertsPerPrimT == 2);
1264             vNumOutPts =
1265                 ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
1266             vNumOutPts =
1267                 ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1268             vNumOutPts =
1269                 ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1270             vNumOutPts =
1271                 ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1272             vNumOutPts =
1273                 ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
1274             vNumOutPts =
1275                 ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
1276         }
1277 
1278         // restore num verts for non-clipped, active lanes
1279         Float<SIMD_T> vNonClippedMask = SIMD_T::andnot_ps(vClipMask, vPrimMask);
1280         vNumOutPts =
1281             SIMD_T::blendv_epi32(vNumOutPts, SIMD_T::set1_epi32(NumVertsPerPrimT), vNonClippedMask);
1282 
1283         return vNumOutPts;
1284     }
1285 
    // worker id handed to the binner callback
    const uint32_t   workerId{0};
    // draw context handed to the binner callback
    DRAW_CONTEXT*    pDC{nullptr};
    // API state; the clipper reads backendState (attribute / clip-cull offsets,
    // clip and cull distance masks) and rastState (clipHalfZ) from it
    const API_STATE& state;
    // one SIMD clip code per primitive vertex
    // NOTE(review): presumably written by ComputeClipCodes - confirm
    Float<SIMD_T>    clipCodes[NumVertsPerPrimT];
    // NOTE(review): not referenced by the private clip helpers below the
    // guardband stage; purpose to be confirmed against the rest of the class
    SIMDVERTEX_T<SIMD_T>* clippedVerts;
    // scratch vertex storage ClipPrims alternates with while clipping
    SIMDVERTEX_T<SIMD_T>* tmpVerts;
    // NOTE(review): not referenced by the private clip helpers below the
    // guardband stage; purpose to be confirmed against the rest of the class
    SIMDVERTEX_T<SIMD_T>* transposedVerts;
1293 };
1294 
1295 // pipeline stage functions
// Declarations only; the definitions are not part of this header section.
// All four share one signature: draw context, primitive assembly state,
// worker id, the SIMD primitive vertices, a lane mask of valid primitives and
// per-lane primitive id / viewport index / render target index.
// NOTE(review): presumably one clip entry point per primitive topology -
// confirm against the definitions.
void ClipRectangles(DRAW_CONTEXT*      pDC,
                    PA_STATE&          pa,
                    uint32_t           workerId,
                    simdvector         prims[],
                    uint32_t           primMask,
                    simdscalari const& primId,
                    simdscalari const& viewportIdx,
                    simdscalari const& rtIdx);
void ClipTriangles(DRAW_CONTEXT*      pDC,
                   PA_STATE&          pa,
                   uint32_t           workerId,
                   simdvector         prims[],
                   uint32_t           primMask,
                   simdscalari const& primId,
                   simdscalari const& viewportIdx,
                   simdscalari const& rtIdx);
void ClipLines(DRAW_CONTEXT*      pDC,
               PA_STATE&          pa,
               uint32_t           workerId,
               simdvector         prims[],
               uint32_t           primMask,
               simdscalari const& primId,
               simdscalari const& viewportIdx,
               simdscalari const& rtIdx);
void ClipPoints(DRAW_CONTEXT*      pDC,
                PA_STATE&          pa,
                uint32_t           workerId,
                simdvector         prims[],
                uint32_t           primMask,
                simdscalari const& primId,
                simdscalari const& viewportIdx,
                simdscalari const& rtIdx);
#if USE_SIMD16_FRONTEND
// simd16 counterparts of the clip stage declarations: same parameter list,
// taking simd16vector / simd16scalari operands. Only declared when
// USE_SIMD16_FRONTEND is enabled; definitions are not part of this header
// section.
void SIMDCALL ClipRectangles_simd16(DRAW_CONTEXT*        pDC,
                                    PA_STATE&            pa,
                                    uint32_t             workerId,
                                    simd16vector         prims[],
                                    uint32_t             primMask,
                                    simd16scalari const& primId,
                                    simd16scalari const& viewportIdx,
                                    simd16scalari const& rtIdx);
void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT*        pDC,
                                   PA_STATE&            pa,
                                   uint32_t             workerId,
                                   simd16vector         prims[],
                                   uint32_t             primMask,
                                   simd16scalari const& primId,
                                   simd16scalari const& viewportIdx,
                                   simd16scalari const& rtIdx);
void SIMDCALL ClipLines_simd16(DRAW_CONTEXT*        pDC,
                               PA_STATE&            pa,
                               uint32_t             workerId,
                               simd16vector         prims[],
                               uint32_t             primMask,
                               simd16scalari const& primId,
                               simd16scalari const& viewportIdx,
                               simd16scalari const& rtIdx);
void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT*        pDC,
                                PA_STATE&            pa,
                                uint32_t             workerId,
                                simd16vector         prims[],
                                uint32_t             primMask,
                                simd16scalari const& primId,
                                simd16scalari const& viewportIdx,
                                simd16scalari const& rtIdx);
#endif
1362