/****************************************************************************
 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @file frontend.h
 *
 * @brief Definitions for Frontend which handles vertex processing,
 *        primitive assembly, clipping, binning, etc.
 *
 ******************************************************************************/
#pragma once
#include "context.h"
#include "common/simdintrin.h"
#include <type_traits>

//////////////////////////////////////////////////////////////////////////
/// @brief Helper function to generate a bit mask of the low numBits bits
static INLINE uint32_t
               GenMask(uint32_t numBits)
{
    SWR_ASSERT(
        numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
    // guard the numBits == 32 case; shifting a 32-bit value by 32 is undefined behavior
    return (numBits == 32) ? 0xFFFFFFFFU : ((1U << numBits) - 1);
}
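// Example results (for illustration): GenMask(0) == 0x0, GenMask(4) == 0xF,
// GenMask(8) == 0xFF.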

// Calculates the A and B coefficients for the 3 edges of the triangle
//
// maths for edge equations:
//   standard form of a line in 2d
//   Ax + By + C = 0
//   A = y0 - y1
//   B = x1 - x0
//   C = x0y1 - x1y0
INLINE
void triangleSetupAB(const __m128 vX, const __m128 vY, __m128& vA, __m128& vB)
{
    // vYsub = y1 y2 y0 dc
    __m128 vYsub = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(3, 0, 2, 1));
    // vY =    y0 y1 y2 dc
    vA = _mm_sub_ps(vY, vYsub);

    // Result:
    // A[0] = y0 - y1
    // A[1] = y1 - y2
    // A[2] = y2 - y0

    // vXsub = x1 x2 x0 dc
    __m128 vXsub = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(3, 0, 2, 1));
    // vX =    x0 x1 x2 dc
    vB = _mm_sub_ps(vXsub, vX);

    // Result:
    // B[0] = x1 - x0
    // B[1] = x2 - x1
    // B[2] = x0 - x2
}
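
// For reference, a scalar sketch of the same edge-coefficient computation (hypothetical
// helper for exposition only, not used elsewhere in SWR): edge i runs from vertex i to
// vertex (i + 1) % 3, producing the A[i]/B[i] values listed above.
static INLINE void triangleSetupABScalarRef(const float x[3],
                                            const float y[3],
                                            float       A[3],
                                            float       B[3])
{
    for (uint32_t i = 0; i < 3; ++i)
    {
        const uint32_t j = (i + 1) % 3; // next vertex, wrapping 2 -> 0

        A[i] = y[i] - y[j]; // A[0] = y0 - y1, A[1] = y1 - y2, A[2] = y2 - y0
        B[i] = x[j] - x[i]; // B[0] = x1 - x0, B[1] = x2 - x1, B[2] = x0 - x2
    }
}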

INLINE
void triangleSetupABInt(const __m128i vX, const __m128i vY, __m128i& vA, __m128i& vB)
{
    // generate edge equations
    // A = y0 - y1
    // B = x1 - x0
    // C = x0y1 - x1y0
    __m128i vYsub = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 2, 1));
    vA            = _mm_sub_epi32(vY, vYsub);

    __m128i vXsub = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 2, 1));
    vB            = _mm_sub_epi32(vXsub, vX);
}

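// Note: the "vertical" variants below work on SoA data: vX[i]/vY[i] hold vertex i's
// coordinate for a full SIMD width of triangles (one triangle per lane), rather than the
// three vertices of a single triangle packed into one __m128i as above.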
INLINE
void triangleSetupABIntVertical(const simdscalari vX[3],
                                const simdscalari vY[3],
                                simdscalari (&vA)[3],
                                simdscalari (&vB)[3])
{
    // A = y0 - y1
    // B = x1 - x0
    vA[0] = _simd_sub_epi32(vY[0], vY[1]);
    vA[1] = _simd_sub_epi32(vY[1], vY[2]);
    vA[2] = _simd_sub_epi32(vY[2], vY[0]);

    vB[0] = _simd_sub_epi32(vX[1], vX[0]);
    vB[1] = _simd_sub_epi32(vX[2], vX[1]);
    vB[2] = _simd_sub_epi32(vX[0], vX[2]);
}

#if ENABLE_AVX512_SIMD16
INLINE
void triangleSetupABIntVertical(const simd16scalari vX[3],
                                const simd16scalari vY[3],
                                simd16scalari (&vA)[3],
                                simd16scalari (&vB)[3])
{
    // A = y0 - y1
    // B = x1 - x0
    vA[0] = _simd16_sub_epi32(vY[0], vY[1]);
    vA[1] = _simd16_sub_epi32(vY[1], vY[2]);
    vA[2] = _simd16_sub_epi32(vY[2], vY[0]);

    vB[0] = _simd16_sub_epi32(vX[1], vX[0]);
    vB[1] = _simd16_sub_epi32(vX[2], vX[1]);
    vB[2] = _simd16_sub_epi32(vX[0], vX[2]);
}

#endif
// Calculate the determinant of the triangle
// 2 vectors between the 3 points: P, Q
// Px = x0-x2, Py = y0-y2
// Qx = x1-x2, Qy = y1-y2
//       |Px Qx|
// det = |     | = PxQy - PyQx
//       |Py Qy|
// which simplifies to: (x0-x2)*(y1-y2) - (y0-y2)*(x1-x2)
// To reuse the A & B coefficients already calculated, factor a -1 out of Py and Qx:
//               : B[2]*A[1] - (-(y2-y0))*(-(x2-x1))
//               : B[2]*A[1] - (-1)(-1)(y2-y0)*(x2-x1)
//               : B[2]*A[1] - A[2]*B[1]
INLINE
float calcDeterminantInt(const __m128i vA, const __m128i vB)
{
    // vAShuf = [A1, A0, A2, A0]
    __m128i vAShuf = _mm_shuffle_epi32(vA, _MM_SHUFFLE(0, 2, 0, 1));
    // vBShuf = [B2, B0, B1, B0]
    __m128i vBShuf = _mm_shuffle_epi32(vB, _MM_SHUFFLE(0, 1, 0, 2));
    // vMul = [A1*B2, A2*B1] (64-bit products of the even lanes)
    __m128i vMul = _mm_mul_epi32(vAShuf, vBShuf);

    // shuffle upper to lower
    // vMul2 = [A2*B1, A2*B1]
    __m128i vMul2 = _mm_shuffle_epi32(vMul, _MM_SHUFFLE(3, 2, 3, 2));
    // vMul = [A1*B2 - A2*B1]
    vMul = _mm_sub_epi64(vMul, vMul2);

    int64_t result;
    _mm_store_sd((double*)&result, _mm_castsi128_pd(vMul));

    // the product of two fixed-point coordinates carries twice the fractional bits;
    // scale the result back down before converting to float
    double dResult = (double)result;
    dResult        = dResult * (1.0 / FIXED_POINT16_SCALE);

    return (float)dResult;
}
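
// For reference, a scalar sketch of the 64-bit determinant term above (hypothetical
// helper for exposition only, not used elsewhere in SWR); the caller would still need
// to remove the fixed-point scale as calcDeterminantInt does.
static INLINE int64_t calcDeterminantIntScalarRef(const int32_t A[3], const int32_t B[3])
{
    // det = A1*B2 - A2*B1, widened to 64 bits before multiplying to avoid overflow
    return (int64_t)A[1] * (int64_t)B[2] - (int64_t)A[2] * (int64_t)B[1];
}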

INLINE
void calcDeterminantIntVertical(const simdscalari vA[3],
                                const simdscalari vB[3],
                                simdscalari*      pvDet)
{
    // refer to calcDeterminantInt comment for calculation explanation

    // A1*B2
    simdscalari vA1Lo = _simd_unpacklo_epi32(vA[1], vA[1]); // 0 0 1 1 4 4 5 5
    simdscalari vA1Hi = _simd_unpackhi_epi32(vA[1], vA[1]); // 2 2 3 3 6 6 7 7

    simdscalari vB2Lo = _simd_unpacklo_epi32(vB[2], vB[2]);
    simdscalari vB2Hi = _simd_unpackhi_epi32(vB[2], vB[2]);

    simdscalari vA1B2Lo = _simd_mul_epi32(vA1Lo, vB2Lo); // 0 1 4 5
    simdscalari vA1B2Hi = _simd_mul_epi32(vA1Hi, vB2Hi); // 2 3 6 7

    // B1*A2
    simdscalari vA2Lo = _simd_unpacklo_epi32(vA[2], vA[2]);
    simdscalari vA2Hi = _simd_unpackhi_epi32(vA[2], vA[2]);

    simdscalari vB1Lo = _simd_unpacklo_epi32(vB[1], vB[1]);
    simdscalari vB1Hi = _simd_unpackhi_epi32(vB[1], vB[1]);

    simdscalari vA2B1Lo = _simd_mul_epi32(vA2Lo, vB1Lo);
    simdscalari vA2B1Hi = _simd_mul_epi32(vA2Hi, vB1Hi);

    // A1*B2 - A2*B1
    simdscalari detLo = _simd_sub_epi64(vA1B2Lo, vA2B1Lo);
    simdscalari detHi = _simd_sub_epi64(vA1B2Hi, vA2B1Hi);

    // shuffle 0 1 4 5 2 3 6 7 -> 0 1 2 3
    simdscalari vResultLo = _simd_permute2f128_si(detLo, detHi, 0x20);

    // shuffle 0 1 4 5 2 3 6 7 -> 4 5 6 7
    simdscalari vResultHi = _simd_permute2f128_si(detLo, detHi, 0x31);

    pvDet[0] = vResultLo;
    pvDet[1] = vResultHi;
}

#if ENABLE_AVX512_SIMD16
INLINE
void calcDeterminantIntVertical(const simd16scalari vA[3],
                                const simd16scalari vB[3],
                                simd16scalari*      pvDet)
{
    // refer to calcDeterminantInt comment for calculation explanation

    // A1*B2
    simd16scalari vA1_lo =
        _simd16_unpacklo_epi32(vA[1], vA[1]); // X 0 X 1 X 4 X 5 X 8 X 9 X C X D (32b)
    simd16scalari vA1_hi = _simd16_unpackhi_epi32(vA[1], vA[1]); // X 2 X 3 X 6 X 7 X A X B X E X F

    simd16scalari vB2_lo = _simd16_unpacklo_epi32(vB[2], vB[2]);
    simd16scalari vB2_hi = _simd16_unpackhi_epi32(vB[2], vB[2]);

    simd16scalari vA1B2_lo = _simd16_mul_epi32(vA1_lo, vB2_lo); // 0 1 4 5 8 9 C D (64b)
    simd16scalari vA1B2_hi = _simd16_mul_epi32(vA1_hi, vB2_hi); // 2 3 6 7 A B E F

    // B1*A2
    simd16scalari vA2_lo = _simd16_unpacklo_epi32(vA[2], vA[2]);
    simd16scalari vA2_hi = _simd16_unpackhi_epi32(vA[2], vA[2]);

    simd16scalari vB1_lo = _simd16_unpacklo_epi32(vB[1], vB[1]);
    simd16scalari vB1_hi = _simd16_unpackhi_epi32(vB[1], vB[1]);

    simd16scalari vA2B1_lo = _simd16_mul_epi32(vA2_lo, vB1_lo);
    simd16scalari vA2B1_hi = _simd16_mul_epi32(vA2_hi, vB1_hi);

    // A1*B2 - A2*B1
    simd16scalari difflo = _simd16_sub_epi64(vA1B2_lo, vA2B1_lo); // 0 1 4 5 8 9 C D (64b)
    simd16scalari diffhi = _simd16_sub_epi64(vA1B2_hi, vA2B1_hi); // 2 3 6 7 A B E F

    // (1, 0, 1, 0) = 01 00 01 00 = 0x44, (3, 2, 3, 2) = 11 10 11 10 = 0xEE
    simd16scalari templo = _simd16_permute2f128_si(difflo, diffhi, 0x44); // 0 1 4 5 2 3 6 7 (64b)
    simd16scalari temphi = _simd16_permute2f128_si(difflo, diffhi, 0xEE); // 8 9 C D A B E F

    // (3, 1, 2, 0) = 11 01 10 00 = 0xD8
    pvDet[0] = _simd16_permute2f128_si(templo, templo, 0xD8); // 0 1 2 3 4 5 6 7 (64b)
    pvDet[1] = _simd16_permute2f128_si(temphi, temphi, 0xD8); // 8 9 A B C D E F
}

#endif
INLINE
void triangleSetupC(const __m128 vX, const __m128 vY, const __m128 vA, const __m128& vB, __m128& vC)
{
    // C = -Ax - By
    vC         = _mm_mul_ps(vA, vX);
    __m128 vCy = _mm_mul_ps(vB, vY);
    vC         = _mm_mul_ps(vC, _mm_set1_ps(-1.0f));
    vC         = _mm_sub_ps(vC, vCy);
}
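
// Note: evaluating C[i] = -A[i]*x[i] - B[i]*y[i] at vertex i (which lies on edge i)
// recovers the standard-form constant listed earlier; e.g. for edge 0:
//     C[0] = -(y0 - y1)*x0 - (x1 - x0)*y0 = x0*y1 - x1*y0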

template <uint32_t NumVerts>
INLINE void viewportTransform(simdvector* v, const SWR_VIEWPORT_MATRICES& vpMatrices)
{
    simdscalar m00 = _simd_load1_ps(&vpMatrices.m00[0]);
    simdscalar m30 = _simd_load1_ps(&vpMatrices.m30[0]);
    simdscalar m11 = _simd_load1_ps(&vpMatrices.m11[0]);
    simdscalar m31 = _simd_load1_ps(&vpMatrices.m31[0]);
    simdscalar m22 = _simd_load1_ps(&vpMatrices.m22[0]);
    simdscalar m32 = _simd_load1_ps(&vpMatrices.m32[0]);

    for (uint32_t i = 0; i < NumVerts; ++i)
    {
        v[i].x = _simd_fmadd_ps(v[i].x, m00, m30);
        v[i].y = _simd_fmadd_ps(v[i].y, m11, m31);
        v[i].z = _simd_fmadd_ps(v[i].z, m22, m32);
    }
}
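
// Illustrative scalar form of the transform above (names are for exposition only):
// each component is scaled by the corresponding diagonal term of viewport matrix 0 and
// offset by its translation term, i.e.
//     screenX = x * m00 + m30;
//     screenY = y * m11 + m31;
//     screenZ = z * m22 + m32;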

#if USE_SIMD16_FRONTEND
template <uint32_t NumVerts>
INLINE void viewportTransform(simd16vector* v, const SWR_VIEWPORT_MATRICES& vpMatrices)
{
    const simd16scalar m00 = _simd16_broadcast_ss(&vpMatrices.m00[0]);
    const simd16scalar m30 = _simd16_broadcast_ss(&vpMatrices.m30[0]);
    const simd16scalar m11 = _simd16_broadcast_ss(&vpMatrices.m11[0]);
    const simd16scalar m31 = _simd16_broadcast_ss(&vpMatrices.m31[0]);
    const simd16scalar m22 = _simd16_broadcast_ss(&vpMatrices.m22[0]);
    const simd16scalar m32 = _simd16_broadcast_ss(&vpMatrices.m32[0]);

    for (uint32_t i = 0; i < NumVerts; ++i)
    {
        v[i].x = _simd16_fmadd_ps(v[i].x, m00, m30);
        v[i].y = _simd16_fmadd_ps(v[i].y, m11, m31);
        v[i].z = _simd16_fmadd_ps(v[i].z, m22, m32);
    }
}

#endif
template <uint32_t NumVerts>
INLINE void viewportTransform(simdvector*                  v,
                              const SWR_VIEWPORT_MATRICES& vpMatrices,
                              simdscalari const&           vViewportIdx)
{
    // gather each matrix element using the per-lane viewport array index
    // (scale of 4 bytes = sizeof(float))
    simdscalar m00 = _simd_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4);
    simdscalar m30 = _simd_i32gather_ps(&vpMatrices.m30[0], vViewportIdx, 4);
    simdscalar m11 = _simd_i32gather_ps(&vpMatrices.m11[0], vViewportIdx, 4);
    simdscalar m31 = _simd_i32gather_ps(&vpMatrices.m31[0], vViewportIdx, 4);
    simdscalar m22 = _simd_i32gather_ps(&vpMatrices.m22[0], vViewportIdx, 4);
    simdscalar m32 = _simd_i32gather_ps(&vpMatrices.m32[0], vViewportIdx, 4);

    for (uint32_t i = 0; i < NumVerts; ++i)
    {
        v[i].x = _simd_fmadd_ps(v[i].x, m00, m30);
        v[i].y = _simd_fmadd_ps(v[i].y, m11, m31);
        v[i].z = _simd_fmadd_ps(v[i].z, m22, m32);
    }
}

#if USE_SIMD16_FRONTEND
template <uint32_t NumVerts>
INLINE void viewportTransform(simd16vector*                v,
                              const SWR_VIEWPORT_MATRICES& vpMatrices,
                              simd16scalari const&         vViewportIdx)
{
    // gather each matrix element using the per-lane viewport array index
    // (scale of 4 bytes = sizeof(float))
    const simd16scalar m00 = _simd16_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4);
    const simd16scalar m30 = _simd16_i32gather_ps(&vpMatrices.m30[0], vViewportIdx, 4);
    const simd16scalar m11 = _simd16_i32gather_ps(&vpMatrices.m11[0], vViewportIdx, 4);
    const simd16scalar m31 = _simd16_i32gather_ps(&vpMatrices.m31[0], vViewportIdx, 4);
    const simd16scalar m22 = _simd16_i32gather_ps(&vpMatrices.m22[0], vViewportIdx, 4);
    const simd16scalar m32 = _simd16_i32gather_ps(&vpMatrices.m32[0], vViewportIdx, 4);

    for (uint32_t i = 0; i < NumVerts; ++i)
    {
        v[i].x = _simd16_fmadd_ps(v[i].x, m00, m30);
        v[i].y = _simd16_fmadd_ps(v[i].y, m11, m31);
        v[i].z = _simd16_fmadd_ps(v[i].z, m22, m32);
    }
}

#endif
INLINE
void calcBoundingBoxInt(const __m128i& vX, const __m128i& vY, SWR_RECT& bbox)
{
    // horizontal integer min/max across the three vertex positions
    __m128i vX1 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 2, 0, 1));
    __m128i vX2 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 1, 2));

    __m128i vY1 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 2, 0, 1));
    __m128i vY2 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 1, 2));

    __m128i vMinX = _mm_min_epi32(vX, vX1);
    vMinX         = _mm_min_epi32(vMinX, vX2);

    __m128i vMaxX = _mm_max_epi32(vX, vX1);
    vMaxX         = _mm_max_epi32(vMaxX, vX2);

    __m128i vMinY = _mm_min_epi32(vY, vY1);
    vMinY         = _mm_min_epi32(vMinY, vY2);

    __m128i vMaxY = _mm_max_epi32(vY, vY1);
    vMaxY         = _mm_max_epi32(vMaxY, vY2);

    // lane 0 now holds the min/max over vertices 0..2
    bbox.xmin = _mm_extract_epi32(vMinX, 0);
    bbox.xmax = _mm_extract_epi32(vMaxX, 0);
    bbox.ymin = _mm_extract_epi32(vMinY, 0);
    bbox.ymax = _mm_extract_epi32(vMaxY, 0);
}

INLINE
bool CanUseSimplePoints(DRAW_CONTEXT* pDC)
{
    const API_STATE& state = GetApiState(pDC);

    return (state.rastState.sampleCount == SWR_MULTISAMPLE_1X &&
            state.rastState.pointSize == 1.0f && !state.rastState.pointParam &&
            !state.rastState.pointSpriteEnable && !state.backendState.clipDistanceMask);
}

INLINE
bool vHasNaN(const __m128& vec)
{
    // NaN compares unordered with itself, so cmpunord sets all bits in any NaN lane;
    // movemask collects the sign bits, giving a nonzero mask if any lane is NaN
    const __m128  result = _mm_cmpunord_ps(vec, vec);
    const int32_t mask   = _mm_movemask_ps(result);
    return (mask != 0);
}

uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numElements);
uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts);

// Returns the ProcessDraw front-end work function for the given pipeline configuration.
// All combinations of the boolean parameters are available.
PFN_FE_WORK_FUNC GetProcessDrawFunc(bool IsIndexed,
                                    bool IsCutIndexEnabled,
                                    bool HasTessellation,
                                    bool HasGeometryShader,
                                    bool HasStreamOut,
                                    bool HasRasterization);
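
// Hypothetical usage sketch (the argument values are illustrative only):
//     PFN_FE_WORK_FUNC pfnFE = GetProcessDrawFunc(true,   // IsIndexed
//                                                 false,  // IsCutIndexEnabled
//                                                 false,  // HasTessellation
//                                                 false,  // HasGeometryShader
//                                                 false,  // HasStreamOut
//                                                 true);  // HasRasterization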

void ProcessClear(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData);
void ProcessStoreTiles(SWR_CONTEXT*  pContext,
                       DRAW_CONTEXT* pDC,
                       uint32_t      workerId,
                       void*         pUserData);
void ProcessDiscardInvalidateTiles(SWR_CONTEXT*  pContext,
                                   DRAW_CONTEXT* pDC,
                                   uint32_t      workerId,
                                   void*         pUserData);
void ProcessSync(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData);
void ProcessShutdown(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData);

PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative);
#if USE_SIMD16_FRONTEND
PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative);
#endif

struct PA_STATE_BASE; // forward decl
void BinPoints(DRAW_CONTEXT*      pDC,
               PA_STATE&          pa,
               uint32_t           workerId,
               simdvector         prims[3],
               uint32_t           primMask,
               simdscalari const& primID,
               simdscalari const& viewportIdx,
               simdscalari const& rtIdx);
void BinLines(DRAW_CONTEXT*      pDC,
              PA_STATE&          pa,
              uint32_t           workerId,
              simdvector         prims[3],
              uint32_t           primMask,
              simdscalari const& primID,
              simdscalari const& viewportIdx,
              simdscalari const& rtIdx);
#if USE_SIMD16_FRONTEND
void SIMDCALL BinPoints_simd16(DRAW_CONTEXT*        pDC,
                               PA_STATE&            pa,
                               uint32_t             workerId,
                               simd16vector         prims[3],
                               uint32_t             primMask,
                               simd16scalari const& primID,
                               simd16scalari const& viewportIdx,
                               simd16scalari const& rtIdx);
void SIMDCALL BinLines_simd16(DRAW_CONTEXT*        pDC,
                              PA_STATE&            pa,
                              uint32_t             workerId,
                              simd16vector         prims[3],
                              uint32_t             primMask,
                              simd16scalari const& primID,
                              simd16scalari const& viewportIdx,
                              simd16scalari const& rtIdx);
#endif