1 /****************************************************************************
2  * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * @file rasterizer.cpp
24  *
25  * @brief Implementation for the rasterizer.
26  *
27  ******************************************************************************/
28 
29 #include <vector>
30 #include <algorithm>
31 
32 #include "rasterizer.h"
33 #include "backends/gen_rasterizer.hpp"
34 #include "rdtsc_core.h"
35 #include "backend.h"
36 #include "utils.h"
37 #include "frontend.h"
38 #include "tilemgr.h"
39 #include "memory/tilingtraits.h"
40 #include "rasterizer_impl.h"
41 
// Table of rasterizer work functions. GetRasterizerFunc() in this file indexes it as
// [numSamples][IsCenter][IsConservative][InputCoverage][EdgeEnable][RasterizeScissorEdges].
// NOTE(review): nothing in this file writes the table; presumably InitRasterizerFuncs()
// (called from InitRasterizerFunctions()) fills it in - confirm.
PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT]
                              [STATE_VALID_TRI_EDGE_COUNT][2];
44 
RasterizeLine(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroTile,void * pData)45 void RasterizeLine(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData)
46 {
47     const TRIANGLE_WORK_DESC& workDesc = *((TRIANGLE_WORK_DESC*)pData);
48 #if KNOB_ENABLE_TOSS_POINTS
49     if (KNOB_TOSS_BIN_TRIS)
50     {
51         return;
52     }
53 #endif
54 
55     // bloat line to two tris and call the triangle rasterizer twice
56     RDTSC_BEGIN(pDC->pContext->pBucketMgr, BERasterizeLine, pDC->drawId);
57 
58     const API_STATE&     state     = GetApiState(pDC);
59     const SWR_RASTSTATE& rastState = state.rastState;
60 
61     // macrotile dimensioning
62     uint32_t macroX, macroY;
63     MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
64     int32_t macroBoxLeft   = macroX * KNOB_MACROTILE_X_DIM_FIXED;
65     int32_t macroBoxRight  = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
66     int32_t macroBoxTop    = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
67     int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
68 
69     const SWR_RECT& scissorInFixedPoint =
70         state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
71 
72     // create a copy of the triangle buffer to write our adjusted vertices to
73     OSALIGNSIMD(float) newTriBuffer[4 * 4];
74     TRIANGLE_WORK_DESC newWorkDesc = workDesc;
75     newWorkDesc.pTriBuffer         = &newTriBuffer[0];
76 
77     // create a copy of the attrib buffer to write our adjusted attribs to
78     OSALIGNSIMD(float) newAttribBuffer[4 * 3 * SWR_VTX_NUM_SLOTS];
79     newWorkDesc.pAttribs = &newAttribBuffer[0];
80 
81     const __m128 vBloat0 = _mm_set_ps(0.5f, -0.5f, -0.5f, 0.5f);
82     const __m128 vBloat1 = _mm_set_ps(0.5f, 0.5f, 0.5f, -0.5f);
83 
84     __m128 vX, vY, vZ, vRecipW;
85 
86     vX      = _mm_load_ps(workDesc.pTriBuffer);
87     vY      = _mm_load_ps(workDesc.pTriBuffer + 4);
88     vZ      = _mm_load_ps(workDesc.pTriBuffer + 8);
89     vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
90 
91     // triangle 0
92     // v0,v1 -> v0,v0,v1
93     __m128 vXa      = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 1, 0, 0));
94     __m128 vYa      = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 1, 0, 0));
95     __m128 vZa      = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 1, 0, 0));
96     __m128 vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 1, 0, 0));
97 
98     __m128 vLineWidth = _mm_set1_ps(pDC->pState->state.rastState.lineWidth);
99     __m128 vAdjust    = _mm_mul_ps(vLineWidth, vBloat0);
100     if (workDesc.triFlags.yMajor)
101     {
102         vXa = _mm_add_ps(vAdjust, vXa);
103     }
104     else
105     {
106         vYa = _mm_add_ps(vAdjust, vYa);
107     }
108 
109     // Store triangle description for rasterizer
110     _mm_store_ps((float*)&newTriBuffer[0], vXa);
111     _mm_store_ps((float*)&newTriBuffer[4], vYa);
112     _mm_store_ps((float*)&newTriBuffer[8], vZa);
113     _mm_store_ps((float*)&newTriBuffer[12], vRecipWa);
114 
115     // binner bins 3 edges for lines as v0, v1, v1
116     // tri0 needs v0, v0, v1
117     for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
118     {
119         __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]);
120         __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]);
121 
122         _mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib0);
123         _mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib0);
124         _mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib1);
125     }
126 
127     // Store user clip distances for triangle 0
128     float    newClipBuffer[3 * 8];
129     uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
130     if (numClipDist)
131     {
132         newWorkDesc.pUserClipBuffer = newClipBuffer;
133 
134         float* pOldBuffer = workDesc.pUserClipBuffer;
135         float* pNewBuffer = newClipBuffer;
136         for (uint32_t i = 0; i < numClipDist; ++i)
137         {
138             // read barycentric coeffs from binner
139             float a = *(pOldBuffer++);
140             float b = *(pOldBuffer++);
141 
142             // reconstruct original clip distance at vertices
143             float c0 = a + b;
144             float c1 = b;
145 
146             // construct triangle barycentrics
147             *(pNewBuffer++) = c0 - c1;
148             *(pNewBuffer++) = c0 - c1;
149             *(pNewBuffer++) = c1;
150         }
151     }
152 
153     // setup triangle rasterizer function
154     PFN_WORK_FUNC pfnTriRast;
155     // conservative rast not supported for points/lines
156     pfnTriRast = GetRasterizerFunc(rastState.sampleCount,
157                                    rastState.bIsCenterPattern,
158                                    false,
159                                    SWR_INPUT_COVERAGE_NONE,
160                                    EdgeValToEdgeState(ALL_EDGES_VALID),
161                                    (pDC->pState->state.scissorsTileAligned == false));
162 
163     // make sure this macrotile intersects the triangle
164     __m128i vXai = fpToFixedPoint(vXa);
165     __m128i vYai = fpToFixedPoint(vYa);
166     OSALIGNSIMD(SWR_RECT) bboxA;
167     calcBoundingBoxInt(vXai, vYai, bboxA);
168 
169     if (!(bboxA.xmin > macroBoxRight || bboxA.xmin > scissorInFixedPoint.xmax ||
170           bboxA.xmax - 1 < macroBoxLeft || bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
171           bboxA.ymin > macroBoxBottom || bboxA.ymin > scissorInFixedPoint.ymax ||
172           bboxA.ymax - 1 < macroBoxTop || bboxA.ymax - 1 < scissorInFixedPoint.ymin))
173     {
174         // rasterize triangle
175         pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
176     }
177 
178     // triangle 1
179     // v0,v1 -> v1,v1,v0
180     vXa      = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 0, 1, 1));
181     vYa      = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 0, 1, 1));
182     vZa      = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 0, 1, 1));
183     vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 0, 1, 1));
184 
185     vAdjust = _mm_mul_ps(vLineWidth, vBloat1);
186     if (workDesc.triFlags.yMajor)
187     {
188         vXa = _mm_add_ps(vAdjust, vXa);
189     }
190     else
191     {
192         vYa = _mm_add_ps(vAdjust, vYa);
193     }
194 
195     // Store triangle description for rasterizer
196     _mm_store_ps((float*)&newTriBuffer[0], vXa);
197     _mm_store_ps((float*)&newTriBuffer[4], vYa);
198     _mm_store_ps((float*)&newTriBuffer[8], vZa);
199     _mm_store_ps((float*)&newTriBuffer[12], vRecipWa);
200 
201     // binner bins 3 edges for lines as v0, v1, v1
202     // tri1 needs v1, v1, v0
203     for (uint32_t a = 0; a < workDesc.numAttribs; ++a)
204     {
205         __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]);
206         __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]);
207 
208         _mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib1);
209         _mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib1);
210         _mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib0);
211     }
212 
213     // store user clip distance for triangle 1
214     if (numClipDist)
215     {
216         float* pOldBuffer = workDesc.pUserClipBuffer;
217         float* pNewBuffer = newClipBuffer;
218         for (uint32_t i = 0; i < numClipDist; ++i)
219         {
220             // read barycentric coeffs from binner
221             float a = *(pOldBuffer++);
222             float b = *(pOldBuffer++);
223 
224             // reconstruct original clip distance at vertices
225             float c0 = a + b;
226             float c1 = b;
227 
228             // construct triangle barycentrics
229             *(pNewBuffer++) = c1 - c0;
230             *(pNewBuffer++) = c1 - c0;
231             *(pNewBuffer++) = c0;
232         }
233     }
234 
235     vXai = fpToFixedPoint(vXa);
236     vYai = fpToFixedPoint(vYa);
237     calcBoundingBoxInt(vXai, vYai, bboxA);
238 
239     if (!(bboxA.xmin > macroBoxRight || bboxA.xmin > scissorInFixedPoint.xmax ||
240           bboxA.xmax - 1 < macroBoxLeft || bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
241           bboxA.ymin > macroBoxBottom || bboxA.ymin > scissorInFixedPoint.ymax ||
242           bboxA.ymax - 1 < macroBoxTop || bboxA.ymax - 1 < scissorInFixedPoint.ymin))
243     {
244         // rasterize triangle
245         pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
246     }
247 
248     RDTSC_BEGIN(pDC->pContext->pBucketMgr, BERasterizeLine, 1);
249 }
250 
RasterizeSimplePoint(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroTile,void * pData)251 void RasterizeSimplePoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData)
252 {
253 #if KNOB_ENABLE_TOSS_POINTS
254     if (KNOB_TOSS_BIN_TRIS)
255     {
256         return;
257     }
258 #endif
259 
260     const TRIANGLE_WORK_DESC& workDesc     = *(const TRIANGLE_WORK_DESC*)pData;
261     const BACKEND_FUNCS&      backendFuncs = pDC->pState->backendFuncs;
262 
263     // map x,y relative offsets from start of raster tile to bit position in
264     // coverage mask for the point
265     static const uint32_t coverageMap[8][8] = {{0, 1, 4, 5, 8, 9, 12, 13},
266                                                {2, 3, 6, 7, 10, 11, 14, 15},
267                                                {16, 17, 20, 21, 24, 25, 28, 29},
268                                                {18, 19, 22, 23, 26, 27, 30, 31},
269                                                {32, 33, 36, 37, 40, 41, 44, 45},
270                                                {34, 35, 38, 39, 42, 43, 46, 47},
271                                                {48, 49, 52, 53, 56, 57, 60, 61},
272                                                {50, 51, 54, 55, 58, 59, 62, 63}};
273 
274     OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc = {};
275 
276     // pull point information from triangle buffer
277     // @todo use structs for readability
278     uint32_t tileAlignedX = *(uint32_t*)workDesc.pTriBuffer;
279     uint32_t tileAlignedY = *(uint32_t*)(workDesc.pTriBuffer + 1);
280     float    z            = *(workDesc.pTriBuffer + 2);
281 
282     // construct triangle descriptor for point
283     // no interpolation, set up i,j for constant interpolation of z and attribs
284     // @todo implement an optimized backend that doesn't require triangle information
285 
286     // compute coverage mask from x,y packed into the coverageMask flag
287     // mask indices by the maximum valid index for x/y of coveragemap.
288     uint32_t tX = workDesc.triFlags.coverageMask & 0x7;
289     uint32_t tY = (workDesc.triFlags.coverageMask >> 4) & 0x7;
290     for (uint32_t i = 0; i < _countof(triDesc.coverageMask); ++i)
291     {
292         triDesc.coverageMask[i] = 1ULL << coverageMap[tY][tX];
293     }
294     triDesc.anyCoveredSamples = triDesc.coverageMask[0];
295     triDesc.innerCoverageMask = triDesc.coverageMask[0];
296 
297     // no persp divide needed for points
298     triDesc.pAttribs = triDesc.pPerspAttribs = workDesc.pAttribs;
299     triDesc.triFlags                         = workDesc.triFlags;
300     triDesc.recipDet                         = 1.0f;
301     triDesc.OneOverW[0] = triDesc.OneOverW[1] = triDesc.OneOverW[2] = 1.0f;
302     triDesc.I[0] = triDesc.I[1] = triDesc.I[2] = 0.0f;
303     triDesc.J[0] = triDesc.J[1] = triDesc.J[2] = 0.0f;
304     triDesc.Z[0] = triDesc.Z[1] = triDesc.Z[2] = z;
305 
306     RenderOutputBuffers renderBuffers;
307     GetRenderHotTiles(pDC,
308                       workerId,
309                       macroTile,
310                       tileAlignedX >> KNOB_TILE_X_DIM_SHIFT,
311                       tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT,
312                       renderBuffers,
313                       triDesc.triFlags.renderTargetArrayIndex);
314 
315     RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelBackend, pDC->drawId);
316     backendFuncs.pfnBackend(pDC, workerId, tileAlignedX, tileAlignedY, triDesc, renderBuffers);
317     RDTSC_END(pDC->pContext->pBucketMgr, BEPixelBackend, 0);
318 }
319 
RasterizeTriPoint(DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroTile,void * pData)320 void RasterizeTriPoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData)
321 {
322     const TRIANGLE_WORK_DESC& workDesc     = *(const TRIANGLE_WORK_DESC*)pData;
323     const SWR_RASTSTATE&      rastState    = pDC->pState->state.rastState;
324     const SWR_BACKEND_STATE&  backendState = pDC->pState->state.backendState;
325 
326     bool isPointSpriteTexCoordEnabled = backendState.pointSpriteTexCoordMask != 0;
327 
328     // load point vertex
329     float x = *workDesc.pTriBuffer;
330     float y = *(workDesc.pTriBuffer + 1);
331     float z = *(workDesc.pTriBuffer + 2);
332 
333     // create a copy of the triangle buffer to write our adjusted vertices to
334     OSALIGNSIMD(float) newTriBuffer[4 * 4];
335     TRIANGLE_WORK_DESC newWorkDesc = workDesc;
336     newWorkDesc.pTriBuffer         = &newTriBuffer[0];
337 
338     // create a copy of the attrib buffer to write our adjusted attribs to
339     OSALIGNSIMD(float) newAttribBuffer[4 * 3 * SWR_VTX_NUM_SLOTS];
340     newWorkDesc.pAttribs = &newAttribBuffer[0];
341 
342     newWorkDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
343     newWorkDesc.numAttribs      = workDesc.numAttribs;
344     newWorkDesc.triFlags        = workDesc.triFlags;
345 
346     // construct two tris by bloating point by point size
347     float halfPointSize = workDesc.triFlags.pointSize * 0.5f;
348     float lowerX        = x - halfPointSize;
349     float upperX        = x + halfPointSize;
350     float lowerY        = y - halfPointSize;
351     float upperY        = y + halfPointSize;
352 
353     // tri 0
354     float* pBuf = &newTriBuffer[0];
355     *pBuf++     = lowerX;
356     *pBuf++     = lowerX;
357     *pBuf++     = upperX;
358     pBuf++;
359     *pBuf++ = lowerY;
360     *pBuf++ = upperY;
361     *pBuf++ = upperY;
362     pBuf++;
363     _mm_store_ps(pBuf, _mm_set1_ps(z));
364     _mm_store_ps(pBuf += 4, _mm_set1_ps(1.0f));
365 
366     // setup triangle rasterizer function
367     PFN_WORK_FUNC pfnTriRast;
368     // conservative rast not supported for points/lines
369     pfnTriRast = GetRasterizerFunc(rastState.sampleCount,
370                                    rastState.bIsCenterPattern,
371                                    false,
372                                    SWR_INPUT_COVERAGE_NONE,
373                                    EdgeValToEdgeState(ALL_EDGES_VALID),
374                                    (pDC->pState->state.scissorsTileAligned == false));
375 
376     // overwrite texcoords for point sprites
377     if (isPointSpriteTexCoordEnabled)
378     {
379         // copy original attribs
380         memcpy(&newAttribBuffer[0], workDesc.pAttribs, 4 * 3 * workDesc.numAttribs * sizeof(float));
381         newWorkDesc.pAttribs = &newAttribBuffer[0];
382 
383         // overwrite texcoord for point sprites
384         uint32_t texCoordMask   = backendState.pointSpriteTexCoordMask;
385         unsigned long texCoordAttrib = 0;
386 
387         while (_BitScanForward(&texCoordAttrib, texCoordMask))
388         {
389             texCoordMask &= ~(1 << texCoordAttrib);
390             __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib;
391             if (rastState.pointSpriteTopOrigin)
392             {
393                 pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0);
394                 pTexAttrib[1] = _mm_set_ps(1, 0, 1, 0);
395                 pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1);
396             }
397             else
398             {
399                 pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0);
400                 pTexAttrib[1] = _mm_set_ps(1, 0, 0, 0);
401                 pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1);
402             }
403         }
404     }
405     else
406     {
407         // no texcoord overwrite, can reuse the attrib buffer from frontend
408         newWorkDesc.pAttribs = workDesc.pAttribs;
409     }
410 
411     pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
412 
413     // tri 1
414     pBuf    = &newTriBuffer[0];
415     *pBuf++ = lowerX;
416     *pBuf++ = upperX;
417     *pBuf++ = upperX;
418     pBuf++;
419     *pBuf++ = lowerY;
420     *pBuf++ = upperY;
421     *pBuf++ = lowerY;
422     // z, w unchanged
423 
424     if (isPointSpriteTexCoordEnabled)
425     {
426         uint32_t texCoordMask   = backendState.pointSpriteTexCoordMask;
427         unsigned long texCoordAttrib = 0;
428 
429         while (_BitScanForward(&texCoordAttrib, texCoordMask))
430         {
431             texCoordMask &= ~(1 << texCoordAttrib);
432             __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib;
433             if (rastState.pointSpriteTopOrigin)
434             {
435                 pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0);
436                 pTexAttrib[1] = _mm_set_ps(1, 0, 1, 1);
437                 pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1);
438             }
439             else
440             {
441                 pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0);
442                 pTexAttrib[1] = _mm_set_ps(1, 0, 0, 1);
443                 pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1);
444             }
445         }
446     }
447 
448     pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
449 }
450 
//////////////////////////////////////////////////////////////////////////
/// @brief Forwards to InitRasterizerFuncs().
///        NOTE(review): InitRasterizerFuncs() is not defined in this file;
///        it presumably populates gRasterizerFuncs - confirm.
void InitRasterizerFunctions()
{
    InitRasterizerFuncs();
}
455 
456 // Selector for correct templated RasterizeTriangle function
GetRasterizerFunc(SWR_MULTISAMPLE_COUNT numSamples,bool IsCenter,bool IsConservative,SWR_INPUT_COVERAGE InputCoverage,uint32_t EdgeEnable,bool RasterizeScissorEdges)457 PFN_WORK_FUNC GetRasterizerFunc(SWR_MULTISAMPLE_COUNT numSamples,
458                                 bool                  IsCenter,
459                                 bool                  IsConservative,
460                                 SWR_INPUT_COVERAGE    InputCoverage,
461                                 uint32_t              EdgeEnable,
462                                 bool                  RasterizeScissorEdges)
463 {
464     SWR_ASSERT(numSamples >= 0 && numSamples < SWR_MULTISAMPLE_TYPE_COUNT);
465     SWR_ASSERT(InputCoverage >= 0 && InputCoverage < SWR_INPUT_COVERAGE_COUNT);
466     SWR_ASSERT(EdgeEnable < STATE_VALID_TRI_EDGE_COUNT);
467 
468     PFN_WORK_FUNC func = gRasterizerFuncs[numSamples][IsCenter][IsConservative][InputCoverage]
469                                          [EdgeEnable][RasterizeScissorEdges];
470     SWR_ASSERT(func);
471 
472     return func;
473 }
474