1 /****************************************************************************
2  * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * @file context.h
24  *
25  * @brief Definitions for SWR_CONTEXT and DRAW_CONTEXT
26  *        The SWR_CONTEXT is our global context and contains the DC ring,
27  *        thread state, etc.
28  *
29  *        The DRAW_CONTEXT contains all state associated with a draw operation.
30  *
31  ******************************************************************************/
32 #pragma once
33 
34 #include <condition_variable>
35 #include <algorithm>
36 
37 #include "core/api.h"
38 #include "core/utils.h"
39 #include "core/arena.h"
40 #include "core/fifo.hpp"
41 #include "core/knobs.h"
42 #include "common/intrin.h"
43 #include "common/rdtsc_buckets.h"
44 #include "core/threads.h"
45 #include "ringbuffer.h"
46 #include "archrast/archrast.h"
47 
// Fixed-point conversion constants. In each pair the *_SCALE value equals
// 1 << *_SHIFT (256 == 1 << 8, 65536 == 1 << 16); keep the two in sync.
48 // x.8 fixed point precision values
49 #define FIXED_POINT_SHIFT 8
50 #define FIXED_POINT_SCALE 256
51
52 // x.16 fixed point precision values
53 #define FIXED_POINT16_SHIFT 16
54 #define FIXED_POINT16_SCALE 65536
55 
56 struct SWR_CONTEXT;
57 struct DRAW_CONTEXT;
58 
// Per-triangle flags and indices. Used as the `triFlags` member of both
// SWR_TRIANGLE_DESC and TRIANGLE_WORK_DESC.
// The four bitfield widths are declared so that they sum to 32:
//   1 + 1 + (X_DIM * Y_DIM) + (32 - 1 - 1 - X_DIM * Y_DIM).
59 struct TRI_FLAGS
60 {
61     uint32_t frontFacing : 1;
62     uint32_t yMajor : 1;
       // Width is SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM bits (presumably one bit per
       // pixel of a SIMD tile - confirm against the rasterizer).
63     uint32_t coverageMask : (SIMD_TILE_X_DIM* SIMD_TILE_Y_DIM);
       // Remaining bits so the bitfields total 32.
64     uint32_t reserved : 32 - 1 - 1 - (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM);
65     float    pointSize;
66     uint32_t renderTargetArrayIndex;
67     uint32_t viewportIndex;
68 };
69 
70 //////////////////////////////////////////////////////////////////////////
71 /// SWR_TRIANGLE_DESC
///   Per-triangle data handed by reference to a PFN_BACKEND_FUNC (see the
///   typedef later in this file).
72 /////////////////////////////////////////////////////////////////////////
73 struct SWR_TRIANGLE_DESC
74 {
       // Three coefficients each for I, J, Z and 1/W, plus a reciprocal
       // determinant. NOTE(review): these look like the scalar counterparts of
       // the SIMD values in BarycentricCoeffs - confirm against the backend.
75     float I[3];
76     float J[3];
77     float Z[3];
78     float OneOverW[3];
79     float recipDet;
80
       // Pointers to float data; ownership and lifetime are not visible in this header.
81     float* pRecipW;
82     float* pAttribs;
83     float* pPerspAttribs;
84     float* pSamplePos;
85     float* pUserClipBuffer;
86
       // One 64-bit mask per entry, SWR_MAX_NUM_MULTISAMPLES entries.
87     uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES];
88     uint64_t innerCoverageMask; // Conservative rasterization inner coverage: marked covered if
89                                 // entire pixel is covered
90     uint64_t anyCoveredSamples;
91
92     TRI_FLAGS triFlags;
93 };
94 
// Triangle payload of a backend work item; stored as BE_WORK::desc.tri.
95 struct TRIANGLE_WORK_DESC
96 {
97     float* pTriBuffer;
98     float* pAttribs;
99     float* pUserClipBuffer;
100     uint32_t  numAttribs;
101     TRI_FLAGS triFlags;
102 };
103 
// Parameters of a clear operation. Appears as the `clear` member of the desc
// union in both BE_WORK and FE_WORK.
104 struct CLEAR_DESC
105 {
106     SWR_RECT rect;
107     uint32_t attachmentMask;
108     uint32_t renderTargetArrayIndex;
109     float    clearRTColor[4]; // RGBA_32F
110     float    clearDepth;      // [0..1]
111     uint8_t  clearStencil;
112 };
113 
// Parameters of a discard/invalidate-tiles operation. Appears as the
// `discardInvalidateTiles` member of the desc union in both BE_WORK and FE_WORK.
114 struct DISCARD_INVALIDATE_TILES_DESC
115 {
116     uint32_t       attachmentMask;
117     SWR_RECT       rect;
118     SWR_TILE_STATE newTileState;
119     bool           createNewTiles;
120     bool           fullTilesOnly;
121 };
122 
// A callback plus three 64-bit user-data values. Appears as the `sync` member
// of the desc union in BE_WORK and FE_WORK, and as DRAW_CONTEXT::retireCallback.
// NOTE(review): the user-data values are presumably forwarded to the callback;
// the PFN_CALLBACK_FUNC signature is declared outside this file - confirm there.
123 struct SYNC_DESC
124 {
125     PFN_CALLBACK_FUNC pfnCallbackFunc;
126     uint64_t          userData;
127     uint64_t          userData2;
128     uint64_t          userData3;
129 };
130 
// Parameters of a store-tiles operation. Appears as the `storeTiles` member of
// the desc union in both BE_WORK and FE_WORK.
131 struct STORE_TILES_DESC
132 {
133     uint32_t       attachmentMask;
134     SWR_TILE_STATE postStoreTileState;
135     SWR_RECT       rect;
136 };
137 
// Thread-group counts along X/Y/Z for a compute dispatch, plus a flag for
// thread dispatch. This type is not referenced elsewhere in this header.
138 struct COMPUTE_DESC
139 {
140     uint32_t threadGroupCountX;
141     uint32_t threadGroupCountY;
142     uint32_t threadGroupCountZ;
143     bool     enableThreadDispatch;
144 };
145 
// Signature of a backend work function; this is the type of BE_WORK::pfnWork.
// pDesc is an untyped pointer to the work descriptor.
146 typedef void (*PFN_WORK_FUNC)(DRAW_CONTEXT* pDC,
147                               uint32_t      workerId,
148                               uint32_t      macroTile,
149                               void*         pDesc);
150 
// Kind of a work item; stored in BE_WORK::type and FE_WORK::type.
// Unscoped enum with implicit values starting at 0 (SYNC == 0 ... SHUTDOWN == 5).
151 enum WORK_TYPE
152 {
153     SYNC,
154     DRAW,
155     CLEAR,
156     DISCARDINVALIDATETILES,
157     STORETILES,
158     SHUTDOWN,
159 };
160 
// Backend work item: a WORK_TYPE tag, the function to run, and a union of
// per-type descriptors. Declared with OSALIGNSIMD alignment.
// NOTE(review): `type` presumably selects the active member of `desc`; nothing
// in this header enforces that - confirm at the producers/consumers.
OSALIGNSIMD(struct)161 OSALIGNSIMD(struct) BE_WORK
162 {
163     WORK_TYPE     type;
164     PFN_WORK_FUNC pfnWork;
165     union
166     {
167         SYNC_DESC                     sync;
168         TRIANGLE_WORK_DESC            tri;
169         CLEAR_DESC                    clear;
170         DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
171         STORE_TILES_DESC              storeTiles;
172     } desc;
173 };
174 
// Parameters of one draw batch; stored as FE_WORK::desc.draw.
// The two anonymous unions overlay the indexed-draw and non-indexed-draw
// variants of the same slot, so only one name of each pair is meaningful for
// a given draw.
175 struct DRAW_WORK
176 {
177     DRAW_CONTEXT* pDC;
178     union
179     {
180         uint32_t numIndices; // DrawIndexed: Number of indices for draw.
181         uint32_t numVerts;   // Draw: Number of verts (triangles, lines, etc)
182     };
183     union
184     {
185         gfxptr_t xpIB;        // DrawIndexed: App supplied int32 indices
186         uint32_t startVertex; // Draw: Starting vertex in VB to render from.
187     };
188     int32_t  baseVertex;
189     uint32_t numInstances;  // Number of instances
190     uint32_t startInstance; // Instance offset
191     uint32_t startPrimID;   // starting primitiveID for this draw batch
192     uint32_t
193                startVertexID; // starting VertexID for this draw batch (only needed for non-indexed draws)
194     SWR_FORMAT type;          // index buffer type
195 };
196 
// Signature of a frontend work function; this is the type of FE_WORK::pfnWork.
// Unlike PFN_WORK_FUNC it takes the SWR_CONTEXT and has no macroTile argument.
197 typedef void (*PFN_FE_WORK_FUNC)(SWR_CONTEXT*  pContext,
198                                  DRAW_CONTEXT* pDC,
199                                  uint32_t      workerId,
200                                  void*         pDesc);
// Frontend work item: a WORK_TYPE tag, the function to run, and a union of
// per-type descriptors. Each DRAW_CONTEXT embeds one (DRAW_CONTEXT::FeWork).
// Same shape as BE_WORK except that the union carries DRAW_WORK (`draw`)
// where BE_WORK carries TRIANGLE_WORK_DESC (`tri`).
201 struct FE_WORK
202 {
203     WORK_TYPE        type;
204     PFN_FE_WORK_FUNC pfnWork;
205     union
206     {
207         SYNC_DESC                     sync;
208         DRAW_WORK                     draw;
209         CLEAR_DESC                    clear;
210         DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
211         STORE_TILES_DESC              storeTiles;
212     } desc;
213 };
214 
// Guardband extents, stored as four parallel arrays with
// KNOB_NUM_VIEWPORTS_SCISSORS entries each. Held in API_STATE::gbState.
215 struct GUARDBANDS
216 {
217     float left[KNOB_NUM_VIEWPORTS_SCISSORS];
218     float right[KNOB_NUM_VIEWPORTS_SCISSORS];
219     float top[KNOB_NUM_VIEWPORTS_SCISSORS];
220     float bottom[KNOB_NUM_VIEWPORTS_SCISSORS];
221 };
222 
223 struct PA_STATE;
224 
225 // function signature for pipeline stages that execute after primitive assembly
// (type of DRAW_STATE::pfnProcessPrims; operates on simdvector / simdscalari data)
226 typedef void (*PFN_PROCESS_PRIMS)(DRAW_CONTEXT*      pDC,
227                                   PA_STATE&          pa,
228                                   uint32_t           workerId,
229                                   simdvector         prims[],
230                                   uint32_t           primMask,
231                                   simdscalari const& primID,
232                                   simdscalari const& viewportIdx,
233                                   simdscalari const& rtIdx);
234
235 // SIMD16 variant of the post-primitive-assembly stage signature: same parameter list as
// PFN_PROCESS_PRIMS but with simd16vector / simd16scalari types and the SIMDCALL calling
// convention. Type of DRAW_STATE::pfnProcessPrims_simd16 (present when USE_SIMD16_FRONTEND).
236 typedef void(SIMDCALL* PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT*        pDC,
237                                                  PA_STATE&            pa,
238                                                  uint32_t             workerId,
239                                                  simd16vector         prims[],
240                                                  uint32_t             primMask,
241                                                  simd16scalari const& primID,
242                                                  simd16scalari const& viewportIdx,
243                                                  simd16scalari const& rtIdx);
244 
// Pipeline state for a draw, grouped by stage. Stored by value in
// DRAW_STATE::state and exposed to the core as a const reference through
// GetApiState(). Declared with OSALIGNLINE alignment.
OSALIGNLINE(struct)245 OSALIGNLINE(struct) API_STATE
246 {
247     // Vertex Buffers
248     SWR_VERTEX_BUFFER_STATE vertexBuffers[KNOB_NUM_STREAMS];
249
250     // GS - Geometry Shader State
251     SWR_GS_STATE gsState;
252     PFN_GS_FUNC  pfnGsFunc;
253
254     // FS - Fetch Shader State
255     PFN_FETCH_FUNC pfnFetchFunc;
256
257     // VS - Vertex Shader State
258     PFN_VERTEX_FUNC pfnVertexFunc;
259
260     // Index Buffer
261     SWR_INDEX_BUFFER_STATE indexBuffer;
262
263     // CS - Compute Shader
264     PFN_CS_FUNC pfnCsFunc;
265     uint32_t    totalThreadsInGroup;
266     uint32_t    totalSpillFillSize;
267     uint32_t    scratchSpaceSizePerWarp;
268     uint32_t    scratchSpaceNumWarps;
269
270     // FE - Frontend State
271     SWR_FRONTEND_STATE frontendState;
272
273     // SOS - Streamout Shader State
274     PFN_SO_FUNC pfnSoFunc[MAX_SO_STREAMS];
275
276     // Streamout state
277     SWR_STREAMOUT_STATE          soState;
        // `mutable` so these remain writable through the const API_STATE&
        // returned by GetApiState().
278     mutable SWR_STREAMOUT_BUFFER soBuffer[MAX_SO_STREAMS];
279     mutable SWR_STREAMOUT_BUFFER soPausedBuffer[MAX_SO_STREAMS];
280
281     // Tessellation State
282     PFN_HS_FUNC  pfnHsFunc;
283     PFN_DS_FUNC  pfnDsFunc;
284     SWR_TS_STATE tsState;
285
286     // Number of attributes used by the frontend (vs, so, gs)
287     uint32_t feNumAttributes;
288
289     // RS - Rasterizer State
290     SWR_RASTSTATE rastState;
291     // floating point multisample offsets
        // (two floats per sample: SWR_MAX_NUM_MULTISAMPLES * 2 entries)
292     float samplePos[SWR_MAX_NUM_MULTISAMPLES * 2];
293
294     GUARDBANDS gbState;
295
296     SWR_VIEWPORT          vp[KNOB_NUM_VIEWPORTS_SCISSORS];
297     SWR_VIEWPORT_MATRICES vpMatrices;
298
299     SWR_RECT scissorRects[KNOB_NUM_VIEWPORTS_SCISSORS];
300     SWR_RECT scissorsInFixedPoint[KNOB_NUM_VIEWPORTS_SCISSORS];
301     bool     scissorsTileAligned;
302
303     bool               forceFront;
304     PRIMITIVE_TOPOLOGY topology;
305
306
307     // Backend state
308     OSALIGNLINE(SWR_BACKEND_STATE) backendState;
309
310     SWR_DEPTH_BOUNDS_STATE depthBoundsState;
311
312     // PS - Pixel shader state
313     SWR_PS_STATE psState;
314
315     SWR_DEPTH_STENCIL_STATE depthStencilState;
316
317     // OM - Output Merger State
318     SWR_BLEND_STATE    blendState;
319     PFN_BLEND_JIT_FUNC pfnBlendFunc[SWR_NUM_RENDERTARGETS];
320
        // Anonymous struct: its bitfields are accessed directly as members of
        // API_STATE (e.g. the UPDATE_STAT_BE/FE macros read enableStatsBE/FE).
        // NOTE(review): colorHottileEnable is 8 bits wide - assumes
        // SWR_NUM_RENDERTARGETS <= 8; confirm.
321     struct
322     {
323         uint32_t enableStatsFE : 1;        // Enable frontend pipeline stats
324         uint32_t enableStatsBE : 1;        // Enable backend pipeline stats
325         uint32_t colorHottileEnable : 8;   // Bitmask of enabled color hottiles
326         uint32_t depthHottileEnable : 1;   // Enable depth buffer hottile
327         uint32_t stencilHottileEnable : 1; // Enable stencil buffer hottile
328     };
329
330     PFN_QUANTIZE_DEPTH pfnQuantizeDepth;
331 };
332 
333 class MacroTileMgr;
334 class DispatchQueue;
335 class HOTTILE;
336 
// Raw byte pointers and HOTTILE pointers for each color render target plus
// depth and stencil. Passed by reference to a PFN_BACKEND_FUNC.
// NOTE(review): the p* byte pointers presumably point into the matching
// HOTTILE's storage - confirm where this struct is filled in.
337 struct RenderOutputBuffers
338 {
339     uint8_t* pColor[SWR_NUM_RENDERTARGETS];
340     uint8_t* pDepth;
341     uint8_t* pStencil;
342
343     HOTTILE* pColorHotTile[SWR_NUM_RENDERTARGETS];
344     HOTTILE* pDepthHotTile;
345     HOTTILE* pStencilHotTile;
346 };
347 
348 // Plane equation A/B/C coeffs used to evaluate I/J barycentric coords
// Every member is a simdscalar. Passed by const reference to the
// PFN_CALC_*_BARYCENTRICS function types declared below.
349 struct BarycentricCoeffs
350 {
        // I plane equation: a, b, c
351     simdscalar vIa;
352     simdscalar vIb;
353     simdscalar vIc;
354
        // J plane equation: a, b, c
355     simdscalar vJa;
356     simdscalar vJb;
357     simdscalar vJc;
358
        // Z plane equation: a, b, c
359     simdscalar vZa;
360     simdscalar vZb;
361     simdscalar vZc;
362
363     simdscalar vRecipDet;
364
        // 1/W terms A, B, C (NOTE(review): presumably one per triangle vertex - confirm)
365     simdscalar vAOneOverW;
366     simdscalar vBOneOverW;
367     simdscalar vCOneOverW;
368 };
369 
370 // pipeline function pointer types

// Backend entry point (type of BACKEND_FUNCS::pfnBackend). The three uint32_t
// parameters are unnamed here; their meaning must be taken from the implementations.
371 typedef void (*PFN_BACKEND_FUNC)(
372     DRAW_CONTEXT*, uint32_t, uint32_t, uint32_t, SWR_TRIANGLE_DESC&, RenderOutputBuffers&);
// Output merger: takes the PS context, a reference to an array of
// SWR_NUM_RENDERTARGETS byte pointers, the blend state and a reference to an
// array of SWR_NUM_RENDERTARGETS blend functions.
373 typedef void (*PFN_OUTPUT_MERGER)(SWR_PS_CONTEXT&,
374                                   uint8_t* (&)[SWR_NUM_RENDERTARGETS],
375                                   uint32_t,
376                                   const SWR_BLEND_STATE*,
377                                   const PFN_BLEND_JIT_FUNC (&)[SWR_NUM_RENDERTARGETS],
378                                   simdscalar&,
379                                   simdscalar const&);
// Barycentric evaluators: read BarycentricCoeffs, take a mutable SWR_PS_CONTEXT.
// The pixel and sample variants share the same signature.
380 typedef void (*PFN_CALC_PIXEL_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&);
381 typedef void (*PFN_CALC_SAMPLE_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&);
382 typedef void (*PFN_CALC_CENTROID_BARYCENTRICS)(const BarycentricCoeffs&,
383                                                SWR_PS_CONTEXT&,
384                                                const uint64_t* const,
385                                                const uint32_t,
386                                                simdscalar const&,
387                                                simdscalar const&);
388 
// Holder for the backend function pointer selected for a draw; stored in
// DRAW_STATE::backendFuncs.
389 struct BACKEND_FUNCS
390 {
391     PFN_BACKEND_FUNC pfnBackend;
392 };
393 
394 // Draw State
//    Holds the API_STATE plus the pipeline function pointers chosen for it.
//    Entries live in SWR_CONTEXT::dsRing and are referenced from
//    DRAW_CONTEXT::pState, so several draw contexts can share one entry.
395 struct DRAW_STATE
396 {
397     API_STATE state;
398
399     void* pPrivateState; // It is required that the driver sets this up for each draw.
400
401     // pipeline function pointers, filled in by API thread when setting up the draw
402     BACKEND_FUNCS     backendFuncs;
403     PFN_PROCESS_PRIMS pfnProcessPrims;
404 #if USE_SIMD16_FRONTEND
405     PFN_PROCESS_PRIMS_SIMD16 pfnProcessPrims_simd16;
406 #endif
407
408     CachingArena* pArena; // This should only be used by API thread.
409 };
410 
411 struct DRAW_DYNAMIC_STATE
412 {
ResetDRAW_DYNAMIC_STATE413     void Reset(uint32_t numThreads)
414     {
415         SWR_STATS* pSavePtr = pStats;
416         memset(this, 0, sizeof(*this));
417         pStats = pSavePtr;
418         memset(pStats, 0, sizeof(SWR_STATS) * numThreads);
419     }
420     ///@todo Currently assumes only a single FE can do stream output for a draw.
421     uint32_t SoWriteOffset[4];
422     bool     SoWriteOffsetDirty[4];
423 
424     SWR_STATS_FE statsFE; // Only one FE thread per DC.
425     SWR_STATS*   pStats;
426     uint64_t     soPrims; // number of primitives written to StremOut buffer
427 };
428 
429 // Draw Context
430 //    The api thread sets up a draw context that exists for the life of the draw.
431 //    This draw context maintains all of the state needed for the draw operation.
//    Entries live in SWR_CONTEXT::dcRing.
432 struct DRAW_CONTEXT
433 {
434     SWR_CONTEXT* pContext;
        // Overlaid pointers: which one is valid depends on isCompute (see comment
        // on pDispatch).
435     union
436     {
437         MacroTileMgr*  pTileMgr;
438         DispatchQueue* pDispatch; // Queue for thread groups. (isCompute)
439     };
440     DRAW_STATE*   pState; // Read-only state. Core should not update this outside of API thread.
441     CachingArena* pArena;
442
443     uint32_t drawId;
444     bool     dependentFE;  // Frontend work is dependent on all previous FE
445     bool     dependent;    // Backend work is dependent on all previous BE
446     bool     isCompute;    // Is this DC a compute context?
447     bool     cleanupState; // True if this is the last draw using an entry in the state ring.
448
449     FE_WORK FeWork;
450
451     SYNC_DESC retireCallback; // Call this func when this DC is retired.
452
453     DRAW_DYNAMIC_STATE dynState;
454
        // Each of the following members is individually OSALIGNLINE-aligned and
        // volatile. NOTE(review): presumably so that flags touched by different
        // threads do not share a cache line - confirm OSALIGNLINE's definition.
455     volatile OSALIGNLINE(bool) doneFE; // Is FE work done for this draw?
456     volatile OSALIGNLINE(uint32_t) FeLock;
457     volatile OSALIGNLINE(uint32_t) threadsDone;
458 };
459
// sizeof(DRAW_CONTEXT) must be a multiple of 64 bytes.
460 static_assert((sizeof(DRAW_CONTEXT) & 63) == 0, "Invalid size for DRAW_CONTEXT");
461 
GetApiState(const DRAW_CONTEXT * pDC)462 INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC)
463 {
464     SWR_ASSERT(pDC != nullptr);
465     SWR_ASSERT(pDC->pState != nullptr);
466 
467     return pDC->pState->state;
468 }
469 
GetPrivateState(const DRAW_CONTEXT * pDC)470 INLINE void* GetPrivateState(const DRAW_CONTEXT* pDC)
471 {
472     SWR_ASSERT(pDC != nullptr);
473     SWR_ASSERT(pDC->pState != nullptr);
474 
475     return pDC->pState->pPrivateState;
476 }
477 
478 class HotTileMgr;
479 
// Global rasterizer context: the draw-context and draw-state rings, worker
// thread bookkeeping, driver callbacks and shared resources.
480 struct SWR_CONTEXT
481 {
482     // Draw Context Ring
483     //  Each draw needs its own state in order to support multiple draws in flight across multiple
484     //  threads. We maintain N draw contexts configured as a ring. The size of the ring limits the
485     //  maximum number of draws that can be in flight at any given time.
486     //
487     //  Description:
488     //  1. State - When an application first sets state we'll request a new draw context to use.
489     //     a. If there are no available draw contexts then we'll have to wait until one becomes
490     //        free.
        //     b. If one is available then set pCurDrawContext to point to it and mark it in use.
491     //     c. All state calls set state on pCurDrawContext.
492     //  2. Draw - Creates and submits a work item that is associated with the current draw context.
493     //     a. Set pPrevDrawContext = pCurDrawContext
494     //     b. Set pCurDrawContext to NULL.
495     //  3. State - When an application sets state after a draw
496     //     a. Same as step 1.
497     //     b. State is copied from prev draw context to current.
498     RingBuffer<DRAW_CONTEXT> dcRing;
499
500     DRAW_CONTEXT* pCurDrawContext;  // This points to DC entry in ring for an unsubmitted draw.
501     DRAW_CONTEXT* pPrevDrawContext; // This points to DC entry for the previous context submitted
502                                     // that we can copy state from.
503
504     MacroTileMgr*  pMacroTileManagerArray;
505     DispatchQueue* pDispatchQueueArray;
506
507     // Draw State Ring
508     //  When draws are very large (lots of primitives) then the API thread will break these up.
509     //  These split draws all have identical state. So instead of storing the state directly
510     //  in the Draw Context (DC) we instead store it in a Draw State (DS). This allows multiple DCs
511     //  to reference a single entry in the DS ring.
512     RingBuffer<DRAW_STATE> dsRing;
513
514     uint32_t curStateId; // Current index to the next available entry in the DS ring.
515
516     uint32_t NumWorkerThreads;
517     uint32_t NumFEThreads;
518     uint32_t NumBEThreads;
519
520     THREAD_POOL              threadPool; // Thread pool associated with this context
521     SWR_THREADING_INFO       threadInfo;
522     SWR_API_THREADING_INFO   apiThreadInfo;
523     SWR_WORKER_PRIVATE_STATE workerPrivateState;
524
        // Despite the constant-style name this is a per-context data member.
525     uint32_t MAX_DRAWS_IN_FLIGHT;
526
527     std::condition_variable FifosNotEmpty;
528     std::mutex              WaitLock;
529
530     uint32_t privateStateSize;
531
532     HotTileMgr* pHotTileMgr;
533
534     // Callback functions, passed in at create context time
535     PFN_LOAD_TILE                  pfnLoadTile;
536     PFN_STORE_TILE                 pfnStoreTile;
537     PFN_TRANSLATE_GFXPTR_FOR_READ  pfnTranslateGfxptrForRead;
538     PFN_TRANSLATE_GFXPTR_FOR_WRITE pfnTranslateGfxptrForWrite;
539     PFN_MAKE_GFXPTR                pfnMakeGfxPtr;
540     PFN_CREATE_MEMORY_CONTEXT      pfnCreateMemoryContext;
541     PFN_DESTROY_MEMORY_CONTEXT     pfnDestroyMemoryContext;
542     PFN_UPDATE_SO_WRITE_OFFSET     pfnUpdateSoWriteOffset;
543     PFN_UPDATE_STATS               pfnUpdateStats;
544     PFN_UPDATE_STATS_FE            pfnUpdateStatsFE;
545     PFN_UPDATE_STREAMOUT           pfnUpdateStreamOut;
546
547
548     // Global Stats
549     SWR_STATS* pStats;
550
551     // Scratch space for workers.
552     uint8_t** ppScratch;
553
554     volatile OSALIGNLINE(uint32_t) drawsOutstandingFE;
555
556     OSALIGNLINE(CachingAllocator) cachingArenaAllocator;
557     uint32_t frameCount;
558
559     uint32_t lastFrameChecked;
560     uint64_t lastDrawChecked;
561     TileSet* pSingleThreadLockedTiles;
562
563     // ArchRast thread contexts.
        // Indexed by worker id (AR_WORKER_CTX) and by NumWorkerThreads for the
        // API thread (AR_API_CTX).
564     HANDLE* pArContext;
565
566     // handle to external memory, used by workers to create memory contexts
567     HANDLE hExternalMemory;
568
569     BucketManager *pBucketMgr;
570 };
571 
// Pipeline statistics helpers.
//
// UPDATE_STAT_BE(name, count): if backend stats are enabled for the current
//   draw, add `count` to field `name` of this worker's SWR_STATS entry.
//   Requires `pDC` (DRAW_CONTEXT*) and `workerId` in scope at the use site.
// UPDATE_STAT_FE(name, count): if frontend stats are enabled for the current
//   draw, add `count` to field `name` of the draw's SWR_STATS_FE.
//   Requires `pDC` (DRAW_CONTEXT*) in scope at the use site.
//
// Each macro expands to a single statement (do { ... } while (0)) so it is safe
// as the body of an unbraced if/else; terminate every use with a semicolon.
// `count` is evaluated only when the corresponding stats flag is set.
#define UPDATE_STAT_BE(name, count)                         \
    do                                                      \
    {                                                       \
        if (GetApiState(pDC).enableStatsBE)                 \
        {                                                   \
            pDC->dynState.pStats[workerId].name += (count); \
        }                                                   \
    } while (0)
#define UPDATE_STAT_FE(name, count)                \
    do                                             \
    {                                              \
        if (GetApiState(pDC).enableStatsFE)        \
        {                                          \
            pDC->dynState.statsFE.name += (count); \
        }                                          \
    } while (0)
582 
583 // ArchRast instrumentation framework
// AR_WORKER_CTX selects the ArchRast context slot for the current worker; it
// needs `pDC` and `workerId` in scope at the use site.
// AR_API_CTX selects the slot at index NumWorkerThreads.
// NOTE(review): AR_API_CTX reads the array through `pDC` but the index through
// `pContext`, so BOTH names must be in scope where it is used - confirm this
// mix is intentional.
584 #define AR_WORKER_CTX pDC->pContext->pArContext[workerId]
585 #define AR_API_CTX pDC->pContext->pArContext[pContext->NumWorkerThreads]
586
// RDTSC bucket timing. With KNOB_ENABLE_RDTSC defined these forward to
// RDTSC_START / RDTSC_STOP (RDTSC_END passes 0 as the last argument);
// otherwise they expand to nothing. The `drawid` argument of RDTSC_BEGIN is
// not used by either variant.
587 #ifdef KNOB_ENABLE_RDTSC
588 #define RDTSC_BEGIN(pBucketMgr, type, drawid) RDTSC_START(pBucketMgr, type)
589 #define RDTSC_END(pBucketMgr, type, count) RDTSC_STOP(pBucketMgr, type, count, 0)
590 #else
591 #define RDTSC_BEGIN(pBucketMgr, type, drawid)
592 #define RDTSC_END(pBucketMgr, type, count)
593 #endif
594
// ArchRast event plumbing. With KNOB_ENABLE_AR defined, _AR_EVENT dispatches
// ArchRast::<event> on the given context and _AR_FLUSH calls
// ArchRast::FlushDraw; otherwise both expand to nothing, so their arguments
// are not evaluated.
595 #ifdef KNOB_ENABLE_AR
596 #define _AR_EVENT(ctx, event) ArchRast::Dispatch(ctx, ArchRast::event)
597 #define _AR_FLUSH(ctx, id) ArchRast::FlushDraw(ctx, id)
598 #else
599 #define _AR_EVENT(ctx, event)
600 #define _AR_FLUSH(ctx, id)
601 #endif
602
603 // Use these macros for api thread.
604 #define AR_API_EVENT(event) _AR_EVENT(AR_API_CTX, event)
605
606 // Use these macros for worker threads.
607 #define AR_EVENT(event) _AR_EVENT(AR_WORKER_CTX, event)
608 #define AR_FLUSH(id) _AR_FLUSH(AR_WORKER_CTX, id)
609