1 /****************************************************************************
2  * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * @file tilemgr.cpp
24  *
25  * @brief Implementation for Macro Tile Manager which provides the facilities
26  *        for threads to work on an macro tile.
27  *
28  ******************************************************************************/
29 #include <unordered_map>
30 
31 #include "fifo.hpp"
32 #include "core/tilemgr.h"
33 #include "core/multisample.h"
34 #include "rdtsc_core.h"
35 
MacroTileMgr(CachingArena & arena)36 MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena) {}
37 
enqueue(uint32_t x,uint32_t y,BE_WORK * pWork)38 void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK* pWork)
39 {
40     // Should not enqueue more then what we have backing for in the hot tile manager.
41     SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
42     SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
43 
44     if ((x & ~(KNOB_NUM_HOT_TILES_X - 1)) | (y & ~(KNOB_NUM_HOT_TILES_Y - 1)))
45     {
46         return;
47     }
48 
49     uint32_t id = getTileId(x, y);
50 
51     if (id >= mTiles.size())
52     {
53         mTiles.resize((16 + id) * 2);
54     }
55 
56     MacroTileQueue* pTile = mTiles[id];
57     if (!pTile)
58     {
59         pTile = mTiles[id] = new MacroTileQueue();
60     }
61     pTile->mWorkItemsFE++;
62     pTile->mId = id;
63 
64     if (pTile->mWorkItemsFE == 1)
65     {
66         pTile->clear(mArena);
67         mDirtyTiles.push_back(pTile);
68     }
69 
70     mWorkItemsProduced++;
71     pTile->enqueue_try_nosync(mArena, pWork);
72 }
73 
markTileComplete(uint32_t id)74 void MacroTileMgr::markTileComplete(uint32_t id)
75 {
76     SWR_ASSERT(mTiles.size() > id);
77     MacroTileQueue& tile     = *mTiles[id];
78     uint32_t        numTiles = tile.mWorkItemsFE;
79     InterlockedExchangeAdd(&mWorkItemsConsumed, numTiles);
80 
81     _ReadWriteBarrier();
82     tile.mWorkItemsBE += numTiles;
83     SWR_ASSERT(tile.mWorkItemsFE == tile.mWorkItemsBE);
84 
85     // clear out tile, but defer fifo clear until the next DC first queues to it.
86     // this prevents worker threads from constantly locking a completed macro tile
87     tile.mWorkItemsFE = 0;
88     tile.mWorkItemsBE = 0;
89 }
90 
GetHotTile(SWR_CONTEXT * pContext,DRAW_CONTEXT * pDC,HANDLE hWorkerPrivateData,uint32_t macroID,SWR_RENDERTARGET_ATTACHMENT attachment,bool create,uint32_t numSamples,uint32_t renderTargetArrayIndex)91 HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT*                pContext,
92                                 DRAW_CONTEXT*               pDC,
93                                 HANDLE                      hWorkerPrivateData,
94                                 uint32_t                    macroID,
95                                 SWR_RENDERTARGET_ATTACHMENT attachment,
96                                 bool                        create,
97                                 uint32_t                    numSamples,
98                                 uint32_t                    renderTargetArrayIndex)
99 {
100     uint32_t x, y;
101     MacroTileMgr::getTileIndices(macroID, x, y);
102 
103     SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
104     SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
105 
106     HotTileSet& tile    = mHotTiles[x][y];
107     HOTTILE&    hotTile = tile.Attachment[attachment];
108     if (hotTile.pBuffer == NULL)
109     {
110         if (create)
111         {
112             uint32_t size     = numSamples * mHotTileSize[attachment];
113             uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
114             hotTile.pBuffer =
115                 (uint8_t*)AllocHotTileMem(size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE);
116             hotTile.state                  = HOTTILE_INVALID;
117             hotTile.numSamples             = numSamples;
118             hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
119         }
120         else
121         {
122             return NULL;
123         }
124     }
125     else
126     {
127         // free the old tile and create a new one with enough space to hold all samples
128         if (numSamples > hotTile.numSamples)
129         {
130             // tile should be either uninitialized or resolved if we're deleting and switching to a
131             // new sample count
132             SWR_ASSERT((hotTile.state == HOTTILE_INVALID) || (hotTile.state == HOTTILE_RESOLVED) ||
133                        (hotTile.state == HOTTILE_CLEAR));
134             FreeHotTileMem(hotTile.pBuffer);
135 
136             uint32_t size     = numSamples * mHotTileSize[attachment];
137             uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
138             hotTile.pBuffer =
139                 (uint8_t*)AllocHotTileMem(size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE);
140             hotTile.state      = HOTTILE_INVALID;
141             hotTile.numSamples = numSamples;
142         }
143 
144         // if requested render target array index isn't currently loaded, need to store out the
145         // current hottile and load the requested array slice
146         if (renderTargetArrayIndex != hotTile.renderTargetArrayIndex)
147         {
148             SWR_FORMAT format;
149             switch (attachment)
150             {
151             case SWR_ATTACHMENT_COLOR0:
152             case SWR_ATTACHMENT_COLOR1:
153             case SWR_ATTACHMENT_COLOR2:
154             case SWR_ATTACHMENT_COLOR3:
155             case SWR_ATTACHMENT_COLOR4:
156             case SWR_ATTACHMENT_COLOR5:
157             case SWR_ATTACHMENT_COLOR6:
158             case SWR_ATTACHMENT_COLOR7:
159                 format = KNOB_COLOR_HOT_TILE_FORMAT;
160                 break;
161             case SWR_ATTACHMENT_DEPTH:
162                 format = KNOB_DEPTH_HOT_TILE_FORMAT;
163                 break;
164             case SWR_ATTACHMENT_STENCIL:
165                 format = KNOB_STENCIL_HOT_TILE_FORMAT;
166                 break;
167             default:
168                 SWR_INVALID("Unknown attachment: %d", attachment);
169                 format = KNOB_COLOR_HOT_TILE_FORMAT;
170                 break;
171             }
172 
173             if (hotTile.state == HOTTILE_CLEAR)
174             {
175                 if (attachment == SWR_ATTACHMENT_STENCIL)
176                     ClearStencilHotTile(&hotTile);
177                 else if (attachment == SWR_ATTACHMENT_DEPTH)
178                     ClearDepthHotTile(&hotTile);
179                 else
180                     ClearColorHotTile(&hotTile);
181 
182                 hotTile.state = HOTTILE_DIRTY;
183             }
184 
185             if (hotTile.state == HOTTILE_DIRTY)
186             {
187                 pContext->pfnStoreTile(pDC,
188                                        hWorkerPrivateData,
189                                        format,
190                                        attachment,
191                                        x * KNOB_MACROTILE_X_DIM,
192                                        y * KNOB_MACROTILE_Y_DIM,
193                                        hotTile.renderTargetArrayIndex,
194                                        hotTile.pBuffer);
195             }
196 
197             pContext->pfnLoadTile(pDC,
198                                   hWorkerPrivateData,
199                                   format,
200                                   attachment,
201                                   x * KNOB_MACROTILE_X_DIM,
202                                   y * KNOB_MACROTILE_Y_DIM,
203                                   renderTargetArrayIndex,
204                                   hotTile.pBuffer);
205 
206             hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
207             hotTile.state = HOTTILE_RESOLVED;
208         }
209     }
210     return &tile.Attachment[attachment];
211 }
212 
GetHotTileNoLoad(SWR_CONTEXT * pContext,DRAW_CONTEXT * pDC,uint32_t macroID,SWR_RENDERTARGET_ATTACHMENT attachment,bool create,uint32_t numSamples)213 HOTTILE* HotTileMgr::GetHotTileNoLoad(SWR_CONTEXT*                pContext,
214                                       DRAW_CONTEXT*               pDC,
215                                       uint32_t                    macroID,
216                                       SWR_RENDERTARGET_ATTACHMENT attachment,
217                                       bool                        create,
218                                       uint32_t                    numSamples)
219 {
220     uint32_t x, y;
221     MacroTileMgr::getTileIndices(macroID, x, y);
222 
223     SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
224     SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
225 
226     HotTileSet& tile    = mHotTiles[x][y];
227     HOTTILE&    hotTile = tile.Attachment[attachment];
228     if (hotTile.pBuffer == NULL)
229     {
230         if (create)
231         {
232             uint32_t size                  = numSamples * mHotTileSize[attachment];
233             hotTile.pBuffer                = (uint8_t*)AlignedMalloc(size, 64);
234             hotTile.state                  = HOTTILE_INVALID;
235             hotTile.numSamples             = numSamples;
236             hotTile.renderTargetArrayIndex = 0;
237         }
238         else
239         {
240             return NULL;
241         }
242     }
243 
244     return &hotTile;
245 }
246 
ClearColorHotTile(const HOTTILE * pHotTile)247 void HotTileMgr::ClearColorHotTile(
248     const HOTTILE* pHotTile) // clear a macro tile from float4 clear data.
249 {
250     // Load clear color into SIMD register...
251     float*       pClearData = (float*)(pHotTile->clearData);
252     simd16scalar valR       = _simd16_broadcast_ss(&pClearData[0]);
253     simd16scalar valG       = _simd16_broadcast_ss(&pClearData[1]);
254     simd16scalar valB       = _simd16_broadcast_ss(&pClearData[2]);
255     simd16scalar valA       = _simd16_broadcast_ss(&pClearData[3]);
256 
257     float*   pfBuf      = (float*)pHotTile->pBuffer;
258     uint32_t numSamples = pHotTile->numSamples;
259 
260     for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
261     {
262         for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
263         {
264             for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples);
265                  si += SIMD16_TILE_X_DIM * SIMD16_TILE_Y_DIM)
266             {
267                 _simd16_store_ps(pfBuf, valR);
268                 pfBuf += KNOB_SIMD16_WIDTH;
269 
270                 _simd16_store_ps(pfBuf, valG);
271                 pfBuf += KNOB_SIMD16_WIDTH;
272 
273                 _simd16_store_ps(pfBuf, valB);
274                 pfBuf += KNOB_SIMD16_WIDTH;
275 
276                 _simd16_store_ps(pfBuf, valA);
277                 pfBuf += KNOB_SIMD16_WIDTH;
278             }
279         }
280     }
281 }
282 
ClearDepthHotTile(const HOTTILE * pHotTile)283 void HotTileMgr::ClearDepthHotTile(
284     const HOTTILE* pHotTile) // clear a macro tile from float4 clear data.
285 {
286     // Load clear color into SIMD register...
287     float*       pClearData = (float*)(pHotTile->clearData);
288     simd16scalar valZ       = _simd16_broadcast_ss(&pClearData[0]);
289 
290     float*   pfBuf      = (float*)pHotTile->pBuffer;
291     uint32_t numSamples = pHotTile->numSamples;
292 
293     for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
294     {
295         for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
296         {
297             for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples);
298                  si += SIMD16_TILE_X_DIM * SIMD16_TILE_Y_DIM)
299             {
300                 _simd16_store_ps(pfBuf, valZ);
301                 pfBuf += KNOB_SIMD16_WIDTH;
302             }
303         }
304     }
305 }
306 
ClearStencilHotTile(const HOTTILE * pHotTile)307 void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile)
308 {
309     // convert from F32 to U8.
310     uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]);
311     // broadcast 32x into __m256i...
312     simd16scalari valS = _simd16_set1_epi8(clearVal);
313 
314     simd16scalari* pBuf       = (simd16scalari*)pHotTile->pBuffer;
315     uint32_t       numSamples = pHotTile->numSamples;
316 
317     for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
318     {
319         for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
320         {
321             // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly.
322             for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples);
323                  si += SIMD16_TILE_X_DIM * SIMD16_TILE_Y_DIM * 4)
324             {
325                 _simd16_store_si(pBuf, valS);
326                 pBuf += 1;
327             }
328         }
329     }
330 }
331 
332 //////////////////////////////////////////////////////////////////////////
333 /// @brief InitializeHotTiles
334 /// for draw calls, we initialize the active hot tiles and perform deferred
335 /// load on them if tile is in invalid state. we do this in the outer thread
336 /// loop instead of inside the draw routine itself mainly for performance,
337 /// to avoid unnecessary setup every triangle
338 /// @todo support deferred clear
339 /// @param pCreateInfo - pointer to creation info.
InitializeHotTiles(SWR_CONTEXT * pContext,DRAW_CONTEXT * pDC,uint32_t workerId,uint32_t macroID)340 void HotTileMgr::InitializeHotTiles(SWR_CONTEXT*  pContext,
341                                     DRAW_CONTEXT* pDC,
342                                     uint32_t      workerId,
343                                     uint32_t      macroID)
344 {
345     const API_STATE& state    = GetApiState(pDC);
346     HANDLE hWorkerPrivateData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
347 
348     uint32_t x, y;
349     MacroTileMgr::getTileIndices(macroID, x, y);
350     x *= KNOB_MACROTILE_X_DIM;
351     y *= KNOB_MACROTILE_Y_DIM;
352 
353     uint32_t numSamples = GetNumSamples(state.rastState.sampleCount);
354 
355     // check RT if enabled
356     unsigned long rtSlot                 = 0;
357     uint32_t      colorHottileEnableMask = state.colorHottileEnable;
358     while (_BitScanForward(&rtSlot, colorHottileEnableMask))
359     {
360         HOTTILE* pHotTile =
361             GetHotTile(pContext,
362                        pDC,
363                        hWorkerPrivateData,
364                        macroID,
365                        (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot),
366                        true,
367                        numSamples);
368 
369         if (pHotTile->state == HOTTILE_INVALID)
370         {
371             RDTSC_BEGIN(pContext->pBucketMgr, BELoadTiles, pDC->drawId);
372             // invalid hottile before draw requires a load from surface before we can draw to it
373             pContext->pfnLoadTile(pDC,
374                                   hWorkerPrivateData,
375                                   KNOB_COLOR_HOT_TILE_FORMAT,
376                                   (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot),
377                                   x,
378                                   y,
379                                   pHotTile->renderTargetArrayIndex,
380                                   pHotTile->pBuffer);
381             pHotTile->state = HOTTILE_RESOLVED;
382             RDTSC_END(pContext->pBucketMgr, BELoadTiles, 0);
383         }
384         else if (pHotTile->state == HOTTILE_CLEAR)
385         {
386             RDTSC_BEGIN(pContext->pBucketMgr, BELoadTiles, pDC->drawId);
387             // Clear the tile.
388             ClearColorHotTile(pHotTile);
389             pHotTile->state = HOTTILE_DIRTY;
390             RDTSC_END(pContext->pBucketMgr, BELoadTiles, 0);
391         }
392         colorHottileEnableMask &= ~(1 << rtSlot);
393     }
394 
395     // check depth if enabled
396     if (state.depthHottileEnable)
397     {
398         HOTTILE* pHotTile = GetHotTile(
399             pContext, pDC, hWorkerPrivateData, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples);
400         if (pHotTile->state == HOTTILE_INVALID)
401         {
402             RDTSC_BEGIN(pContext->pBucketMgr, BELoadTiles, pDC->drawId);
403             // invalid hottile before draw requires a load from surface before we can draw to it
404             pContext->pfnLoadTile(pDC,
405                                   hWorkerPrivateData,
406                                   KNOB_DEPTH_HOT_TILE_FORMAT,
407                                   SWR_ATTACHMENT_DEPTH,
408                                   x,
409                                   y,
410                                   pHotTile->renderTargetArrayIndex,
411                                   pHotTile->pBuffer);
412             pHotTile->state = HOTTILE_DIRTY;
413             RDTSC_END(pContext->pBucketMgr, BELoadTiles, 0);
414         }
415         else if (pHotTile->state == HOTTILE_CLEAR)
416         {
417             RDTSC_BEGIN(pContext->pBucketMgr, BELoadTiles, pDC->drawId);
418             // Clear the tile.
419             ClearDepthHotTile(pHotTile);
420             pHotTile->state = HOTTILE_DIRTY;
421             RDTSC_END(pContext->pBucketMgr, BELoadTiles, 0);
422         }
423     }
424 
425     // check stencil if enabled
426     if (state.stencilHottileEnable)
427     {
428         HOTTILE* pHotTile = GetHotTile(
429             pContext, pDC, hWorkerPrivateData, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples);
430         if (pHotTile->state == HOTTILE_INVALID)
431         {
432             RDTSC_BEGIN(pContext->pBucketMgr, BELoadTiles, pDC->drawId);
433             // invalid hottile before draw requires a load from surface before we can draw to it
434             pContext->pfnLoadTile(pDC,
435                                   hWorkerPrivateData,
436                                   KNOB_STENCIL_HOT_TILE_FORMAT,
437                                   SWR_ATTACHMENT_STENCIL,
438                                   x,
439                                   y,
440                                   pHotTile->renderTargetArrayIndex,
441                                   pHotTile->pBuffer);
442             pHotTile->state = HOTTILE_DIRTY;
443             RDTSC_END(pContext->pBucketMgr, BELoadTiles, 0);
444         }
445         else if (pHotTile->state == HOTTILE_CLEAR)
446         {
447             RDTSC_BEGIN(pContext->pBucketMgr, BELoadTiles, pDC->drawId);
448             // Clear the tile.
449             ClearStencilHotTile(pHotTile);
450             pHotTile->state = HOTTILE_DIRTY;
451             RDTSC_END(pContext->pBucketMgr, BELoadTiles, 0);
452         }
453     }
454 }
455