// rasterizer/core/pa.h

/****************************************************************************
 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @file pa.h
 *
 * @brief Definitions for primitive assembly.
 *        N primitives are assembled at a time, where N is the SIMD width.
 *        A state machine, that is specific for a given topology, drives the
 *        assembly of vertices into triangles.
 *
 ******************************************************************************/
#pragma once

#include "frontend.h"

// Abstract primitive-assembly (PA) interface.
//
// Holds the state that every assembler in this file shares (draw context, vertex stream
// location/size/stride, binner topology) and declares the virtual entry points that the
// concrete assemblers (e.g. PA_STATE_OPT and PA_STATE_CUT below) implement.
//
// The SIMD* typedefs and SIMD_WIDTH* constants select between the 8-wide and 16-wide frontend
// types depending on USE_SIMD16_FRONTEND.
struct PA_STATE
{
#if USE_SIMD16_FRONTEND
    // 16-wide frontend: PA types alias the simd16 variants.
    enum
    {
        SIMD_WIDTH      = KNOB_SIMD16_WIDTH,
        SIMD_WIDTH_DIV2 = KNOB_SIMD16_WIDTH / 2,
        SIMD_WIDTH_LOG2 = 4
    };

    typedef simd16mask SIMDMASK;

    typedef simd16scalar SIMDSCALAR;
    typedef simd16vector SIMDVECTOR;
    typedef simd16vertex SIMDVERTEX;

    typedef simd16scalari SIMDSCALARI;

#else
    // 8-wide frontend: PA types alias the simd variants.
    enum
    {
        SIMD_WIDTH      = KNOB_SIMD_WIDTH,
        SIMD_WIDTH_DIV2 = KNOB_SIMD_WIDTH / 2,
        SIMD_WIDTH_LOG2 = 3
    };

    typedef simdmask SIMDMASK;

    typedef simdscalar SIMDSCALAR;
    typedef simdvector SIMDVECTOR;
    typedef simdvertex SIMDVERTEX;

    typedef simdscalari SIMDSCALARI;

#endif
    DRAW_CONTEXT* pDC{nullptr};         // draw context
    uint8_t*      pStreamBase{nullptr}; // vertex stream
    uint32_t      streamSizeInVerts{0}; // total size of the input stream in verts
    uint32_t      vertexStride{0};      // stride of a vertex in simdvector units

    // The topology the binner will use. In some cases the FE changes the topology from the api
    // state.
    PRIMITIVE_TOPOLOGY binTopology{TOP_UNKNOWN};

#if ENABLE_AVX512_SIMD16
    // Selects the upper half of a 16-wide result when assembling into 8-wide outputs
    // (see the simd16 paths of the derived assemblers).
    bool useAlternateOffset{false};
#endif

    bool     viewportArrayActive{false};
    bool     rtArrayActive{false};
    uint32_t numVertsPerPrim{0};

    // Default construction leaves every field at its in-class initializer.
    PA_STATE() {}

    // Initializes the shared stream description; everything else keeps its in-class default.
    PA_STATE(DRAW_CONTEXT* in_pDC,
             uint8_t*      in_pStreamBase,
             uint32_t      in_streamSizeInVerts,
             uint32_t      in_vertexStride,
             uint32_t      in_numVertsPerPrim) :
        pDC(in_pDC),
        pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts),
        vertexStride(in_vertexStride), numVertsPerPrim(in_numVertsPerPrim)
    {
    }

    // This is a polymorphic base: make destruction through a PA_STATE pointer or reference
    // well defined.
    virtual ~PA_STATE() = default;

    // Returns true while the assembler still has primitives to produce.
    virtual bool HasWork() = 0;

    // Returns the simdvector stored at (simd vertex 'index', attribute 'slot') in the stream.
    virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
#if ENABLE_AVX512_SIMD16
    // 16-wide counterpart of GetSimdVector.
    virtual simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) = 0;
#endif

    // Assembles a SIMD's worth of primitives for attribute 'slot' into verts[]; the return
    // value reports whether a batch was produced.
    virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0;
#if ENABLE_AVX512_SIMD16
    virtual bool Assemble(uint32_t slot, simd16vector verts[]) = 0;
#endif

    // Assembles the single primitive 'primIndex' of the current batch for attribute 'slot'.
    virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) = 0;

    // Advances the assembler past the batch that was just assembled.
    virtual bool NextPrim() = 0;

    // Returns the simd vertex that the next VS invocation should write.
    virtual SIMDVERTEX& GetNextVsOutput() = 0;

    // Advances the stream position for stream output and reports whether work remains.
    virtual bool GetNextStreamOutput() = 0;

    // Returns the mask slot associated with the next VS output.
    virtual SIMDMASK& GetNextVsIndices() = 0;

    // Number of valid primitives in the current batch.
    virtual uint32_t NumPrims() = 0;

    // Rewinds the assembler to its initial state.
    virtual void Reset() = 0;

    // Returns the per-lane primitive IDs for the current batch, offset by startID.
    virtual SIMDSCALARI GetPrimID(uint32_t startID) = 0;
};

// The Optimized PA is a state machine that assembles triangles from vertex shader simd
// output. Here is the sequence
//    1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd).
//    2. Execute PA function to assemble and bin triangles.
//        a.    The PA function is a set of functions that collectively make up the
//            state machine for a given topology.
//                1.    We use a state index to track which PA function to call.
//        b. Often the PA function needs 2 simd vertices in order to assemble the next triangle.
//                1.    We call this the current and previous simd vertex.
//                2.    The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In
//                    order to assemble the second triangle, for a triangle list, we'll need the
//                    last vertex from the previous simd and the first 2 vertices from the current
//                    simd.
//                3. At times the PA can assemble multiple triangles from the 2 simd vertices.
//
// This optimized PA is not cut aware, so it should only be used by non-indexed draws or draws
// without cuts.
struct PA_STATE_OPT : public PA_STATE
{
    uint32_t numPrims{0};         // Total number of primitives for draw.
    uint32_t numPrimsComplete{0}; // Total number of complete primitives.

    uint32_t numSimdPrims{0}; // Number of prims in current simd.

    uint32_t       cur{0};   // index to current VS output.
    uint32_t       prev{0};  // index to prev VS output. Not really needed in the state.
    const uint32_t first{0}; // index to first VS output. Used for tri fan and line loop.

    uint32_t counter{0};   // state counter
    bool     reset{false}; // reset state

    uint32_t    primIDIncr{0}; // how much to increment for each vector (typically vector / {1, 2})
    SIMDSCALARI primID;

    typedef bool (*PFN_PA_FUNC)(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
#if ENABLE_AVX512_SIMD16
    typedef bool (*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
    typedef void (*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa,
                                       uint32_t      slot,
                                       uint32_t      primIndex,
                                       simd4scalar   verts[]);

    PFN_PA_FUNC pfnPaFunc{nullptr}; // PA state machine function for assembling 4 triangles.
#if ENABLE_AVX512_SIMD16
    PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16{nullptr};
#endif
    PFN_PA_SINGLE_FUNC pfnPaSingleFunc{
        nullptr}; // PA state machine function for assembling single triangle.
    PFN_PA_FUNC pfnPaFuncReset{nullptr}; // initial state to set on reset
#if ENABLE_AVX512_SIMD16
    PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16{nullptr};
#endif

    // state used to advance the PA when Next is called
    PFN_PA_FUNC pfnPaNextFunc{nullptr};
#if ENABLE_AVX512_SIMD16
    PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16{nullptr};
#endif
    uint32_t nextNumSimdPrims{0};
    uint32_t nextNumPrimsIncrement{0};
    bool     nextReset{false};
    bool     isStreaming{false};

    SIMDMASK junkIndices{0}; // temporary index store for unused virtual function

    PA_STATE_OPT() {}
    PA_STATE_OPT(DRAW_CONTEXT*      pDC,
                 uint32_t           numPrims,
                 uint8_t*           pStream,
                 uint32_t           streamSizeInVerts,
                 uint32_t           vertexStride,
                 bool               in_isStreaming,
                 uint32_t           numVertsPerPrim,
                 PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);

    bool HasWork() { return (this->numPrimsComplete < this->numPrims) ? true : false; }

    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
    {
        SWR_ASSERT(slot < vertexStride);
        uint32_t    offset     = index * vertexStride + slot;
        simdvector& vertexSlot = ((simdvector*)pStreamBase)[offset];
        return vertexSlot;
    }

#if ENABLE_AVX512_SIMD16
    simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
    {
        SWR_ASSERT(slot < vertexStride);
        uint32_t      offset     = index * vertexStride + slot;
        simd16vector& vertexSlot = ((simd16vector*)pStreamBase)[offset];
        return vertexSlot;
    }

#endif
    // Assembles 4 triangles. Each simdvector is a single vertex from 4
    // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle.
    bool Assemble(uint32_t slot, simdvector verts[]) { return this->pfnPaFunc(*this, slot, verts); }

#if ENABLE_AVX512_SIMD16
    bool Assemble(uint32_t slot, simd16vector verts[])
    {
        return this->pfnPaFunc_simd16(*this, slot, verts);
    }

#endif
    // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
    void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
    {
        return this->pfnPaSingleFunc(*this, slot, primIndex, verts);
    }

    bool NextPrim()
    {
        this->pfnPaFunc = this->pfnPaNextFunc;
#if ENABLE_AVX512_SIMD16
        this->pfnPaFunc_simd16 = this->pfnPaNextFunc_simd16;
#endif
        this->numSimdPrims = this->nextNumSimdPrims;
        this->numPrimsComplete += this->nextNumPrimsIncrement;
        this->reset = this->nextReset;

        if (this->isStreaming)
        {
            this->reset = false;
        }

        bool morePrims = false;

        if (this->numSimdPrims > 0)
        {
            morePrims = true;
            this->numSimdPrims--;
        }
        else
        {
            this->counter = (this->reset) ? 0 : (this->counter + 1);
            this->reset   = false;
        }

        if (!HasWork())
        {
            morePrims = false; // no more to do
        }

        return morePrims;
    }

    SIMDVERTEX& GetNextVsOutput()
    {
        const uint32_t numSimdVerts = streamSizeInVerts / SIMD_WIDTH;

        // increment cur and prev indices
        if (counter < numSimdVerts)
        {
            // prev undefined for first state
            prev = cur;
            cur  = counter;
        }
        else
        {
            // swap/recycle last two simd verts for prev and cur, leave other simd verts intact in
            // the buffer
            uint32_t temp = prev;

            prev = cur;
            cur  = temp;
        }

        SWR_ASSERT(cur < numSimdVerts);
        SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[cur * vertexStride];

        return *(SIMDVERTEX*)pVertex;
    }

    SIMDMASK& GetNextVsIndices()
    {
        // unused in optimized PA, pass tmp buffer back
        return junkIndices;
    }

    bool GetNextStreamOutput()
    {
        this->prev = this->cur;
        this->cur  = this->counter;

        return HasWork();
    }

    uint32_t NumPrims()
    {
        return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims)
                   ? (SIMD_WIDTH -
                      (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims))
                   : SIMD_WIDTH;
    }

    void SetNextState(PA_STATE_OPT::PFN_PA_FUNC        pfnPaNextFunc,
                      PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
                      uint32_t                         numSimdPrims      = 0,
                      uint32_t                         numPrimsIncrement = 0,
                      bool                             reset             = false)
    {
        this->pfnPaNextFunc         = pfnPaNextFunc;
        this->nextNumSimdPrims      = numSimdPrims;
        this->nextNumPrimsIncrement = numPrimsIncrement;
        this->nextReset             = reset;

        this->pfnPaSingleFunc = pfnPaNextSingleFunc;
    }

#if ENABLE_AVX512_SIMD16
    void SetNextState_simd16(PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
                             PA_STATE_OPT::PFN_PA_FUNC        pfnPaNextFunc,
                             PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
                             uint32_t                         numSimdPrims      = 0,
                             uint32_t                         numPrimsIncrement = 0,
                             bool                             reset             = false)
    {
        this->pfnPaNextFunc_simd16  = pfnPaNextFunc_simd16;
        this->pfnPaNextFunc         = pfnPaNextFunc;
        this->nextNumSimdPrims      = numSimdPrims;
        this->nextNumPrimsIncrement = numPrimsIncrement;
        this->nextReset             = reset;

        this->pfnPaSingleFunc = pfnPaNextSingleFunc;
    }

#endif
    void Reset()
    {
#if ENABLE_AVX512_SIMD16
        useAlternateOffset = false;

#endif
        this->pfnPaFunc = this->pfnPaFuncReset;
#if ENABLE_AVX512_SIMD16
        this->pfnPaFunc_simd16 = this->pfnPaFuncReset_simd16;
#endif
        this->numPrimsComplete = 0;
        this->numSimdPrims     = 0;
        this->cur              = 0;
        this->prev             = 0;
        this->counter          = 0;
        this->reset            = false;
    }

    SIMDSCALARI GetPrimID(uint32_t startID)
    {
#if USE_SIMD16_FRONTEND
        return _simd16_add_epi32(
            this->primID,
            _simd16_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
#else
        return _simd_add_epi32(
            this->primID,
            _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
#endif
    }
};

// Free-function shim over PA_STATE_OPT::SetNextState, so the PA topology state functions
// can keep their C-style call sites.
INLINE void SetNextPaState(PA_STATE_OPT&                    pa,
                           PA_STATE_OPT::PFN_PA_FUNC        pfnPaNextFunc,
                           PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
                           uint32_t                         numSimdPrims      = 0,
                           uint32_t                         numPrimsIncrement = 0,
                           bool                             reset             = false)
{
    // Pure forwarding; there is no result to hand back.
    pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
}

#if ENABLE_AVX512_SIMD16
// Free-function shim over PA_STATE_OPT::SetNextState_simd16 (simd16 + simd8 state functions).
INLINE void SetNextPaState_simd16(PA_STATE_OPT&                    pa,
                                  PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
                                  PA_STATE_OPT::PFN_PA_FUNC        pfnPaNextFunc,
                                  PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
                                  uint32_t                         numSimdPrims      = 0,
                                  uint32_t                         numPrimsIncrement = 0,
                                  bool                             reset             = false)
{
    // Pure forwarding; there is no result to hand back.
    pa.SetNextState_simd16(
        pfnPaNextFunc_simd16, pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
}

#endif
// Free-function shim over the virtual PA_STATE::GetSimdVector.
INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot)
{
    simdvector& vertexSlot = pa.GetSimdVector(index, slot);
    return vertexSlot;
}

#if ENABLE_AVX512_SIMD16
// Free-function shim over the virtual PA_STATE::GetSimdVector_simd16.
INLINE simd16vector& PaGetSimdVector_simd16(PA_STATE& pa, uint32_t index, uint32_t slot)
{
    simd16vector& vertexSlot = pa.GetSimdVector_simd16(index, slot);
    return vertexSlot;
}

#endif
// Cut-aware primitive assembler.
struct PA_STATE_CUT : public PA_STATE
{
    SIMDMASK* pCutIndices{nullptr};  // cut indices buffer, 1 bit per vertex
    uint32_t  numVerts{0};           // number of vertices available in buffer store; head and
                                     // current vertex positions wrap around at this value
    uint32_t  numAttribs{0};         // number of attributes
    int32_t   numRemainingVerts{0};  // number of verts remaining to be assembled
    uint32_t  numVertsToAssemble{0}; // total number of verts to assemble for the draw
    // Per-primitive-vertex, per-lane vertex indices; ComputeOffsets() turns these into
    // byte offsets for the gather.
#if ENABLE_AVX512_SIMD16
    OSALIGNSIMD16(uint32_t)
    indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH]; // current index buffer for gather
#else
    OSALIGNSIMD(uint32_t)
    indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH]; // current index buffer for gather
#endif
    SIMDSCALARI vOffsets[MAX_NUM_VERTS_PER_PRIM]; // byte offsets for currently assembling simd
    uint32_t    numPrimsAssembled{0};             // number of primitives that are fully assembled
    uint32_t    headVertex{0};      // current unused vertex slot in vertex buffer store
    uint32_t    tailVertex{0};      // beginning vertex currently assembling
    uint32_t    curVertex{0};       // current unprocessed vertex
    uint32_t    startPrimId{0};     // starting prim id
    SIMDSCALARI vPrimId;            // vector of prim ID
    bool        needOffsets{false}; // need to compute gather offsets for current SIMD
    // Number of vertices gathered per primitive; the constructor sets it from
    // NumVertsPerPrim(topo, gsEnabled).
    uint32_t    vertsPerPrim{0};
    bool        processCutVerts{
        false}; // When true, vertex indices with cuts are processed as normal; otherwise they
                // are ignored.  Fetch shader sends invalid verts on cuts that should be ignored
                // while the GS sends valid verts for every index

    simdvector junkVector; // junk simdvector for unimplemented API
#if ENABLE_AVX512_SIMD16
    simd16vector junkVector_simd16; // junk simd16vector for unimplemented API
#endif

    // Topology state tracking
    uint32_t vert[MAX_NUM_VERTS_PER_PRIM]; // vertex indices collected for the primitive in flight
    uint32_t curIndex{0};                  // next free entry in vert[]; zeroed on topology restart
    bool     reverseWinding{false}; // indicates reverse winding for strips
    // -1 is the "no extra vertex pending" value used by the constructor, Reset() and
    // RestartTopology(); NOTE(review): the in-class default is 0, not -1 - confirm intent.
    int32_t  adjExtraVert{0};       // extra vert uses for tristrip w/ adj

    // Member-function pointer type for the per-topology vertex processors.
    typedef void (PA_STATE_CUT::*PFN_PA_FUNC)(uint32_t vert, bool finish);
    PFN_PA_FUNC pfnPa{nullptr}; // per-topology function that processes a single vert

    // Default construction: every field keeps its in-class initializer.
    PA_STATE_CUT() {}

    // Builds a cut-aware assembler over the given vertex stream and cut-index buffer, and
    // selects the per-vertex processing routine that matches 'topo'.
    PA_STATE_CUT(DRAW_CONTEXT*      pDC,
                 uint8_t*           in_pStream,
                 uint32_t           in_streamSizeInVerts,
                 uint32_t           in_vertexStride,
                 SIMDMASK*          in_pIndices,
                 uint32_t           in_numVerts,
                 uint32_t           in_numAttribs,
                 PRIMITIVE_TOPOLOGY topo,
                 bool               in_processCutVerts,
                 uint32_t           in_numVertsPerPrim) :
        PA_STATE(pDC, in_pStream, in_streamSizeInVerts, in_vertexStride, in_numVertsPerPrim)
    {
        // Stream / draw description.
        pCutIndices     = in_pIndices;
        numVerts        = in_streamSizeInVerts;
        numAttribs      = in_numAttribs;
        binTopology     = topo;
        processCutVerts = in_processCutVerts;

        // Progress tracking starts from scratch.
        numRemainingVerts  = in_numVerts;
        numVertsToAssemble = in_numVerts;
        numPrimsAssembled  = 0;
        headVertex         = 0;
        tailVertex         = 0;
        curVertex          = 0;
        needOffsets        = false;

        // Topology tracking.
        curIndex       = 0;
        reverseWinding = false;
        adjExtraVert   = -1;

        memset(indices, 0, sizeof(indices));

        // Lane i starts out with primitive id i.
#if USE_SIMD16_FRONTEND
        vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
#else
        vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
#endif

        const bool gsEnabled = pDC->pState->state.gsState.gsEnable;
        vertsPerPrim         = NumVertsPerPrim(topo, gsEnabled);

        // Pick the per-vertex state function. Adjacency topologies have separate variants
        // depending on whether a geometry shader is enabled.
        switch (topo)
        {
        case TOP_POINT_LIST:
            pfnPa = &PA_STATE_CUT::ProcessVertPointList;
            break;

        case TOP_LINE_LIST:
            pfnPa = &PA_STATE_CUT::ProcessVertLineList;
            break;
        case TOP_LINE_LIST_ADJ:
            if (gsEnabled)
            {
                pfnPa = &PA_STATE_CUT::ProcessVertLineListAdj;
            }
            else
            {
                pfnPa = &PA_STATE_CUT::ProcessVertLineListAdjNoGs;
            }
            break;
        case TOP_LINE_STRIP:
            pfnPa = &PA_STATE_CUT::ProcessVertLineStrip;
            break;
        case TOP_LISTSTRIP_ADJ:
            if (gsEnabled)
            {
                pfnPa = &PA_STATE_CUT::ProcessVertLineStripAdj;
            }
            else
            {
                pfnPa = &PA_STATE_CUT::ProcessVertLineStripAdjNoGs;
            }
            break;

        case TOP_TRIANGLE_LIST:
            pfnPa = &PA_STATE_CUT::ProcessVertTriList;
            break;
        case TOP_TRI_LIST_ADJ:
            if (gsEnabled)
            {
                pfnPa = &PA_STATE_CUT::ProcessVertTriListAdj;
            }
            else
            {
                pfnPa = &PA_STATE_CUT::ProcessVertTriListAdjNoGs;
            }
            break;
        case TOP_TRIANGLE_STRIP:
            pfnPa = &PA_STATE_CUT::ProcessVertTriStrip;
            break;
        case TOP_TRI_STRIP_ADJ:
            if (gsEnabled)
            {
                pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<true>;
            }
            else
            {
                pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<false>;
            }
            break;

        case TOP_RECT_LIST:
            pfnPa = &PA_STATE_CUT::ProcessVertRectList;
            break;

        default:
            assert(0 && "Unimplemented topology");
        }
    }

    // Hands out the simd vertex at the current head of the vertex store, then advances the
    // head by one SIMD (wrapping at numVerts) and flags the gather offsets as stale.
    SIMDVERTEX& GetNextVsOutput()
    {
        // Slot the head points at before it is advanced.
        const uint32_t simdSlot = headVertex / SIMD_WIDTH;

        headVertex  = (headVertex + SIMD_WIDTH) % numVerts;
        needOffsets = true;

        SIMDVECTOR* const pSlotBase = ((SIMDVECTOR*)pStreamBase) + simdSlot * vertexStride;
        return *(SIMDVERTEX*)pSlotBase;
    }

    // Returns the cut-mask entry that corresponds to the simd vertex at the current head.
    SIMDMASK& GetNextVsIndices()
    {
        const uint32_t simdSlot = headVertex / SIMD_WIDTH;
        return pCutIndices[simdSlot];
    }

    // Not supported by the cut-aware PA: asserts and returns a scratch vector.
    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
    {
        // Neither argument is consulted.
        (void)index;
        (void)slot;

        SWR_ASSERT(0 && "Not implemented");
        return this->junkVector;
    }

#if ENABLE_AVX512_SIMD16
    // Not supported by the cut-aware PA: asserts and returns a scratch vector.
    simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
    {
        // Neither argument is consulted.
        (void)index;
        (void)slot;

        SWR_ASSERT(0 && "Not implemented");
        return this->junkVector_simd16;
    }

#endif
    bool GetNextStreamOutput()
    {
        this->headVertex += SIMD_WIDTH;
        this->needOffsets = true;
        return HasWork();
    }

    // Per-lane primitive IDs for the current batch: startID broadcast to all lanes plus the
    // running vPrimId vector.
    SIMDSCALARI GetPrimID(uint32_t startID)
    {
#if USE_SIMD16_FRONTEND
        const SIMDSCALARI vStartId = _simd16_set1_epi32(startID);
        return _simd16_add_epi32(vStartId, vPrimId);
#else
        const SIMDSCALARI vStartId = _simd_set1_epi32(startID);
        return _simd_add_epi32(vStartId, vPrimId);
#endif
    }

    void Reset()
    {
#if ENABLE_AVX512_SIMD16
        useAlternateOffset = false;

#endif
        this->numRemainingVerts = this->numVertsToAssemble;
        this->numPrimsAssembled = 0;
        this->curIndex          = 0;
        this->curVertex         = 0;
        this->tailVertex        = 0;
        this->headVertex        = 0;
        this->reverseWinding    = false;
        this->adjExtraVert      = -1;
#if USE_SIMD16_FRONTEND
        this->vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
#else
        this->vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
#endif
    }

    bool HasWork() { return this->numRemainingVerts > 0 || this->adjExtraVert != -1; }

    bool IsVertexStoreFull()
    {
        return ((this->headVertex + SIMD_WIDTH) % this->numVerts) == this->tailVertex;
    }

    void RestartTopology()
    {
        this->curIndex       = 0;
        this->reverseWinding = false;
        this->adjExtraVert   = -1;
    }

    bool IsCutIndex(uint32_t vertex)
    {
        uint32_t vertexIndex  = vertex / SIMD_WIDTH;
        uint32_t vertexOffset = vertex & (SIMD_WIDTH - 1);
        return CheckBit(this->pCutIndices[vertexIndex], vertexOffset);
    }

    // iterates across the unprocessed verts until we hit the end or we
    // have assembled SIMD prims
    void ProcessVerts()
    {
        while (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0 &&
               this->curVertex != this->headVertex)
        {
            // if cut index, restart topology
            if (IsCutIndex(this->curVertex))
            {
                if (this->processCutVerts)
                {
                    (this->*pfnPa)(this->curVertex, false);
                }
                // finish off tri strip w/ adj before restarting topo
                if (this->adjExtraVert != -1)
                {
                    (this->*pfnPa)(this->curVertex, true);
                }
                RestartTopology();
            }
            else
            {
                (this->*pfnPa)(this->curVertex, false);
            }

            this->curVertex++;
            if (this->curVertex >= this->numVerts)
            {
                this->curVertex = 0;
            }
            this->numRemainingVerts--;
        }

        // special case last primitive for tri strip w/ adj
        if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts == 0 &&
            this->adjExtraVert != -1)
        {
            (this->*pfnPa)(this->curVertex, true);
        }
    }

    void Advance()
    {
        // done with current batch
        // advance tail to the current unsubmitted vertex
        this->tailVertex        = this->curVertex;
        this->numPrimsAssembled = 0;
#if USE_SIMD16_FRONTEND
        this->vPrimId = _simd16_add_epi32(vPrimId, _simd16_set1_epi32(SIMD_WIDTH));
#else
        this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(SIMD_WIDTH));
#endif
    }

    bool NextPrim()
    {
        // if we've assembled enough prims, we can advance to the next set of verts
        if (this->numPrimsAssembled == SIMD_WIDTH || this->numRemainingVerts <= 0)
        {
            Advance();
        }
        return false;
    }

    void ComputeOffsets()
    {
        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            uint32_t    vertexStrideBytes = vertexStride * sizeof(SIMDVECTOR);
            SIMDSCALARI vIndices          = *(SIMDSCALARI*)&this->indices[v][0];

            // step to simdvertex batch
            const uint32_t simdShift = SIMD_WIDTH_LOG2;
#if USE_SIMD16_FRONTEND
            SIMDSCALARI vVertexBatch = _simd16_srai_epi32(vIndices, simdShift);
            this->vOffsets[v] =
                _simd16_mullo_epi32(vVertexBatch, _simd16_set1_epi32(vertexStrideBytes));
#else
            SIMDSCALARI vVertexBatch = _simd_srai_epi32(vIndices, simdShift);
            this->vOffsets[v] =
                _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(vertexStrideBytes));
#endif

            // step to index
            const uint32_t simdMask = SIMD_WIDTH - 1;
#if USE_SIMD16_FRONTEND
            SIMDSCALARI vVertexIndex = _simd16_and_si(vIndices, _simd16_set1_epi32(simdMask));
            this->vOffsets[v]        = _simd16_add_epi32(
                this->vOffsets[v],
                _simd16_mullo_epi32(vVertexIndex, _simd16_set1_epi32(sizeof(float))));
#else
            SIMDSCALARI vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask));
            this->vOffsets[v] =
                _simd_add_epi32(this->vOffsets[v],
                                _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float))));
#endif
        }
    }

    // Assembles the current batch of primitives for attribute 'slot' into 8-wide vectors.
    //
    // verts[v].v[c] receives component c of primitive vertex v for every lane, gathered
    // from the vertex stream through the cached byte offsets.
    //
    // Returns false when the batch is not yet full and more vertices are still expected;
    // returns true once verts[] has been filled.
    bool Assemble(uint32_t slot, simdvector* verts)
    {
        // process any outstanding verts
        ProcessVerts();

        // return false if we don't have enough prims assembled
        if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
        {
            return false;
        }

        // cache off gather offsets given the current SIMD set of indices the first time we get an
        // assemble
        if (this->needOffsets)
        {
            ComputeOffsets();
            this->needOffsets = false;
        }

        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            SIMDSCALARI offsets = this->vOffsets[v];

            // step to attribute
#if USE_SIMD16_FRONTEND
            offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
#else
            offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(SIMDVECTOR)));
#endif

            float* pBase = (float*)this->pStreamBase;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                // Gather 16 lanes, then keep the half selected by useAlternateOffset.
                simd16scalar temp = _simd16_i32gather_ps(pBase, offsets, 1);

                // Assigning to a temporary first to avoid an MSVC 2017 compiler bug
                simdscalar t =
                    useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
                verts[v].v[c] = t;
#else
                verts[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1);
#endif

                // move base to next component
                pBase += SIMD_WIDTH;
            }
        }

        // compute the implied 4th vertex, v3
        // NOTE(review): this path uses the _simd16_* helpers and a simd16scalar temporary even
        // though verts[] is 8-wide; only _simd16_extract_ps(temp, 0) is stored back. Confirm
        // whether the narrower _simd_* forms were intended.
        if (this->binTopology == TOP_RECT_LIST)
        {
            for (uint32_t c = 0; c < 4; ++c)
            {
                // v1, v3 = v1 + v2 - v0, v2
                // v1 stored in verts[0], v0 stored in verts[1], v2 stored in verts[2]
                simd16scalar temp = _simd16_add_ps(verts[0].v[c], verts[2].v[c]);
                temp              = _simd16_sub_ps(temp, verts[1].v[c]);
                // odd lanes take the computed v3, even lanes keep the original verts[1]
                temp = _simd16_blend_ps(verts[1].v[c], temp, 0xAAAA); // 1010 1010 1010 1010
                verts[1].v[c] = _simd16_extract_ps(temp, 0);
            }
        }

        return true;
    }

#if ENABLE_AVX512_SIMD16
    bool Assemble(uint32_t slot, simd16vector verts[])
    {
       // process any outstanding verts
        ProcessVerts();

        // return false if we don't have enough prims assembled
        if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
        {
            return false;
        }

        // cache off gather offsets given the current SIMD set of indices the first time we get an
        // assemble
        if (this->needOffsets)
        {
            ComputeOffsets();
            this->needOffsets = false;
        }

        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            SIMDSCALARI offsets = this->vOffsets[v];

            // step to attribute
#if USE_SIMD16_FRONTEND
            offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
#else
            offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector)));
#endif

            float* pBase = (float*)this->pStreamBase;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                verts[v].v[c] = _simd16_i32gather_ps(pBase, offsets, 1);
#else
                verts[v].v[c] = _simd16_insert_ps(
                    _simd16_setzero_ps(), _simd_i32gather_ps(pBase, offsets, 1), 0);
#endif

                // move base to next component
                pBase += SIMD_WIDTH;
            }
        }

        // compute the implied 4th vertex, v3
        if (this->binTopology == TOP_RECT_LIST)
        {
            for (uint32_t c = 0; c < 4; ++c)
            {
                // v1, v3 = v1 + v2 - v0, v2
                // v1 stored in verts[0], v0 stored in verts[1], v2 stored in verts[2]
                simd16scalar temp = _simd16_add_ps(verts[0].v[c], verts[2].v[c]);
                temp              = _simd16_sub_ps(temp, verts[1].v[c]);
                verts[1].v[c] =
                    _simd16_blend_ps(verts[1].v[c], temp, 0xAAAA); // 1010 1010 1010 1010
            }
        }

        return true;
    }

#endif
    // Scalar assembly of one primitive: copies the four components of attribute 'slot' for each
    // of the vertsPerPrim vertices of primitive 'triIndex' from the vertex stream into tri[].
    void AssembleSingle(uint32_t slot, uint32_t triIndex, simd4scalar tri[3])
    {
#if USE_SIMD16_FRONTEND
        // useAlternateOffset selects the entry SIMD_WIDTH_DIV2 lanes further into the offsets
        const uint32_t lane = useAlternateOffset ? (triIndex + SIMD_WIDTH_DIV2) : triIndex;
#else
        const uint32_t lane = triIndex;
#endif

        for (uint32_t vertIdx = 0; vertIdx < this->vertsPerPrim; ++vertIdx)
        {
            // per-lane byte offsets that were cached for this vertex of the primitive
            const uint32_t* pLaneOffsets = (uint32_t*)&this->vOffsets[vertIdx];

            // start at the vertex, then step to the requested attribute slot
            uint32_t byteOffset = pLaneOffsets[lane];
            byteOffset += sizeof(SIMDVECTOR) * slot;

            float* pDst = (float*)&tri[vertIdx];
            for (uint32_t comp = 0; comp < 4; ++comp)
            {
                pDst[comp] = *(float*)(this->pStreamBase + byteOffset);

                // consecutive components are SIMD_WIDTH floats apart in the stream
                byteOffset += SIMD_WIDTH * sizeof(float);
            }
        }

        // only the odd triangle of a rect list needs the implied 4th vertex
        if ((this->binTopology != TOP_RECT_LIST) || (triIndex % 2 != 1))
        {
            return;
        }

        // v1, v3 = v1 + v2 - v0, v2
        // tri[0] holds v1, tri[1] holds v0 (replaced by v3 below), tri[2] holds v2
        float* pV1     = (float*)&tri[0];
        float* pV0ToV3 = (float*)&tri[1];
        float* pV2     = (float*)&tri[2];
        for (uint32_t comp = 0; comp < 4; ++comp)
        {
            pV0ToV3[comp] = pV1[comp] + pV2[comp] - pV0ToV3[comp];
        }
    }

    uint32_t NumPrims() { return this->numPrimsAssembled; }

    // Per-topology functions
    // Triangle strip: once three vertices are collected a triangle is emitted, the last two
    // vertices are carried over, and the winding flag is toggled. 'finish' is not used here.
    void ProcessVertTriStrip(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex += 1;
        if (this->curIndex != 3)
        {
            // still collecting vertices for the current triangle
            return;
        }

        // when reverseWinding is set, the 2nd and 3rd vertices are swapped
        const auto second = reverseWinding ? this->vert[2] : this->vert[1];
        const auto third  = reverseWinding ? this->vert[1] : this->vert[2];

        const auto prim         = this->numPrimsAssembled;
        this->indices[0][prim] = this->vert[0];
        this->indices[1][prim] = second;
        this->indices[2][prim] = third;
        this->numPrimsAssembled++;

        // slide the strip window forward by one vertex
        this->vert[0] = this->vert[1];
        this->vert[1] = this->vert[2];
        this->curIndex = 2;
        this->reverseWinding ^= 1;
    }

    // Records the primitive currently held in vert[] into the gather indices.
    // With GS enabled all six vertices are stored; without GS only vert[0], vert[2] and vert[4]
    // are stored.
    template <bool gsEnabled>
    void AssembleTriStripAdj()
    {
        const auto prim = this->numPrimsAssembled;

        if (gsEnabled)
        {
            for (uint32_t i = 0; i < 6; ++i)
            {
                this->indices[i][prim] = this->vert[i];
            }
        }
        else
        {
            this->indices[0][prim] = this->vert[0];
            this->indices[1][prim] = this->vert[2];
            this->indices[2][prim] = this->vert[4];

            // the only lasting change to vert[] on this path: vert[1] becomes a copy of vert[2]
            // (vert[2] and vert[4] keep their values)
            this->vert[1] = this->vert[2];
        }

        this->numPrimsAssembled = prim + 1;
    }

    template <bool gsEnabled>
    void ProcessVertTriStripAdj(uint32_t index, bool finish)
    {
        // handle last primitive of tristrip
        if (finish && this->adjExtraVert != -1)
        {
            this->vert[3] = this->adjExtraVert;
            AssembleTriStripAdj<gsEnabled>();
            this->adjExtraVert = -1;
            return;
        }

        switch (this->curIndex)
        {
        case 0:
        case 1:
        case 2:
        case 4:
            this->vert[this->curIndex] = index;
            this->curIndex++;
            break;
        case 3:
            this->vert[5] = index;
            this->curIndex++;
            break;
        case 5:
            if (this->adjExtraVert == -1)
            {
                this->adjExtraVert = index;
            }
            else
            {
                this->vert[3] = index;
                if (!gsEnabled)
                {
                    AssembleTriStripAdj<gsEnabled>();

                    uint32_t nextTri[6];
                    if (this->reverseWinding)
                    {
                        nextTri[0] = this->vert[4];
                        nextTri[1] = this->vert[0];
                        nextTri[2] = this->vert[2];
                        nextTri[4] = this->vert[3];
                        nextTri[5] = this->adjExtraVert;
                    }
                    else
                    {
                        nextTri[0] = this->vert[2];
                        nextTri[1] = this->adjExtraVert;
                        nextTri[2] = this->vert[3];
                        nextTri[4] = this->vert[4];
                        nextTri[5] = this->vert[0];
                    }
                    for (uint32_t i = 0; i < 6; ++i)
                    {
                        this->vert[i] = nextTri[i];
                    }

                    this->adjExtraVert = -1;
                    this->reverseWinding ^= 1;
                }
                else
                {
                    this->curIndex++;
                }
            }
            break;
        case 6:
            SWR_ASSERT(this->adjExtraVert != -1, "Algorithm failure!");
            AssembleTriStripAdj<gsEnabled>();

            uint32_t nextTri[6];
            if (this->reverseWinding)
            {
                nextTri[0] = this->vert[4];
                nextTri[1] = this->vert[0];
                nextTri[2] = this->vert[2];
                nextTri[4] = this->vert[3];
                nextTri[5] = this->adjExtraVert;
            }
            else
            {
                nextTri[0] = this->vert[2];
                nextTri[1] = this->adjExtraVert;
                nextTri[2] = this->vert[3];
                nextTri[4] = this->vert[4];
                nextTri[5] = this->vert[0];
            }
            for (uint32_t i = 0; i < 6; ++i)
            {
                this->vert[i] = nextTri[i];
            }
            this->reverseWinding ^= 1;
            this->adjExtraVert = index;
            this->curIndex--;
            break;
        }
    }

    // Triangle list: every three consecutive indices form one independent triangle.
    // 'finish' is not used here.
    void ProcessVertTriList(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex += 1;
        if (this->curIndex != 3)
        {
            return;
        }

        // a full triangle has been collected; record it in the gather indices
        const auto prim = this->numPrimsAssembled;
        for (uint32_t i = 0; i < 3; ++i)
        {
            this->indices[i][prim] = this->vert[i];
        }
        this->numPrimsAssembled++;

        // start collecting the next triangle from scratch
        this->curIndex = 0;
    }

    // Triangle list with adjacency: every six consecutive indices form one primitive and all
    // six are recorded. 'finish' is not used here.
    void ProcessVertTriListAdj(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex += 1;
        if (this->curIndex != 6)
        {
            return;
        }

        // all six vertices collected; record them in the gather indices
        const auto prim = this->numPrimsAssembled;
        for (uint32_t i = 0; i < 6; ++i)
        {
            this->indices[i][prim] = this->vert[i];
        }
        this->numPrimsAssembled++;

        // start collecting the next primitive from scratch
        this->curIndex = 0;
    }

    // Triangle list with adjacency, no GS: six indices are consumed per primitive but only
    // vert[0], vert[2] and vert[4] are recorded. 'finish' is not used here.
    void ProcessVertTriListAdjNoGs(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex += 1;
        if (this->curIndex != 6)
        {
            return;
        }

        // record every other vertex of the six collected
        const auto prim = this->numPrimsAssembled;
        for (uint32_t i = 0; i < 3; ++i)
        {
            this->indices[i][prim] = this->vert[2 * i];
        }
        this->numPrimsAssembled++;

        // start collecting the next primitive from scratch
        this->curIndex = 0;
    }

    // Line list: every two consecutive indices form one independent line.
    // 'finish' is not used here.
    void ProcessVertLineList(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex += 1;
        if (this->curIndex != 2)
        {
            return;
        }

        const auto prim         = this->numPrimsAssembled;
        this->indices[0][prim] = this->vert[0];
        this->indices[1][prim] = this->vert[1];
        this->numPrimsAssembled++;

        this->curIndex = 0;
    }

    // Line strip: each new index after the first forms a line with the previous index.
    // 'finish' is not used here.
    void ProcessVertLineStrip(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex += 1;
        if (this->curIndex != 2)
        {
            return;
        }

        // both endpoints are available; record the line in the gather indices
        const auto prim         = this->numPrimsAssembled;
        this->indices[0][prim] = this->vert[0];
        this->indices[1][prim] = this->vert[1];
        this->numPrimsAssembled++;

        // the end of this line is the start of the next one
        this->vert[0]  = this->vert[1];
        this->curIndex = 1;
    }

    // Line strip with adjacency: a sliding window of four indices; all four are recorded for
    // each primitive. 'finish' is not used here.
    void ProcessVertLineStripAdj(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex += 1;
        if (this->curIndex != 4)
        {
            return;
        }

        // window is full; record all four vertices in the gather indices
        const auto prim = this->numPrimsAssembled;
        for (uint32_t i = 0; i < 4; ++i)
        {
            this->indices[i][prim] = this->vert[i];
        }
        this->numPrimsAssembled++;

        // slide the window forward by one vertex
        for (uint32_t i = 0; i < 3; ++i)
        {
            this->vert[i] = this->vert[i + 1];
        }
        this->curIndex = 3;
    }

    // Line strip with adjacency, no GS: a sliding window of four indices of which only the two
    // middle ones (vert[1], vert[2]) are recorded. 'finish' is not used here.
    void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex += 1;
        if (this->curIndex != 4)
        {
            return;
        }

        // window is full; record only the two middle vertices
        const auto prim         = this->numPrimsAssembled;
        this->indices[0][prim] = this->vert[1];
        this->indices[1][prim] = this->vert[2];
        this->numPrimsAssembled++;

        // slide the window forward by one vertex
        for (uint32_t i = 0; i < 3; ++i)
        {
            this->vert[i] = this->vert[i + 1];
        }
        this->curIndex = 3;
    }

    // Line list with adjacency: every four consecutive indices form one primitive and all four
    // are recorded. 'finish' is not used here.
    void ProcessVertLineListAdj(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex += 1;
        if (this->curIndex != 4)
        {
            return;
        }

        const auto prim = this->numPrimsAssembled;
        for (uint32_t i = 0; i < 4; ++i)
        {
            this->indices[i][prim] = this->vert[i];
        }
        this->numPrimsAssembled++;

        this->curIndex = 0;
    }

    // Line list with adjacency, no GS: four indices are consumed per primitive but only the two
    // middle ones (vert[1], vert[2]) are recorded. 'finish' is not used here.
    void ProcessVertLineListAdjNoGs(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex += 1;
        if (this->curIndex != 4)
        {
            return;
        }

        const auto prim         = this->numPrimsAssembled;
        this->indices[0][prim] = this->vert[1];
        this->indices[1][prim] = this->vert[2];
        this->numPrimsAssembled++;

        this->curIndex = 0;
    }

    // Point list: a primitive is recorded as soon as curIndex reaches 1 after storing the
    // index. 'finish' is not used here.
    void ProcessVertPointList(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex += 1;
        if (this->curIndex != 1)
        {
            return;
        }

        const auto prim         = this->numPrimsAssembled;
        this->indices[0][prim] = this->vert[0];
        this->numPrimsAssembled++;

        this->curIndex = 0;
    }

    // Rect list: every three indices produce two triangles. The first uses the vertices as
    // given; the second is recorded as (vert[1], vert[0], vert[2]).
    // 'finish' is not used here.
    void ProcessVertRectList(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex += 1;
        if (this->curIndex != 3)
        {
            return;
        }

        const auto firstTri  = this->numPrimsAssembled;
        const auto secondTri = this->numPrimsAssembled + 1;

        // first triangle in the rectangle
        this->indices[0][firstTri] = this->vert[0];
        this->indices[1][firstTri] = this->vert[1];
        this->indices[2][firstTri] = this->vert[2];

        // second triangle in the rectangle
        // v1, v3 = v1 + v2 - v0, v2
        this->indices[0][secondTri] = this->vert[1];
        this->indices[1][secondTri] = this->vert[0];
        this->indices[2][secondTri] = this->vert[2];

        // two primitives were added
        this->numPrimsAssembled += 2;

        // start collecting the next rectangle from scratch
        this->curIndex = 0;
    }
};

// Primitive Assembly for data output from the DomainShader.
//
// Vertices are gathered from a caller-supplied vertex buffer through up to three per-vertex
// index arrays. The buffer is read either with each component strided by
// m_attributeStrideInVectors vectors (m_SOA == true) or with the four components of an
// attribute adjacent in memory (m_SOA == false).
struct PA_TESS : PA_STATE
{
    // @param in_pDC                       draw context forwarded to PA_STATE
    // @param in_pVertData                 base of the vertex data to gather from
    // @param in_attributeStrideInVectors  stride between components, in SIMD vectors (SOA path)
    // @param in_vertexStride              forwarded to PA_STATE as the vertex stride
    // @param in_numAttributes             number of attribute slots (used for slot asserts)
    // @param in_ppIndices                 one index array per vertex of the primitive
    // @param in_numPrims                  total number of primitives to assemble
    // @param in_binTopology               TOP_POINT_LIST, TOP_LINE_LIST or TOP_TRIANGLE_LIST
    // @param numVertsPerPrim              forwarded to PA_STATE
    // @param SOA                          selects the SOA (true) or non-SOA (false) addressing
    PA_TESS(DRAW_CONTEXT*     in_pDC,
            const SIMDSCALAR* in_pVertData,
            uint32_t          in_attributeStrideInVectors,
            uint32_t          in_vertexStride,
            uint32_t          in_numAttributes,
            uint32_t* (&in_ppIndices)[3],
            uint32_t           in_numPrims,
            PRIMITIVE_TOPOLOGY in_binTopology,
            uint32_t           numVertsPerPrim,
            bool               SOA = true) :

        PA_STATE(in_pDC, nullptr, 0, in_vertexStride, numVertsPerPrim),
        m_pVertexData(in_pVertData), m_attributeStrideInVectors(in_attributeStrideInVectors),
        m_numAttributes(in_numAttributes), m_numPrims(in_numPrims), m_SOA(SOA)
    {
#if USE_SIMD16_FRONTEND
        m_vPrimId = _simd16_setzero_si();
#else
        m_vPrimId = _simd_setzero_si();
#endif
        binTopology    = in_binTopology;
        m_ppIndices[0] = in_ppIndices[0];
        m_ppIndices[1] = in_ppIndices[1];
        m_ppIndices[2] = in_ppIndices[2];

        // the number of vertices gathered per primitive is derived from the bin topology
        switch (binTopology)
        {
        case TOP_POINT_LIST:
            m_numVertsPerPrim = 1;
            break;

        case TOP_LINE_LIST:
            m_numVertsPerPrim = 2;
            break;

        case TOP_TRIANGLE_LIST:
            m_numVertsPerPrim = 3;
            break;

        default:
            SWR_INVALID("Invalid binTopology (%d) for %s", binTopology, __FUNCTION__);
            break;
        }
    }

    // True while there are primitives left to assemble.
    bool HasWork() { return m_numPrims != 0; }

    // Not implemented for this assembler; reports SWR_INVALID and returns a junk vector.
    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
    {
        SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
        return junkVector;
    }

#if ENABLE_AVX512_SIMD16
    // Not implemented for this assembler; reports SWR_INVALID and returns a junk vector.
    simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
    {
        SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
        return junkVector_simd16;
    }

#endif
    // Builds a lane mask whose first numPrims lanes are -1 and whose remaining lanes are 0,
    // by loading SIMD_WIDTH entries starting numPrims entries before the -1/0 boundary of a
    // constant table. numPrims must not exceed SIMD_WIDTH.
    static SIMDSCALARI GenPrimMask(uint32_t numPrims)
    {
        SWR_ASSERT(numPrims <= SIMD_WIDTH);
#if USE_SIMD16_FRONTEND
        static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] = {
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0};

        return _simd16_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
#else
        static const OSALIGNLINE(int32_t)
            maskGen[SIMD_WIDTH * 2] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0};

        return _simd_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
#endif
    }

    // Gathers attribute 'slot' (4 components) for each vertex of the current batch of
    // primitives into verts[]. Lanes beyond the number of primitives in the batch are masked
    // off and receive zero.
    //
    // @return false when no primitives remain, true otherwise.
    //
    // NOTE(review): unlike the simd16vector overload and AssembleSingle, this overload does not
    // scale the indices by (vertexStride / 4) for the non-SOA layout - confirm whether that is
    // intended.
    bool Assemble(uint32_t slot, simdvector verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);

        uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
        if (0 == numPrimsToAssemble)
        {
            return false;
        }

        SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);

        // locate the first component of the requested attribute
        const float* pBaseAttrib;
        if (m_SOA)
        {
            pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        }
        else
        {
            const float* pVertData = (const float*)m_pVertexData;
            pBaseAttrib            = pVertData + slot * 4;
        }

        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#else
            SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#endif

            const float* pBase = pBaseAttrib;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                simd16scalar temp =
                    _simd16_mask_i32gather_ps(_simd16_setzero_ps(),
                                              pBase,
                                              indices,
                                              _simd16_castsi_ps(mask),
                                              4 /* gcc doesn't like sizeof(float) */);

                // pick the simd8 half of the simd16 gather selected by useAlternateOffset
                verts[i].v[c] =
                    useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
#else
                verts[i].v[c] = _simd_mask_i32gather_ps(_simd_setzero_ps(),
                                                        pBase,
                                                        indices,
                                                        _simd_castsi_ps(mask),
                                                        4); // gcc doesn't like sizeof(float)
#endif
                // move base to next component
                if (m_SOA)
                {
                    pBase += m_attributeStrideInVectors * SIMD_WIDTH;
                }
                else
                {
                    // pBase is a float pointer, so the next component is one element away;
                    // stepping by sizeof(float) would skip four floats.
                    ++pBase;
                }
            }
        }

        return true;
    }

#if ENABLE_AVX512_SIMD16
    // simd16 variant of Assemble: gathers attribute 'slot' (4 components) for each vertex of
    // the current batch of primitives into verts[]. For the non-SOA layout with the simd16
    // frontend, indices are first scaled by (vertexStride / 4).
    //
    // @return false when no primitives remain, true otherwise.
    bool Assemble(uint32_t slot, simd16vector verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);

        uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
        if (0 == numPrimsToAssemble)
        {
            return false;
        }

        SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);

        // locate the first component of the requested attribute
        const float* pBaseAttrib;
        if (m_SOA)
        {
            pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        }
        else
        {
            const float* pVertData = (const float*)m_pVertexData;
            pBaseAttrib            = pVertData + slot * 4;
        }

        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
            if (!m_SOA)
            {
                indices = _simd16_mullo_epi32(indices, _simd16_set1_epi32(vertexStride / 4));
            }
#else
            SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#endif

            const float* pBase = pBaseAttrib;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                verts[i].v[c] = _simd16_mask_i32gather_ps(_simd16_setzero_ps(),
                                                          pBase,
                                                          indices,
                                                          _simd16_castsi_ps(mask),
                                                          4 /* gcc doesn't like sizeof(float) */);
#else
                simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(),
                                                          pBase,
                                                          indices,
                                                          _simd_castsi_ps(mask),
                                                          4 /* gcc doesn't like sizeof(float) */);
                // place the simd8 result in the low half of a zeroed simd16 register
                verts[i].v[c]   = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
#endif
                // move base to next component
                if (m_SOA)
                {
                    pBase += m_attributeStrideInVectors * SIMD_WIDTH;
                }
                else
                {
                    pBase++;
                }
            }
        }

        return true;
    }

#endif
    // Scalar assembly of one primitive: copies the four components of attribute 'slot' for each
    // vertex of primitive 'primIndex' (relative to the current batch) into verts[].
    void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);

        SWR_ASSERT(primIndex < PA_TESS::NumPrims());

        // locate the first component of the requested attribute
        const float* pVertDataBase;
        if (m_SOA)
        {
            pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        }
        else
        {
            const float* pVertData = (const float*)m_pVertexData;
            pVertDataBase          = pVertData + slot * 4;
        }

        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            uint32_t index = useAlternateOffset ? m_ppIndices[i][primIndex + SIMD_WIDTH_DIV2]
                                                : m_ppIndices[i][primIndex];
            if (!m_SOA)
            {
                index *= (vertexStride / 4);
            }
#else
            uint32_t index = m_ppIndices[i][primIndex];
#endif
            const float* pVertData = pVertDataBase;
            float*       pVert     = (float*)&verts[i];

            for (uint32_t c = 0; c < 4; ++c)
            {
                pVert[c] = pVertData[index];

                // move to next component
                if (m_SOA)
                {
                    pVertData += m_attributeStrideInVectors * SIMD_WIDTH;
                }
                else
                {
                    pVertData++;
                }
            }
        }
    }

    // Consumes the current batch: reduces the remaining primitive count and advances the three
    // index arrays by the batch size.
    //
    // @return true if primitives remain after the advance.
    bool NextPrim()
    {
        uint32_t numPrims = PA_TESS::NumPrims();
        m_numPrims -= numPrims;
        m_ppIndices[0] += numPrims;
        m_ppIndices[1] += numPrims;
        m_ppIndices[2] += numPrims;

        return HasWork();
    }

    // Not implemented for this assembler; returns a junk vertex.
    SIMDVERTEX& GetNextVsOutput()
    {
        SWR_NOT_IMPL;
        return junkVertex;
    }

    // Not implemented for this assembler; always returns false.
    bool GetNextStreamOutput()
    {
        SWR_NOT_IMPL;
        return false;
    }

    // Not implemented for this assembler; returns a junk index mask.
    SIMDMASK& GetNextVsIndices()
    {
        SWR_NOT_IMPL;
        return junkIndices;
    }

    // Number of primitives in the current batch: the remaining count clamped to SIMD_WIDTH.
    uint32_t NumPrims() { return std::min<uint32_t>(m_numPrims, SIMD_WIDTH); }

    // Not implemented for this assembler.
    void Reset() { SWR_NOT_IMPL; }

    // Returns startID broadcast to every lane plus m_vPrimId (which the constructor zeroes).
    SIMDSCALARI GetPrimID(uint32_t startID)
    {
#if USE_SIMD16_FRONTEND
        return _simd16_add_epi32(_simd16_set1_epi32(startID), m_vPrimId);
#else
        return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId);
#endif
    }

private:
    const SIMDSCALAR* m_pVertexData              = nullptr; // base of the vertex data to gather
    uint32_t          m_attributeStrideInVectors = 0;       // component stride, in SIMD vectors
    uint32_t          m_numAttributes            = 0;       // upper bound for 'slot' asserts
    uint32_t          m_numPrims                 = 0;       // primitives still to be assembled
    uint32_t*         m_ppIndices[3];                       // per-vertex index arrays

    uint32_t m_numVertsPerPrim = 0; // 1, 2 or 3 depending on binTopology

    SIMDSCALARI m_vPrimId; // per-lane value added to startID in GetPrimID

    simdvector junkVector; // junk simdvector for unimplemented API
#if ENABLE_AVX512_SIMD16
    simd16vector junkVector_simd16; // junk simd16vector for unimplemented API
#endif
    SIMDVERTEX junkVertex;  // junk SIMDVERTEX for unimplemented API
    SIMDMASK   junkIndices; // temporary index store for unused virtual function

    bool m_SOA; // true: components strided by m_attributeStrideInVectors; false: adjacent
};

// Primitive Assembler factory: owns storage for both assembler flavours and constructs the
// one that matches the draw state (cut-aware or optimized) in place.
template <typename IsIndexedT, typename IsCutIndexEnabledT>
struct PA_FACTORY
{
    // Chooses and constructs the assembler for this draw. The cut-aware assembler is used for
    // indexed draws with cut indices on the supported topologies, and for every adjacency
    // topology; otherwise the optimized assembler is used.
    PA_FACTORY(DRAW_CONTEXT*         pDC,
               PRIMITIVE_TOPOLOGY    in_topo,
               uint32_t              numVerts,
               PA_STATE::SIMDVERTEX* pVertexStore,
               uint32_t              vertexStoreSize,
               uint32_t              vertexStride,
               uint32_t              numVertsPerPrim) :
        topo(in_topo)
    {
#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
        const API_STATE& state = GetApiState(pDC);

        // topologies for which an indexed draw with cut indices needs the cut-aware PA
        const bool cutCapableTopo = (topo == TOP_TRIANGLE_STRIP) || (topo == TOP_POINT_LIST) ||
                                    (topo == TOP_LINE_LIST) || (topo == TOP_LINE_STRIP) ||
                                    (topo == TOP_TRIANGLE_LIST);
        const bool indexedWithCuts =
            IsIndexedT::value && IsCutIndexEnabledT::value && cutCapableTopo;

        // non-indexed draws with adjacency topologies must use cut-aware PA until we add
        // support for them in the optimized PA
        const bool adjacencyTopo = (topo == TOP_LINE_LIST_ADJ) || (topo == TOP_LISTSTRIP_ADJ) ||
                                   (topo == TOP_TRI_LIST_ADJ) || (topo == TOP_TRI_STRIP_ADJ);

        if (indexedWithCuts || adjacencyTopo)
        {
            memset(&indexStore, 0, sizeof(indexStore));
            uint32_t attribCount = state.feNumAttributes;

            new (&this->paCut) PA_STATE_CUT(pDC,
                                            reinterpret_cast<uint8_t*>(pVertexStore),
                                            vertexStoreSize * PA_STATE::SIMD_WIDTH,
                                            vertexStride,
                                            &this->indexStore[0],
                                            numVerts,
                                            attribCount,
                                            state.topology,
                                            false,
                                            numVertsPerPrim);
            cutPA = true;
            return;
        }
#endif

        // default: the optimized assembler, sized by the primitive count for this topology
        uint32_t primCount = GetNumPrims(in_topo, numVerts);
        new (&this->paOpt) PA_STATE_OPT(pDC,
                                        primCount,
                                        reinterpret_cast<uint8_t*>(pVertexStore),
                                        vertexStoreSize * PA_STATE::SIMD_WIDTH,
                                        vertexStride,
                                        false,
                                        numVertsPerPrim);
        cutPA = false;
    }

    // Returns the assembler selected by the constructor.
    PA_STATE& GetPA()
    {
#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
        if (cutPA)
        {
            return this->paCut;
        }
#endif
        return this->paOpt;
    }

    PA_STATE_OPT paOpt;
    PA_STATE_CUT paCut;

    bool cutPA{false};

    PRIMITIVE_TOPOLOGY topo{TOP_UNKNOWN};

    PA_STATE::SIMDMASK indexStore[MAX_NUM_VERTS_PER_PRIM];
};