// =============================================================================
// === GPUQREngine/Include/Kernel/Apply/block_apply_chunk.cu ===================
// =============================================================================

//------------------------------------------------------------------------------
// block_apply_chunk macro
//------------------------------------------------------------------------------

// A = A - V*T'*V'*A, for a single chunk of N columns of A, starting at column
// j1 and ending at j1+N-1.
//
// This function uses fixed thread geometry and loop unrolling, which requires
// the geometry to be known at compile time for best efficiency.  It is then
// #include'd by the block_apply_x function (block_apply.cu).  The following
// terms are #define'd by each specific version:
//
//      ROW_PANELSIZE    # of row tiles in V and A
//      COL_PANELSIZE    # of column tiles in C and A
//      CBITTYROWS       # of rows in the C bitty block
//      CBITTYCOLS       # of cols in the C bitty block
//      ABITTYROWS       # of rows in the A bitty block
//      ABITTYCOLS       # of cols in the A bitty block
//
// The C bitty must cannot be larger than the A bitty block, since additional
// registers are used to buffer the A matrix while the C bitty block is being
// computed.  These buffer registers are not used while computing with the A
// bitty block, so for some variants of this kernel, they can be overlapped
// with the A bitty block.
//
// The ROW_PANELSIZE, COL_PANELSIZE, ROW_EDGE_CASE, and COL_EDGE_CASE are
// #define'd by the parent file(s) that include this file.  The *_EDGE_CASE
// macros are then #undefined here.  The bitty block dimensions are defined
// below.  This file is #include'd into block_apply.cu.  It is not a standalone
// function.

{

    //--------------------------------------------------------------------------
    // bitty block sizes
    //--------------------------------------------------------------------------

    #if (ROW_PANELSIZE == 3)

        #if (COL_PANELSIZE == 2)

            //------------------------------------------------------------------
            // 3-by-2 block apply
            //------------------------------------------------------------------

            // V is 3-by-1, C is 1-by-2, A is 3-by-2 (in # tiles)
            // 256 threads, each does a 4-by-2 block of C = T'*V'*A
            #define CBITTYROWS      4
            #define CBITTYCOLS      2
            // 384 threads, each does a 4-by-4 block of A = A-V*C
            #define ABITTYROWS      4
            #define ABITTYCOLS      4

        #else

            //------------------------------------------------------------------
            // 3-by-1 block apply
            //------------------------------------------------------------------

            // V is 3-by-1, C is 1-by-1, A is 3-by-1 (in # tiles)
            // 256 threads, each does a 2-by-2 block of C = T'*V'*A
            #define CBITTYROWS      2
            #define CBITTYCOLS      2
            // 384 threads, each does a 2-by-4 block of A = A-V*C
            #define ABITTYROWS      2
            #define ABITTYCOLS      4

        #endif

    #elif (ROW_PANELSIZE == 2)

        #if (COL_PANELSIZE == 2)

            //------------------------------------------------------------------
            // block_apply_2_by_2
            //------------------------------------------------------------------

            // V is 2-by-1, C is 1-by-2, A is 2-by-2 (in # tiles)
            // 256 threads, each does a 4-by-2 block of C = T'*V'*A
            #define CBITTYROWS      4
            #define CBITTYCOLS      2
            // 256 threads, each does a 4-by-4 block of A = A-V*C
            #define ABITTYROWS      4
            #define ABITTYCOLS      4

        #else

            //------------------------------------------------------------------
            // block_apply_2_by_1
            //------------------------------------------------------------------

            // V is 2-by-1, C is 1-by-1, A is 2-by-1 (in # tiles)
            // 256 threads, each does a 2-by-2 block of C = T'*V'*A
            #define CBITTYROWS      2
            #define CBITTYCOLS      2
            // 256 threads, each does a 2-by-4 block of A = A-V*C
            #define ABITTYROWS      2
            #define ABITTYCOLS      4


        #endif

    #else

        #if (COL_PANELSIZE == 2)

            //------------------------------------------------------------------
            // block_apply_1_by_2
            //------------------------------------------------------------------

            // V is 1-by-1, C is 1-by-2, A is 1-by-2 (in # tiles)
            // 256 threads, each does a 4-by-2 block of C = T'*V'*A
            #define CBITTYROWS      2
            #define CBITTYCOLS      4
            // 256 threads, each does a 4-by-2 block of A = A-V*C
            #define ABITTYROWS      2
            #define ABITTYCOLS      4

        #else

            //------------------------------------------------------------------
            // block_apply_1_by_1
            //------------------------------------------------------------------

            // V is 1-by-1, C is 1-by-1, A is 1-by-1 (in # tiles)
            // 256 threads, each does a 2-by-2 block of C = T'*V'*A
            #define CBITTYROWS      2
            #define CBITTYCOLS      2
            // 256 threads, each does a 2-by-2 block of A = A-V*C
            #define ABITTYROWS      2
            #define ABITTYCOLS      2

        #endif

    #endif

    //--------------------------------------------------------------------------
    // matrix sizes and thread geometry
    //--------------------------------------------------------------------------

    // For each outer iteration, C is M-by-N, V is (K+1)-by-M (with an extra
    // row for T), and A is K-by-N.
    #define K           (ROW_PANELSIZE * M)
    #define N           (COL_PANELSIZE * M)

    // threads to use for C=T'*(V'*A)
    #define CTHREADS    ((M * N) / (CBITTYROWS * CBITTYCOLS))

    // threads to use for A=A-V*C
    #define ATHREADS    ((K * N) / (ABITTYROWS * ABITTYCOLS))

    //--------------------------------------------------------------------------
    // bitty blocks for the computation
    //--------------------------------------------------------------------------

    // Each thread owns a bitty block of C for C=T'*V'*A.  The top left entry
    // owned by a thread is C(ic,jc).  Thread 0 does C(0,0), thread 1 does
    // C(1,0) ...
    #define ic          (threadIdx.x % (M/CBITTYROWS))
    #define jc          (threadIdx.x / (M/CBITTYROWS))
    #define MYCBITTYROW(ii) (ii * (M/CBITTYROWS) + ic)
    #define MYCBITTYCOL(jj) (jj * (N/CBITTYCOLS) + jc)

    // Each thread owns a bitty block of A for A=A-V*C, with top left entry
    // A(ia,ja).  Thread 0 does A(0,0), thread 1 does A(0,1), thread 2 does
    // A(0,2), ... so that global memory loads/stores are coallesced across a
    // warp.
    #define ia          (threadIdx.x / (N/ABITTYCOLS))
    #define ja          (threadIdx.x % (N/ABITTYCOLS))
    #define MYABITTYROW(ii) (ii * (K/ABITTYROWS) + ia)
    #define MYABITTYCOL(jj) (jj * (N/ABITTYCOLS) + ja)

    //--------------------------------------------------------------------------
    // loading the A matrix
    //--------------------------------------------------------------------------

    // Each thread loads a set of entries of A defined by iaload and jaload.
    // The first entry loaded by a thread is A(iaload,jaload), and then it
    // loads entries every ACHUNKSIZE rows after that (in the same column
    // jaload).
    #define iaload      (threadIdx.x / N)
    #define jaload      (threadIdx.x % N)
    #define ACHUNKSIZE  (NUMTHREADS / N)
    #define NACHUNKS    CEIL (HALFTILE*N, NUMTHREADS)

    int fjload = j1 + jaload ;

    //--------------------------------------------------------------------------
    // register allocation
    //--------------------------------------------------------------------------

    // C bitty block is no larger than the A bitty block, in both dimensions.
    double rbit [ABITTYROWS][ABITTYCOLS] ;
    double rrow [ABITTYROWS] ;
    double rcol [ABITTYCOLS] ;

    #if (CBITTYCOLS == ABITTYCOLS)
        // the A bitty block is too small to hold the A buffer
        double abuffer [NACHUNKS] ;
        #define rbitA(i) abuffer [i]
    #else
        // use the last column of the A bitty block for the A buffer
        #define rbitA(i) (rbit [i][ABITTYCOLS-1])
    #endif

    //--------------------------------------------------------------------------
    // edge case
    //--------------------------------------------------------------------------

    #ifdef ROW_EDGE_CASE
        // check if a row is inside the front.
        #define INSIDE_ROW(test) (test)
    #else
        // the row is guaranteed to reside inside the frontal matrix.
        #define INSIDE_ROW(test) (1)
    #endif

    #ifdef COL_EDGE_CASE
        // check if a column is inside the front.
        #define INSIDE_COL(test) (test)
    #else
        // the column is guaranteed to reside inside the frontal matrix.
        #define INSIDE_COL(test) (1)
    #endif

    bool aloader = INSIDE_COL (fjload < fn) ;

    //--------------------------------------------------------------------------
    // C = V'*A, where V is now in shared, and A is loaded from global
    //--------------------------------------------------------------------------

    // prefetch the first halftile of A from global to register
    #pragma unroll
    for (int ii = 0 ; ii < NACHUNKS ; ii++)
    {
        rbitA (ii) = 0 ;
    }
    #pragma unroll
    for (int ii = 0 ; ii < NACHUNKS ; ii++)
    {
        int i = ii * ACHUNKSIZE + iaload ;
        if (ii < NACHUNKS-1 || i < HALFTILE)
        {
            int fi = IFRONT (0, i) ;
            if (aloader && INSIDE_ROW (fi < fm))
            {
                rbitA (ii) = glF [fi * fn + fjload] ;
            }
        }
    }

    // The X=V*C computation in the prior iteration reads shC, but the same
    // space is used to load A from the frontal matrix in this iteration.
    __syncthreads ( ) ;

    // clear the C bitty block
    #pragma unroll
    for (int ii = 0 ; ii < CBITTYROWS ; ii++)
    {
        #pragma unroll
        for (int jj = 0 ; jj < CBITTYCOLS ; jj++)
        {
            rbit [ii][jj] = 0 ;
        }
    }

    // C=V'*A for the first tile of V, which is lower triangular
    #define FIRST_TILE
    #include "cevta_tile.cu"
    
    // Subsequent tiles of V are square.  Result is in C bitty block.
    for (int t = 1 ; t < ROW_PANELSIZE ; t++)
    {
        #include "cevta_tile.cu"
    }

    //--------------------------------------------------------------------------
    // write result of C=V'*A into shared, and clear the C bitty block
    //--------------------------------------------------------------------------

    if (CTHREADS == NUMTHREADS || threadIdx.x < CTHREADS)
    {
        #pragma unroll
        for (int ii = 0 ; ii < CBITTYROWS ; ii++)
        {
            int i = MYCBITTYROW (ii) ;
            #pragma unroll
            for (int jj = 0 ; jj < CBITTYCOLS ; jj++)
            {
                int j = MYCBITTYCOL (jj) ;
                shC [i][j] = rbit [ii][jj] ;
                rbit [ii][jj] = 0 ;
            }
        }
    }

    // make sure all of shC is available to all threads
    __syncthreads ( ) ;

    //--------------------------------------------------------------------------
    // C = triu(T)'*C, leaving the result in the C bitty block
    //--------------------------------------------------------------------------

    if (CTHREADS == NUMTHREADS || threadIdx.x < CTHREADS)
    {
        #pragma unroll
        for (int i = 0 ; i < M ; i++)
        {
            #pragma unroll
            for (int ii = 0 ; ii < CBITTYROWS ; ii++)
            {
                int j = MYCBITTYROW (ii) ;
                if (i <= j)
                {
                    rrow [ii] = ST (i,j) ;
                }
            }
            #pragma unroll
            for (int jj = 0 ; jj < CBITTYCOLS ; jj++)
            {
                int j = MYCBITTYCOL (jj) ;
                rcol [jj] = shC [i][j] ;
            }
            #pragma unroll
            for (int ii = 0 ; ii < CBITTYROWS ; ii++)
            {
                int j = MYCBITTYROW (ii) ;
                if (i <= j)
                {
                    #pragma unroll
                    for (int jj = 0 ; jj < CBITTYCOLS ; jj++)
                    {
                        rbit [ii][jj] += rrow [ii] * rcol [jj] ;
                    }
                }                
            }
        }
    }

    // We need syncthreads here because of the write-after-read hazard.  Each
    // thread reads the old C, above, and then C is modified below with the new
    // C, where newC = triu(T)'*oldC.
    __syncthreads ( ) ;

    //--------------------------------------------------------------------------
    // write the result of C = T'*C to shared memory
    //--------------------------------------------------------------------------

    if (CTHREADS == NUMTHREADS || threadIdx.x < CTHREADS)
    {
        #pragma unroll
        for (int ii = 0 ; ii < CBITTYROWS ; ii++)
        {
            int i = MYCBITTYROW (ii) ;
            #pragma unroll
            for (int jj = 0 ; jj < CBITTYCOLS ; jj++)
            {
                int j = MYCBITTYCOL (jj) ;
                shC [i][j] = rbit [ii][jj] ;
            }
        }
    }

    // All threads come here.  We need a syncthreads because
    // shC has been written above and must be read below in A=A-V*C.
    __syncthreads ( ) ;

    //--------------------------------------------------------------------------
    // A = A - V*C
    //--------------------------------------------------------------------------

    if (ATHREADS == NUMTHREADS || threadIdx.x < ATHREADS)
    {

        //----------------------------------------------------------------------
        // clear the A bitty block
        //----------------------------------------------------------------------

        #pragma unroll
        for (int ii = 0 ; ii < ABITTYROWS ; ii++)
        {
            #pragma unroll
            for (int jj = 0 ; jj < ABITTYCOLS ; jj++)
            {
                rbit [ii][jj] = 0 ;
            }
        }

        //----------------------------------------------------------------------
        // X = tril(V)*C, store result into register (rbit)
        //----------------------------------------------------------------------

        #pragma unroll
        for (int p = 0 ; p < M ; p++)
        {
            #pragma unroll
            for (int ii = 0 ; ii < ABITTYROWS ; ii++)
            {
                int i = MYABITTYROW (ii) ;
                if (i >= p)
                {
                    rrow [ii] = shV [1+i][p] ;
                }
            }
            #pragma unroll
            for (int jj = 0 ; jj < ABITTYCOLS ; jj++)
            {
                int j = MYABITTYCOL (jj) ;
                rcol [jj] = shC [p][j] ;
            }
            #pragma unroll
            for (int ii = 0 ; ii < ABITTYROWS ; ii++)
            {
                int i = MYABITTYROW (ii) ;
                if (i >= p)
                {
                    #pragma unroll
                    for (int jj = 0 ; jj < ABITTYCOLS ; jj++)
                    {
                        rbit [ii][jj] += rrow [ii] * rcol [jj] ;
                    }
                }
            }
        }

        //----------------------------------------------------------------------
        // A = A - X, which finalizes the computation A = A - V*(T'*(V'*A))
        //----------------------------------------------------------------------
        
        #if (COL_PANELSIZE == 2)
        
            #pragma unroll
            for (int ii = 0 ; ii < ABITTYROWS ; ii++)
            {
                int i = MYABITTYROW (ii) ;
                int fi = IFRONT (i / M, i % M) ;
                #pragma unroll
                for (int jj = 0 ; jj < ABITTYCOLS ; jj++)
                {
                    int fj = j1 + MYABITTYCOL (jj) ;
                    if (INSIDE_ROW (fi < fm) && INSIDE_COL (fj < fn))
                    {
                        glF [fi * fn + fj] -= rbit [ii][jj] ;
                    }
                }
            }

        #else
        
            #pragma unroll
            for (int ii = 0 ; ii < ABITTYROWS ; ii++)
            {
                int i = MYABITTYROW (ii) ;
                int fi = IFRONT (i / M, i % M) ;
                #pragma unroll
                for (int jj = 0 ; jj < ABITTYCOLS ; jj++)
                {
                    int fj = j1 + MYABITTYCOL (jj) ;
                    if (INSIDE_ROW (fi < fm) && INSIDE_COL (fj < fn))
                    {
                        shV[i][MYABITTYCOL(jj)] = glF[fi*fn+fj] - rbit[ii][jj];
                    }
                    else
                    {
                        shV[i][MYABITTYCOL(jj)] = 0.0;
                    }
                }
            }
    
        #endif
    }

    //--------------------------------------------------------------------------
    // sync
    //--------------------------------------------------------------------------

    // The X=V*C computation in this iteration reads shC, but the same space is
    // used to load A from the frontal matrix in C=V'*A in the next iteration.
    // This final sync also ensures that all threads finish the block_apply
    // at the same time.  Thus, no syncthreads is needed at the start of a
    // subsequent function (the pipelined apply+factorize, for example).

    __syncthreads ( ) ;
}

//------------------------------------------------------------------------------
// undef's
//------------------------------------------------------------------------------

// The following #define's appear above.  Note that FIRST_TILE is not #undef'd
// since that is done by cevta_tile.cu.
#undef CBITTYROWS
#undef CBITTYCOLS
#undef ABITTYROWS
#undef ABITTYCOLS

#undef K
#undef N

#undef CTHREADS
#undef ATHREADS

#undef ic
#undef jc
#undef MYCBITTYROW
#undef MYCBITTYCOL

#undef ia
#undef ja
#undef MYABITTYROW
#undef MYABITTYCOL

#undef iaload
#undef jaload
#undef ACHUNKSIZE
#undef NACHUNKS

#undef rbitA
#undef INSIDE_ROW
#undef INSIDE_COL

// Defined in the parent file that includes this one.  Note that ROW_PANELSIZE
// is not #undef'd, since that is done in the parent.
#undef ROW_EDGE_CASE
#undef COL_EDGE_CASE