
#ifndef PARAMS_APPLY_HPP_
#define PARAMS_APPLY_HPP_

//------------------------------------------------------------------------------
// definitions for all variants of block_apply
//------------------------------------------------------------------------------

// NOTE: the macros below expand to names that are not declared in this header
// (shMemory, myTask, glF, glVT, fn, TILESIZE, NUMTHREADS, CEIL, threadIdx).
// Those names must be in scope wherever a macro is used.

// maximum number of row and column tiles in a panel
#define MAX_ROW_TILES 3
#define MAX_COL_TILES 2

#define shV shMemory.apply.V
#define shC shMemory.apply.C

// each tile is 32-by-32, which is always M, for all variants
// (M is defined inside block_apply.cu, and then #undef'd there also)
// #define M TILESIZE

// V1 is held in the lower triangular part of the glVT array, including the
// diagonal.  The glVT array is of size (M+1)-by-M.  The upper triangular part
// holds T, (also including a diagonal).  The row index is shifted by one.
#define GLVT(i,j) (glVT [1+(i)][j])

// The shared array V is K-by-M (1 to 3 tiles of size M-by-M), with an
// extra column for padding.  It is indexed first by the t-th row tile, and
// then (i,j) within that t-th tile.  V is lower triangular, and shares its
// space with the upper triangular T matrix.
#define SHV(t,i,j) (shV [1+ (t)*TILESIZE + (i)][j])

// Macros for accessing entries in a frontal matrix.  The A matrix and most of
// the V matrix reside in the frontal matrix as a set of tiles.  The row index
// of GLF(t,i,j) is defined by row tile t and by a row index i within that tile
// (i is in the range 0 to the tilesize-1).  Column j refers to the global
// column index in F.  fi = IFRONT(t,i) translates the tile t and row i inside
// that tile to an index fi which is in the range 0 to fm-1, which is an index
// into the front in global memory.
#define IFRONT(t,i) ((i) + myTask.extra [t])
#define GLF(t,i,j) glF [IFRONT(t,i) * fn + (j)]

// C is used to buffer A, when computing C=V'*A
#define SHA(i,j) (shC [i][j])

// T is upper triangular of size M-by-M, and shares its space with V
#define ST(i,j) (shV [i][j])

// Each thread loads V(iv,jv) from global, and then iv+chunksize,
// iv+2*chunksize, etc.  With M = 32 and 384 threads, the chunksize is 12,
// and the number of chunks is 3.
#define iv (threadIdx.x / TILESIZE)
#define jv (threadIdx.x % TILESIZE)
#define VCHUNKSIZE (NUMTHREADS / TILESIZE)
// parenthesized so the expansion groups as a single operand regardless of how
// CEIL itself is written
#define NVCHUNKS (CEIL (TILESIZE*TILESIZE, NUMTHREADS))

// device functions block_apply, one for each variant
__device__ void block_apply_3 ( ) ;
__device__ void block_apply_2 ( ) ;
__device__ void block_apply_1 ( ) ;
__device__ void block_apply_3_by_1 ( ) ;
__device__ void block_apply_2_by_1 ( ) ;
__device__ void block_apply_1_by_1 ( ) ;

// NOTE(review): this header previously declared the 2-by-1 variant only as
// block_apply_2_by_1_ (trailing underscore), which does not match the naming
// of the other variants.  The old spelling is retained so that any existing
// reference still compiles -- TODO confirm it is unused and remove it.
__device__ void block_apply_2_by_1_ ( ) ;

#endif
