#ifndef PARAMS_APPLY_HPP_
#define PARAMS_APPLY_HPP_

//------------------------------------------------------------------------------
// definitions for all variants of block_apply
//------------------------------------------------------------------------------

// This header only provides macros and prototypes.  The macros expand to
// expressions that refer to names which are not defined here and must be
// visible wherever a macro is used:  TILESIZE, NUMTHREADS, CEIL, shMemory,
// glVT, glF, fn, myTask, and (for iv and jv) threadIdx.

// maximum number of row and column tiles in a panel
#define MAX_ROW_TILES 3
#define MAX_COL_TILES 2

// shorthand for the V and C members of shMemory.apply
#define shV shMemory.apply.V
#define shC shMemory.apply.C

// each tile is 32-by-32, which is always M, for all variants
// (M is defined inside block_apply.cu, and then #undef'd there also)
// #define M TILESIZE

// V1 is held in the lower triangular part of the glVT array, including the
// diagonal.  The glVT array is of size (M+1)-by-M.  The upper triangular part
// holds T, (also including a diagonal).  The row index is shifted by one, so
// entry (i,j) of V1 is glVT [i+1][j].
#define GLVT(i,j)   (glVT [1+(i)][j])

// The shared array V is K-by-M (1 to 3 tiles of size M-by-M), with an
// extra column for padding.  It is indexed first by the t-th row tile, and
// then (i,j) within that t-th tile.  V is lower triangular, and shares its
// space with the upper triangular T matrix.  Rows of V are shifted down by
// one relative to T (compare ST below), so the diagonal of V, at
// shV [i+1][i], does not collide with the diagonal of T, at shV [i][i].
#define SHV(t,i,j)  (shV [1+ (t)*TILESIZE + (i)][j])

// Macros for accessing entries in a frontal matrix.  The A matrix and most of
// the V matrix reside in the frontal matrix as a set of tiles.  The row index
// of GLF(t,i,j) is defined by row tile t and by a row index i within that tile
// (i is in the range 0 to the tilesize-1).  Column j refers to the global
// column index in F.  fi = IFRONT(t,i) translates the tile t and row i inside
// that tile to an index fi which is in the range 0 to fm-1, which is an index
// into the front in global memory.  glF is addressed as a flat array with fn
// entries per row.
#define IFRONT(t,i) ((i) + myTask.extra [t])
#define GLF(t,i,j)  (glF [IFRONT(t,i) * fn + (j)])

// C is used to buffer A, when computing C=V'*A
#define SHA(i,j)    (shC [i][j])

// T is upper triangular of size M-by-M, and shares its space with V
#define ST(i,j)     (shV [i][j])

// Each thread loads V(iv,jv) from global, and then iv+chunksize,
// iv+2*chunksize, etc.  With M = 32 and 384 threads, the chunksize is 12,
// and the number of chunks is 3.  iv and jv are object-like macros: each use
// re-derives the value from threadIdx.x.
#define iv          (threadIdx.x / TILESIZE)
#define jv          (threadIdx.x % TILESIZE)
#define VCHUNKSIZE  (NUMTHREADS / TILESIZE)
#define NVCHUNKS    (CEIL (TILESIZE*TILESIZE, NUMTHREADS))

// device functions block_apply, one for each variant
// NOTE(review): the suffixes presumably give the panel shape in tiles (row
// tiles, and "_by_1" for a single column tile) -- confirm in block_apply.cu.
__device__ void block_apply_3 ( ) ;
__device__ void block_apply_2 ( ) ;
__device__ void block_apply_1 ( ) ;
__device__ void block_apply_3_by_1 ( ) ;
__device__ void block_apply_2_by_1 ( ) ;
__device__ void block_apply_1_by_1 ( ) ;

#endif