// =============================================================================
// === GPUQREngine/Include/Kernel/Apply/pipelined_rearrange.cu =================
// =============================================================================

//------------------------------------------------------------------------------
// pipelined_rearrange
//------------------------------------------------------------------------------

// This file is a kernel fragment: it is #included into a kernel body (see
// vt_factorize.cu) and relies on names declared there (myTask, shMemory,
// TILESIZE, NUMTHREADS, CEIL, EMPTY, fm, fn, IsApplyFactorize).  All
// NUMTHREADS threads of the block participate.

/*
    PSEUDO #define MACROS (copied from vt_factorize.cu)
        N
            The # of columns to operate on (should always be TILESIZE).
        INSIDE
            Substitute in a condition depending on compilation options.
            For this code, we always assume we need to check edge cases.
        NACHUNKS
            A chunking scheme used in the factorization kernel.  We use
            the same layout and thread dimension for our tile load/stores.
        glA
            Shorthand for the index computation into the global A.
        shA
            Shorthand for accessing the shared memory tiles of A in the union.
        it
            Row indices of a tile owned by a thread.
        jt
            Col indices of a tile owned by a thread.
        ACHUNKSIZE
            The amount of A to load in a chunk
*/

#define N (TILESIZE)

#define INSIDE(COND) (COND)

// when all threads work on a tile.
// (N*N / NUMTHREADS) does not have to be an integer.  With a tile
// size of N=32, and NUMTHREADS=384, it isn't.  So compute the ceiling,
// and handle the clean up by testing i < N below.
#define NACHUNKS CEIL (N*N, NUMTHREADS)

// glA(i,j): element (i,j) of the front in global memory (row-major, fn cols).
#define glA(i,j) (myTask.F[((i)*fn + (j))])
// shA: the shared-memory tile workspace (part of a union with other stages).
#define shA shMemory.factorize.A

// ACHUNKSIZE must be an integer
#define it (threadIdx.x / N)
#define jt (threadIdx.x % N)
#define ACHUNKSIZE (NUMTHREADS / N)

/*
    NEW #define MACROS
        SAFELOAD
            Loads a tile from global memory. Checks edge cases.
        SH_TRANSFER
            Moves a tile within shared memory
        SAFESTORE
            Stores a tile back to global memory. Checks edge cases.
*/

// SAFELOAD(SLOT, ROWTILE): fill shared tile slot SLOT with the TILESIZE-by-
// TILESIZE tile whose first global row is ROWTILE and first column is j1,
// zero-padding entries outside the fm-by-fn front.  If ROWTILE is EMPTY the
// slot is cleared to zero instead.  Each thread touches the same elements in
// both branches, so no extra synchronization is introduced here.
// NOTE: SLOT and ROWTILE are parenthesized at every use for macro hygiene.
#define SAFELOAD(SLOT, ROWTILE)                                             \
{                                                                           \
    int rowTile = (ROWTILE) ;                                               \
    if (INSIDE (rowTile != EMPTY))                                          \
    {                                                                       \
        /* load the tile of A from global memory */                         \
        for (int ii = 0 ; ii < NACHUNKS ; ii++)                             \
        {                                                                   \
            int i = ii * ACHUNKSIZE + it ;                                  \
            if (ii < NACHUNKS-1 || i < N)                                   \
            {                                                               \
                shA [i + (SLOT)*TILESIZE][jt] =                             \
                    (INSIDE (i+rowTile < fm) && INSIDE (jt+j1 < fn)) ?      \
                    glA (i+rowTile, jt+j1) : 0 ;                            \
            }                                                               \
        }                                                                   \
    }                                                                       \
    else                                                                    \
    {                                                                       \
        /* clear the tile of A */                                           \
        for (int ii = 0 ; ii < NACHUNKS ; ii++)                             \
        {                                                                   \
            int i = ii * ACHUNKSIZE + it ;                                  \
            if (ii < NACHUNKS-1 || i < N)                                   \
            {                                                               \
                shA [i + (SLOT)*TILESIZE][jt] = 0 ;                         \
            }                                                               \
        }                                                                   \
    }                                                                       \
}

// SH_TRANSFER(TO, FROM): copy shared tile slot FROM into slot TO.  The work
// is flattened over TILESIZE*TILESIZE elements and strided by blockDim.x;
// each thread reads and writes disjoint elements, so the copy itself is
// race-free (TO and FROM are distinct slots at every call site below).
#define SH_TRANSFER(TO, FROM)                                               \
{                                                                           \
    for (int th=threadIdx.x; th<TILESIZE*TILESIZE; th+=blockDim.x)          \
    {                                                                       \
        int ito = (TILESIZE*(TO)) + (th / TILESIZE);                        \
        int ifr = (TILESIZE*(FROM)) + (th / TILESIZE);                      \
        int j = (th % TILESIZE);                                            \
        shA[ito][j] = shA[ifr][j];                                          \
    }                                                                       \
}

// SAFESTORE(SLOT, ROWTILE): write shared tile slot SLOT back to global
// memory at row ROWTILE, column j1, skipping entries outside the fm-by-fn
// front.  A ROWTILE of EMPTY stores nothing.
#define SAFESTORE(SLOT, ROWTILE)                                            \
{                                                                           \
    int rowTile = (ROWTILE) ;                                               \
    if (INSIDE (rowTile != EMPTY))                                          \
    {                                                                       \
        for (int ii = 0 ; ii < NACHUNKS ; ii++)                             \
        {                                                                   \
            int i = ii * ACHUNKSIZE + it ;                                  \
            if (ii < NACHUNKS-1 || i < N)                                   \
            {                                                               \
                if (INSIDE (i+rowTile < fm) && INSIDE (jt+j1 < fn))         \
                {                                                           \
                    glA (i+rowTile, jt+j1) = shA [i + (SLOT)*TILESIZE][jt]; \
                }                                                           \
            }                                                               \
        }                                                                   \
    }                                                                       \
}


/* ALL THREADS PARTICIPATE */
{
    // Pipeline state stashed by the factorize kernel in myTask.extra:
    //   extra[8] = delta, extra[9] = secondMin, extra[4] = current column
    //   offset (advanced by TILESIZE here).  extra[0..3] appear to hold the
    //   row-tile indices of the slots being shuffled.
    //   NOTE(review): slot meanings inferred from usage here -- confirm
    //   against vt_factorize.cu.
    int delta = myTask.extra[8];
    int secondMin = myTask.extra[9];
    int fc = IsApplyFactorize;
    int j1 = myTask.extra[4] + TILESIZE;

    /*** DO MEMORY SHUFFLES ***/

    // Flush slot 0 to global before it is overwritten below.  No barrier is
    // needed between the store and the overwrite because each thread stores
    // and then rewrites the same elements (the SAFESTORE/SAFELOAD and
    // SH_TRANSFER index maps coincide when blockDim.x == NUMTHREADS and
    // NUMTHREADS is a multiple of TILESIZE).
    // NOTE(review): relies on that launch configuration -- confirm.
    SAFESTORE(0, myTask.extra[0]);

    /* 0 <-- secondMin */
    if(delta != EMPTY && secondMin == delta)
    {
        SAFELOAD(0, myTask.extra[secondMin]);
    }
    else
    {
        SH_TRANSFER(0, secondMin);
    }

    /* secondMin <-- fc */
    if(fc != secondMin)
    {
        if(delta != EMPTY && fc >= delta)
        {
            SAFELOAD(secondMin, myTask.extra[fc]);
        }
        else
        {
            SH_TRANSFER(secondMin, fc);
        }
    }

    /* Hard-load D from global in the 2-3 case where [1] is secondMin. */
    if(fc == 3 && delta == 2 && secondMin == 1)
    {
        SAFELOAD(2, myTask.extra[2]);
    }

    /* Rearrange tiles so the tile store at the end doesn't explode.
       This is non-essential until the very end, so we can easily justify
       piggybacking this integer shuffle to the next natural __syncthreads
       that we encounter. */
    __syncthreads();
    if(threadIdx.x == 0)
    {
        // Single thread updates the task bookkeeping to mirror the tile
        // moves performed above; the trailing barrier publishes it.
        myTask.extra[4] = j1;
        myTask.AuxAddress[0] = myTask.AuxAddress[1];
        myTask.AuxAddress[1] = NULL;

        myTask.extra[0] = myTask.extra[secondMin];
        if(fc != secondMin)
        {
            myTask.extra[secondMin] = myTask.extra[fc];
        }
    }
    __syncthreads();
}

#undef N
#undef INSIDE
#undef NACHUNKS
#undef glA
#undef shA
#undef it
#undef jt
#undef ACHUNKSIZE

#undef SAFELOAD
#undef SH_TRANSFER
#undef SAFESTORE