1 // =============================================================================
2 // === GPUQREngine/Include/Kernel/Apply/pipelined_rearrange.cu =================
3 // =============================================================================
4 
5 //------------------------------------------------------------------------------
6 // pipelined_rearrange
7 //------------------------------------------------------------------------------
8 
9 /*
10     PSEUDO #define MACROS (copied from vt_factorize.cu)
11         N
12             The # of columns to operate on (should always be TILESIZE).
13         INSIDE
14             Substitute in a condition depending on compilation options.
15             For this code, we always assume we need to check edge cases.
16         NACHUNKS
17             A chunking scheme used in the factorization kernel. We use
18             the same layout and thread dimension for our tile load/stores.
19         glA
20             Shorthand for the index computation into the global A.
21         shA
22             Shorthand for accessing the shared memory tiles of A in the union.
        it
            The row offset within the current chunk owned by a thread.
        jt
            The column index within a tile owned by a thread.
        ACHUNKSIZE
            The number of rows of A to load in each chunk.
29 */
30 
31 #define N               (TILESIZE)
32 
33 #define INSIDE(COND)    (COND)
34 
// NACHUNKS is the number of chunks needed when all threads work on a tile.
// (N*N / NUMTHREADS) does not have to be an integer.  With a tile
// size of N=32 and NUMTHREADS=384, it isn't.  So compute the ceiling,
// and handle the cleanup by testing i < N below.
39 #define NACHUNKS        CEIL (N*N, NUMTHREADS)
40 
41 #define glA(i,j)        (myTask.F[((i)*fn + (j))])
42 #define shA             shMemory.factorize.A
43 
// ACHUNKSIZE must be an integer (i.e., NUMTHREADS must be divisible by N)
45 #define it              (threadIdx.x / N)
46 #define jt              (threadIdx.x % N)
47 #define ACHUNKSIZE      (NUMTHREADS / N)
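
/*
    Worked example of the chunking arithmetic (a sketch only, assuming the
    TILESIZE=32, NUMTHREADS=384 configuration noted above, and that CEIL is
    the usual round-up integer division):

        ACHUNKSIZE = 384/32          = 12 tile rows per chunk
        NACHUNKS   = CEIL(32*32,384) =  3 chunks per tile

    The thread with threadIdx.x == 100 owns (it,jt) = (3,4) and touches rows
    i = 3, 15, and 27 of its tile (one per chunk).  On the last chunk the
    threads with it >= 8 would compute i >= 32, which is why the tile loads
    and stores below are guarded with (ii < NACHUNKS-1 || i < N).
*/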
48 
49 /*
50     NEW #define MACROS
51         SAFELOAD
52             Loads a tile from global memory. Checks edge cases.
53         SH_TRANSFER
            Moves a tile from one shared memory slot to another.
55         SAFESTORE
56             Stores a tile back to global memory. Checks edge cases.
57 */
58 
59 #define SAFELOAD(SLOT, ROWTILE)                                             \
60 {                                                                           \
61     int rowTile = (ROWTILE);                                                \
62     if (INSIDE (rowTile != EMPTY))                                          \
63     {                                                                       \
64         /* load the tile of A from global memory */                         \
65         for (int ii = 0 ; ii < NACHUNKS ; ii++)                             \
66         {                                                                   \
67             int i = ii * ACHUNKSIZE + it ;                                  \
68             if (ii < NACHUNKS-1 || i < N)                                   \
69             {                                                               \
70                 shA [i + (SLOT)*TILESIZE][jt] =                             \
71                     (INSIDE (i+rowTile < fm) && INSIDE (jt+j1 < fn)) ?      \
72                     glA (i+rowTile, jt+j1) : 0 ;                            \
73             }                                                               \
74         }                                                                   \
75     }                                                                       \
76     else                                                                    \
77     {                                                                       \
78         /* clear the tile of A */                                           \
79         for (int ii = 0 ; ii < NACHUNKS ; ii++)                             \
80         {                                                                   \
81             int i = ii * ACHUNKSIZE + it ;                                  \
82             if (ii < NACHUNKS-1 || i < N)                                   \
83             {                                                               \
                shA [i + (SLOT)*TILESIZE][jt] = 0 ;                         \
85             }                                                               \
86         }                                                                   \
87     }                                                                       \
88 }                                                                           \
89 
90 
91 #define SH_TRANSFER(TO, FROM)                                               \
92 {                                                                           \
93     for (int th=threadIdx.x; th<TILESIZE*TILESIZE; th+=blockDim.x)          \
94     {                                                                       \
95         int ito = (TILESIZE*(TO))   + (th / TILESIZE);                      \
96         int ifr = (TILESIZE*(FROM)) + (th / TILESIZE);                      \
97         int  j  =                     (th % TILESIZE);                      \
98         shA[ito][j] = shA[ifr][j];                                          \
99     }                                                                       \
100 }                                                                           \
101 
102 
103 #define SAFESTORE(SLOT, ROWTILE)                                            \
104 {                                                                           \
    int rowTile = (ROWTILE);                                                \
106     if (INSIDE (rowTile != EMPTY))                                          \
107     {                                                                       \
108         for (int ii = 0 ; ii < NACHUNKS ; ii++)                             \
109         {                                                                   \
110             int i = ii * ACHUNKSIZE + it ;                                  \
111             if (ii < NACHUNKS-1 || i < N)                                   \
112             {                                                               \
113                 if (INSIDE (i+rowTile < fm) && INSIDE (jt+j1 < fn))         \
114                 {                                                           \
115                     glA (i+rowTile, jt+j1) = shA [i + (SLOT)*TILESIZE][jt]; \
116                 }                                                           \
117             }                                                               \
118         }                                                                   \
119     }                                                                       \
120 }                                                                           \
121 
122 
123 /* ALL THREADS PARTICIPATE */
124 {
125 
126     int delta     = myTask.extra[8];
127     int secondMin = myTask.extra[9];
128     int fc        = IsApplyFactorize;
129     int j1        = myTask.extra[4] + TILESIZE;
130 
131     /*** DO MEMORY SHUFFLES ***/
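    /* The plan, as implemented below: flush slot 0 (row tile extra[0]) back
       to the front; move the secondMin tile into slot 0 (freshly from global
       memory when delta != EMPTY and secondMin == delta, otherwise from its
       current shared slot); move tile fc into slot secondMin (from global
       when delta != EMPTY and fc >= delta, otherwise from shared); and, in
       the 2-3 case noted below, reload slot 2 directly.  The extra[]
       bookkeeping is updated to match after the barrier. */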
132 
133     SAFESTORE(0, myTask.extra[0]);
134 
135     /* 0 <-- secondMin */
136     if(delta != EMPTY && secondMin == delta)
137     {
138         SAFELOAD(0, myTask.extra[secondMin]);
139     }
140     else
141     {
142         SH_TRANSFER(0, secondMin);
143     }
144 
145     /* secondMin <-- fc */
146     if(fc != secondMin)
147     {
148         if(delta != EMPTY && fc >= delta)
149         {
150             SAFELOAD(secondMin, myTask.extra[fc]);
151         }
152         else
153         {
154             SH_TRANSFER(secondMin, fc);
155         }
156     }
157 
158     /* Hard-load D from global in the 2-3 case where [1] is secondMin. */
159     if(fc == 3 && delta == 2 && secondMin == 1)
160     {
161         SAFELOAD(2, myTask.extra[2]);
162     }
163 
    /* Rearrange tiles so the tile store at the end doesn't explode.
       This is non-essential until the very end, so we can easily justify
       piggybacking this integer shuffle onto the next natural __syncthreads
       that we encounter. */
168     __syncthreads();
169     if(threadIdx.x == 0)
170     {
171         myTask.extra[4] = j1;
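        /* Promote the auxiliary address of the second pipelined stage into
           slot 0 and clear slot 1 (AuxAddress presumably holds the VT-block
           pointers consumed by the apply). */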
172         myTask.AuxAddress[0] = myTask.AuxAddress[1];
173         myTask.AuxAddress[1] = NULL;
174 
175         myTask.extra[0] = myTask.extra[secondMin];
176         if(fc != secondMin)
177         {
178             myTask.extra[secondMin] = myTask.extra[fc];
179         }
180     }
181     __syncthreads();
182 }
183 
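// This fragment is #include'd into the body of the apply kernel, so undo the
// local macros here to keep them from leaking into the including kernel.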
184 #undef N
185 #undef INSIDE
186 #undef NACHUNKS
187 #undef glA
188 #undef shA
189 #undef it
190 #undef jt
191 #undef ACHUNKSIZE
192 
193 #undef SAFELOAD
194 #undef SH_TRANSFER
195 #undef SAFESTORE
196