// =============================================================================
// === GPUQREngine/Include/Kernel/Assemble/packAssemble.cu =====================
// =============================================================================
4 
// packassemble: copy the upper-triangular entries of one tile of C into P,
// translating row and column indices through Rimap and Rjmap.
//
// Inputs, all read from myTask:
//   AuxAddress[0]  C          source values, read as C[fn*ci + cj]
//   AuxAddress[1]  P          destination, written as P[pn*fi + fj]
//   AuxAddress[2]  Rjmap      maps a C column index cj to a P column index fj
//   AuxAddress[3]  Rimap      maps a C row index ci to a P row index fi
//   fn                        leading dimension used to index C
//   extra[0]       pn         leading dimension used to index P
//   extra[3]       cTileSize  number of tile entries to visit
//   extra[4..5]    cistart, ciend   tile row range (ciend is exclusive)
//   extra[6..7]    cjstart, cjend   tile column range (cjend is exclusive)
//
// The function contains a __syncthreads() barrier, so every thread of the
// block must call it.  Work is distributed over threadIdx.x with a stride of
// blockDim.x.
//
// NOTE(review): cTileSize is assumed to be at most ctm*ctn; a larger value
// would read staged map entries that were never filled -- confirm against the
// code that builds the task.
__device__ void packassemble ( )
{
    // Block-local staging arrays for the two index maps.
    int *shRimap = shMemory.packassemble.Rimap;
    int *shRjmap = shMemory.packassemble.Rjmap;

    // C and the two maps are only read here; P is the only array written.
    const double *C  = myTask.AuxAddress[0];
    double *P        = myTask.AuxAddress[1];
    const int *Rjmap = (const int*) myTask.AuxAddress[2];
    const int *Rimap = (const int*) myTask.AuxAddress[3];

    const int fn        = myTask.fn;
    const int pn        = myTask.extra[0];
    const int cTileSize = myTask.extra[3];
    const int cistart   = myTask.extra[4];
    const int ciend     = myTask.extra[5];
    const int cjstart   = myTask.extra[6];
    const int cjend     = myTask.extra[7];

    const int ctm = ciend - cistart;    // # of rows in this tile
    const int ctn = cjend - cjstart;    // # of columns in this tile

    // Stage the slices of Rimap and Rjmap that this tile needs.
    for (int k = threadIdx.x; k < ctm; k += blockDim.x)
    {
        shRimap[k] = Rimap[cistart + k];
    }
    for (int k = threadIdx.x; k < ctn; k += blockDim.x)
    {
        shRjmap[k] = Rjmap[cjstart + k];
    }

    // Every thread reaches this barrier, including for an empty tile, so the
    // staged maps are complete before any thread reads them.
    __syncthreads();

    // An empty tile has nothing to copy.  The test sits after the barrier so
    // it cannot cause divergent barrier participation, and it keeps the
    // division and modulo by ctn below well defined.
    if (ctm > 0 && ctn > 0)
    {
        for (int p = threadIdx.x; p < cTileSize; p += blockDim.x)
        {
            // Split the linear tile position into tile-local (row, column).
            const int cil = p / ctn;
            const int cjl = p % ctn;

            // Position of the same entry within C.
            const int ci = cistart + cil;
            const int cj = cjstart + cjl;

            // Only the upper-triangular part (cj >= ci) is copied; entries
            // below the diagonal are skipped.
            if (cj >= ci)
            {
                const int fi = shRimap[cil];
                const int fj = shRjmap[cjl];

                // Form the offsets in 64-bit arithmetic so that the product
                // of a leading dimension and a row index cannot overflow int.
                const long long cindex = (long long) fn * ci + cj;
                const long long pindex = (long long) pn * fi + fj;
                P[pindex] = C[cindex];
            }
        }
    }
}
58