// =============================================================================
// === GPUQREngine/Include/Kernel/Assemble/packAssemble.cu =====================
// =============================================================================

// packassemble()
//
// Copies the upper-triangular entries (cj >= ci) of one tile of the block C
// into the block P, translating each (ci, cj) coordinate through the Rimap
// and Rjmap index maps.
//
// Inputs are taken from the current task descriptor (myTask):
//   AuxAddress[0]  C      source values, read at C[fn*ci + cj]
//   AuxAddress[1]  P      destination values, written at P[pn*fi + fj]
//   AuxAddress[2]  Rjmap  column map, read at Rjmap[cjstart .. cjend-1]
//   AuxAddress[3]  Rimap  row map, read at Rimap[cistart .. ciend-1]
//   fn                    multiplier applied to ci when indexing C
//   extra[0]       pn     multiplier applied to fi when indexing P
//   extra[3]              number of tile entries to visit (cTileSize)
//   extra[4..5]           cistart, ciend
//   extra[6..7]           cjstart, cjend
// myTask.fm, extra[1] (cm) and extra[2] (cn) are not needed here.
//
// Every thread of the block walks its indices starting at threadIdx.x and
// advancing by blockDim.x. All threads must call this function together,
// because it contains a __syncthreads() barrier.
__device__ void packassemble ( )
{
    // Per-block staging buffers for the slices of the two index maps.
    int *rowMapCache = shMemory.packassemble.Rimap;
    int *colMapCache = shMemory.packassemble.Rjmap;

    // Unpack the task descriptor.
    double *C     = myTask.AuxAddress[0];
    double *P     = myTask.AuxAddress[1];
    int    *Rjmap = (int*) myTask.AuxAddress[2];
    int    *Rimap = (int*) myTask.AuxAddress[3];

    int fn        = myTask.fn;
    int pn        = myTask.extra[0];
    int cTileSize = myTask.extra[3];
    int cistart   = myTask.extra[4];
    int ciend     = myTask.extra[5];
    int cjstart   = myTask.extra[6];
    int cjend     = myTask.extra[7];

    // Extent of this tile in rows and in columns.
    int tileRows = ciend - cistart;
    int tileCols = cjend - cjstart;

    // Stage the slices of Rimap and Rjmap that this tile refers to.
    for (int r = threadIdx.x; r < tileRows; r += blockDim.x)
    {
        rowMapCache[r] = Rimap[cistart + r];
    }
    for (int c = threadIdx.x; c < tileCols; c += blockDim.x)
    {
        colMapCache[c] = Rjmap[cjstart + c];
    }

    // The staged maps are read below by threads other than the ones that
    // wrote them, so wait until the whole block has finished filling them.
    __syncthreads();

    for (int entry = threadIdx.x; entry < cTileSize; entry += blockDim.x)
    {
        // Split the flat entry index into tile-local (row, col); the column
        // index varies fastest.
        int localRow = entry / tileCols;
        int localCol = entry % tileCols;

        // Shift the tile-local coordinates to coordinates within C.
        int ci = cistart + localRow;
        int cj = cjstart + localCol;

        // Entries strictly below the diagonal are skipped, so the zeroes
        // there are never copied.
        if (cj < ci)
        {
            continue;
        }

        // Map (ci, cj) to the destination coordinates (fi, fj) and copy.
        int fi = rowMapCache[localRow];
        int fj = colMapCache[localCol];
        P[pn * fi + fj] = C[fn * ci + cj];
    }
}