1 //------------------------------------------------------------------------------
2 // denseDotProduct.cu
3 //------------------------------------------------------------------------------
4
5 // The denseDotProduct CUDA kernel produces the semi-ring dot product of two
6 // vectors of types T1 and T2 and common size n, to a vector odata of type T3.
7 // ie. we want to produce dot(x,y) in the sense of the given semi-ring.
8
9 // Both the grid and block are 1D, so blockDim.x is the # threads in a
10 // threadblock, and the # of threadblocks is grid.x
11
12 // Let b = blockIdx.x, and let s be blockDim.x.
13 // Each threadblock owns s*8 contiguous items in the input data.
14
15 // Thus, threadblock b owns g_idata [b*s*8 ... min(n,(b+1)*s*8-1)]. It's job
16 // is to reduce this data to a scalar, and write it to g_odata [b].
17
18 #include <limits>
19 #include "mySemiRing.h"
20 #include <cooperative_groups.h>
21
22 using namespace cooperative_groups;
23
24 template< typename T3, int tile_sz>
25 __inline__ __device__
warp_ReduceSum(thread_block_tile<tile_sz> g,T3 val)26 T3 warp_ReduceSum(thread_block_tile<tile_sz> g, T3 val)
27 {
28 // Each iteration halves the number of active threads
29 // Each thread adds its partial sum[i] to sum[lane+i]
30 for (int i = g.size() / 2; i > 0; i /= 2)
31 {
32 T3 fold = g.shfl_down( val, i);
33 val = ADD( val, fold );
34 }
35 return val; // note: only thread 0 will return full sum
36 }
37
38 template<typename T3, int warpSize>
39 __inline__ __device__
block_ReduceSum(thread_block g,T3 val)40 T3 block_ReduceSum(thread_block g, T3 val)
41 {
42 static __shared__ T3 shared[warpSize]; // Shared mem for 32 partial sums
43 int lane = threadIdx.x % warpSize;
44 int wid = threadIdx.x / warpSize;
45 thread_block_tile<warpSize> tile = tiled_partition<warpSize>(g);
46
47 // Each warp performs partial reduction
48 val = warp_ReduceSum<T3,warpSize>(tile, val);
49
50 if (lane==0) shared[wid]=val; // Write reduced value to shared memory
51
52 __syncthreads(); // Wait for all partial reductions
53
54 //read from shared memory only if that warp existed
55 val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : (T3)MONOID_IDENTITY3;
56
57
58 if (wid==0) val = warp_ReduceSum<T3,warpSize>(tile,val); //Final reduce within first warp
59
60 return val;
61 }
62
63 template< typename T1, typename T2, typename T3>
denseDotProduct(T1 * g_xdata,T2 * g_ydata,T3 * g_odata,unsigned int n)64 __global__ void denseDotProduct
65 (
66 T1 *g_xdata, // array of size n, type T1
67 T2 *g_ydata, // array of size n, type T2
68 T3 *g_odata, // array of size grid.x, type T3
69 unsigned int n
70 )
71 {
72 // set thread ID
73 unsigned int tid = threadIdx.x ;
74
75 // this threadblock b owns g_idata [block_start ... block_end-1]
76 unsigned long int s = blockDim.x ;
77 unsigned long int b = blockIdx.x ;
78 unsigned long int block_start = b * s * 8 ;
79 unsigned long int block_end = (b + 1) * s * 8 ;
80
81 /*
82 if (tid == 0)
83 {
84 printf ("block %d: [%lu ... %ld]\n", b, block_start, block_end-1) ;
85 }
86 */
87
88 /*
89 if (tid == 0 && b == 0)
90 {
91 printf ("type is size %d\n", sizeof (T)) ;
92 for (int k = 0 ; k < n ; k++) printf ("%4d: %g\n", k, (double) g_idata [k]) ;
93 printf ("\n") ;
94 }
95 */
96
97 // each thread tid reduces its result into sum
98 T3 sum;
99
100 // nothing to do
101 if (block_start > block_end) { return ; }
102
103 // convert global data pointer to the local pointer of this block
104 T1 *xdata = g_xdata + block_start ;
105 T2 *ydata = g_ydata + block_start ;
106
107 T1 x0, x1, x2, x3, x4, x5, x6, x7 ;
108 T2 y0, y1, y2, y3, y4, y5, y6, y7 ;
109
110 if (block_end <= n)
111 {
112 // unrolling 8
113 x0 = xdata [tid] ;
114 x1 = xdata [tid + s] ;
115 x2 = xdata [tid + 2 * s] ;
116 x3 = xdata [tid + 3 * s] ;
117 x4 = xdata [tid + 4 * s] ;
118 x5 = xdata [tid + 5 * s] ;
119 x6 = xdata [tid + 6 * s] ;
120 x7 = xdata [tid + 7 * s] ;
121
122 y0 = ydata [tid] ;
123 y1 = ydata [tid + s] ;
124 y2 = ydata [tid + 2 * s] ;
125 y3 = ydata [tid + 3 * s] ;
126 y4 = ydata [tid + 4 * s] ;
127 y5 = ydata [tid + 5 * s] ;
128 y6 = ydata [tid + 6 * s] ;
129 y7 = ydata [tid + 7 * s] ;
130 /*
131 if (b == 0)
132 {
133 printf ("block zero: here is tid %2d : %g %g %g %g %g %g %g %g \n", tid,
134 (double) x0, (double) x1, (double) x2, (double) x3,
135 (double) x4, (double) x5, (double) x6, (double) x7) ;
136 }
137 */
138
139 }
140 else
141 {
142 // the last block has size less than 8*s
143 #define XDATA(i) ((i < lastblocksize) ? xdata [i] : MONOID_IDENTITY1)
144 #define YDATA(i) ((i < lastblocksize) ? ydata [i] : MONOID_IDENTITY2)
145 int lastblocksize = n - block_start ;
146 x0 = XDATA (tid) ;
147 x1 = XDATA (tid + s) ;
148 x2 = XDATA (tid + 2 * s) ;
149 x3 = XDATA (tid + 3 * s) ;
150 x4 = XDATA (tid + 4 * s) ;
151 x5 = XDATA (tid + 5 * s) ;
152 x6 = XDATA (tid + 6 * s) ;
153 x7 = XDATA (tid + 7 * s) ;
154
155 y0 = YDATA (tid) ;
156 y1 = YDATA (tid + s) ;
157 y2 = YDATA (tid + 2 * s) ;
158 y3 = YDATA (tid + 3 * s) ;
159 y4 = YDATA (tid + 4 * s) ;
160 y5 = YDATA (tid + 5 * s) ;
161 y6 = YDATA (tid + 6 * s) ;
162 y7 = YDATA (tid + 7 * s) ;
163 }
164
165 //work [tid] = mul(x0,y0) + mul(x1,y1) + mul(x2,y2) + mul(x3,y3)
166 // + mul(x4,y4) + mul(x5,y5) + mul(x6,y6)+ mul(x7,y7) ;
167 sum = ADD( MUL(x0,y0) , ADD( MUL(x1,y1) , ADD( MUL(x2,y2),
168 ADD( MUL(x3,y3) , ADD( MUL(x4,y4) , ADD( MUL(x5,y5),
169 ADD( MUL(x6,y6) , MUL(x7,y7)))))))) ;
170
171 /*
172 if (b == 0)
173 {
174 printf ("block zero: still is tid %2d : %g %g %g %g %g %g %g %g \n", tid,
175 (double) x0, (double) x1, (double) x2, (double) x3,
176 (double) x4, (double) x5, (double) x6, (double) x7) ;
177 }
178
179 if (b == 0)
180 {
181 printf ("block zero: here is tid %d result %g is %g\n",
182 tid, sum,
183 (double) (x0 + x1 + x2 + x3 + x4 + x5 + x6 + x7)) ;
184 }
185 */
186
187 __syncthreads ( ) ;
188
189 //--------------------------------------------------------------------------
190 // reduce per-thread sums to a single scalar
191 //--------------------------------------------------------------------------
192
193 sum = block_ReduceSum<T3, 32>( this_thread_block(), sum);
194
195 // write result for this block to global mem
196 if (tid == 0)
197 {
198 printf ("final %d : %g\n", b, (T3) sum) ;
199 g_odata [b] = sum ;
200 }
201 }
202
203