1 //------------------------------------------------------------------------------
2 // denseDotProduct.cu
3 //------------------------------------------------------------------------------
4 
// The denseDotProduct CUDA kernel produces the semi-ring dot product of two
// vectors of types T1 and T2 and common size n, writing one partial result
// per threadblock to the vector g_odata of type T3; i.e., together the
// partials form dot(x,y) in the sense of the given semi-ring.
8 
9 // Both the grid and block are 1D, so blockDim.x is the # threads in a
10 // threadblock, and the # of threadblocks is grid.x
11 
12 // Let b = blockIdx.x, and let s be blockDim.x.
13 // Each threadblock owns s*8 contiguous items in the input data.
14 
// Thus, threadblock b owns the entries [b*s*8 ... min(n,(b+1)*s*8)-1] of the
// input vectors.  Its job is to reduce this data to a scalar, and write the
// result to g_odata [b].
17 
18 #include <limits>
19 #include "mySemiRing.h"
20 #include <cooperative_groups.h>
21 
22 using namespace cooperative_groups;
23 
//------------------------------------------------------------------------------
// warp_ReduceSum: reduce one value per lane to a single tile-wide scalar
//------------------------------------------------------------------------------

// Tree reduction in registers via shuffles: on each step every lane folds in
// (with the semi-ring ADD) the partial sum held by the lane `offset` positions
// above it.  After log2(tile_sz) steps, lane 0 of the tile holds the reduction
// of all tile_sz inputs; other lanes hold partial values only.
template< typename T3, int tile_sz>
__inline__ __device__
T3 warp_ReduceSum(thread_block_tile<tile_sz> g, T3 val)
{
    // tile_sz is a compile-time constant (== g.size()), so unroll fully
    #pragma unroll
    for (int offset = tile_sz / 2 ; offset > 0 ; offset >>= 1)
    {
        // fetch the partial sum from the lane `offset` above this one
        T3 other = g.shfl_down (val, offset) ;
        val = ADD (val, other) ;
    }
    return val ;    // note: only lane 0 returns the full tile sum
}
37 
//------------------------------------------------------------------------------
// block_ReduceSum: reduce one value per thread to a single block-wide scalar
//------------------------------------------------------------------------------

// Two-level reduction: each warp reduces its values with shuffles, each warp
// leader parks its partial sum in shared memory, then warp 0 reduces those
// partials.  Only thread 0 of the block returns the full block sum; all other
// threads return a partial value or the monoid identity.
//
// Assumes blockDim.x <= warpSize*warpSize (at most warpSize warps), so the
// shared scratch array has one slot per warp.
template<typename T3, int warpSize>
__inline__ __device__
T3 block_ReduceSum(thread_block g, T3 val)
{
  static __shared__ T3 shared[warpSize]; // shared mem: one partial sum per warp
  int lane = threadIdx.x % warpSize;     // lane index within this thread's warp
  int wid = threadIdx.x / warpSize;      // which warp this thread belongs to
  thread_block_tile<warpSize> tile = tiled_partition<warpSize>(g);

  // Each warp performs partial reduction
  val = warp_ReduceSum<T3,warpSize>(tile, val);

  if (lane==0) shared[wid]=val; // warp leader writes its partial sum

  __syncthreads();              // Wait for all partial reductions

  // Number of warps that deposited a partial sum.  This must round UP:
  // the truncating blockDim.x / warpSize would drop the partial sum of a
  // final, partially-filled warp whenever blockDim.x is not a multiple of
  // warpSize (and would read back nothing at all when blockDim.x < warpSize).
  int nwarps = (blockDim.x + warpSize - 1) / warpSize;

  // read from shared memory only if that warp existed
  val = (threadIdx.x < nwarps) ? shared[lane] : (T3)MONOID_IDENTITY3;

  if (wid==0) val = warp_ReduceSum<T3,warpSize>(tile,val); // final reduce within first warp

  return val;
}
62 
//------------------------------------------------------------------------------
// denseDotProduct: semi-ring dot product, one partial result per threadblock
//------------------------------------------------------------------------------

// Grid and block are both 1D.  Threadblock b owns the 8*blockDim.x contiguous
// entries of x and y starting at b*blockDim.x*8, reduces ADD(MUL(x[i],y[i]))
// over them in the given semi-ring, and writes that scalar to g_odata [b].
// The caller must finish the reduction across blocks (or launch gridDim.x==1).
template< typename T1, typename T2, typename T3>
__global__ void denseDotProduct
(
    T1 *g_xdata,     // array of size n, type T1
    T2 *g_ydata,     // array of size n, type T2
    T3 *g_odata,     // array of size gridDim.x, type T3
    unsigned int n
)
{
    // thread id within this block
    unsigned int tid = threadIdx.x ;

    // this threadblock b owns x/y [block_start ... block_end-1]
    unsigned long int s = blockDim.x ;
    unsigned long int b = blockIdx.x ;
    unsigned long int block_start = b * s * 8 ;
    unsigned long int block_end   = (b + 1) * s * 8 ;

    // Nothing to do: this block lies entirely past the end of the data.
    // (The original guard compared block_start > block_end, which is never
    // true, so an out-of-range block would compute a negative lastblocksize
    // and read out of bounds.)  Write the monoid identity so g_odata [b] is
    // well-defined for every block in the grid.
    if (block_start >= n)
    {
        if (tid == 0) g_odata [b] = (T3) MONOID_IDENTITY3 ;
        return ;
    }

    // convert global data pointers to the local pointers of this block
    T1 *xdata = g_xdata + block_start ;
    T2 *ydata = g_ydata + block_start ;

    // each thread loads 8 entries of x and 8 entries of y
    T1 x0, x1, x2, x3, x4, x5, x6, x7 ;
    T2 y0, y1, y2, y3, y4, y5, y6, y7 ;

    if (block_end <= n)
    {
        // whole block in range: unguarded loads, unrolled by 8.
        // Stride-s accesses keep each load coalesced across the warp.
        x0 = xdata [tid] ;
        x1 = xdata [tid +     s] ;
        x2 = xdata [tid + 2 * s] ;
        x3 = xdata [tid + 3 * s] ;
        x4 = xdata [tid + 4 * s] ;
        x5 = xdata [tid + 5 * s] ;
        x6 = xdata [tid + 6 * s] ;
        x7 = xdata [tid + 7 * s] ;

        y0 = ydata [tid] ;
        y1 = ydata [tid +     s] ;
        y2 = ydata [tid + 2 * s] ;
        y3 = ydata [tid + 3 * s] ;
        y4 = ydata [tid + 4 * s] ;
        y5 = ydata [tid + 5 * s] ;
        y6 = ydata [tid + 6 * s] ;
        y7 = ydata [tid + 7 * s] ;
    }
    else
    {
        // the last block has fewer than 8*s entries: pad the tail with the
        // monoid identities.  lastblocksize is signed, and the index is cast
        // to signed in the comparison, so the test cannot wrap to a huge
        // unsigned value.  (block_start < n is guaranteed by the guard above,
        // so the subtraction is safe.)
        long lastblocksize = (long) (n - block_start) ;
        #define XDATA(i) (((long) (i) < lastblocksize) ? xdata [i] : (T1) MONOID_IDENTITY1)
        #define YDATA(i) (((long) (i) < lastblocksize) ? ydata [i] : (T2) MONOID_IDENTITY2)
        x0 = XDATA (tid) ;
        x1 = XDATA (tid +     s) ;
        x2 = XDATA (tid + 2 * s) ;
        x3 = XDATA (tid + 3 * s) ;
        x4 = XDATA (tid + 4 * s) ;
        x5 = XDATA (tid + 5 * s) ;
        x6 = XDATA (tid + 6 * s) ;
        x7 = XDATA (tid + 7 * s) ;

        y0 = YDATA (tid) ;
        y1 = YDATA (tid +     s) ;
        y2 = YDATA (tid + 2 * s) ;
        y3 = YDATA (tid + 3 * s) ;
        y4 = YDATA (tid + 4 * s) ;
        y5 = YDATA (tid + 5 * s) ;
        y6 = YDATA (tid + 6 * s) ;
        y7 = YDATA (tid + 7 * s) ;
        #undef XDATA
        #undef YDATA
    }

    // each thread reduces its 8 products into sum, in the semi-ring:
    // sum = x0*y0 + x1*y1 + ... + x7*y7
    T3 sum = ADD( MUL(x0,y0) , ADD( MUL(x1,y1) , ADD( MUL(x2,y2),
             ADD( MUL(x3,y3) , ADD( MUL(x4,y4) , ADD( MUL(x5,y5),
             ADD( MUL(x6,y6) , MUL(x7,y7)))))))) ;

    //--------------------------------------------------------------------------
    // reduce per-thread sums to a single scalar
    //--------------------------------------------------------------------------

    // block_ReduceSum performs its own __syncthreads; no barrier is needed
    // here since no shared memory has been touched yet in this kernel.
    sum = block_ReduceSum<T3, 32>( this_thread_block(), sum) ;

    // thread 0 writes this block's partial result to global memory.
    // (The debug printf that was here used %d for an unsigned long and %g
    // for an arbitrary T3 — undefined behavior — and has been removed.)
    if (tid == 0)
    {
        g_odata [b] = sum ;
    }
}
202 
203