1 #include <stdio.h>
2 
3 // __device__ function
// Builds two typed views into the buffer `array`.
//
// `array` is not declared inside this function and must be visible at file
// scope. NOTE(review): presumably the dynamically sized shared buffer
// (`extern __shared__ float array[];`) -- confirm against its declaration.
//
//   array0 - the start of the buffer viewed as short.
//   array1 - a float view beginning immediately after the first 128 shorts.
//
// Both pointers are locals that are never dereferenced or returned, so the
// function has no observable effect; it only shows how to partition a buffer.
__device__ void func()
{
    short* array0 = (short*)array;

    // A pointer must be aligned to the type it points to. 128 shorts are
    // 256 bytes, a multiple of sizeof(float), so array1 is float-aligned as
    // long as `array` itself is. The previous offset of 127 shorts (254 bytes)
    // yielded a float pointer that was only 2-byte aligned.
    float* array1 = (float*)&array0[128];
}
9 
10 /* __global__ function */
/*
 * Skeleton of a block-level reduction kernel.
 *
 * As written the body only synchronizes: it does not read `input` and does
 * not write `output` or `timer`.
 *
 *   input  - read-only source values (unused in this body)
 *   output - destination for results (unused in this body)
 *   timer  - destination for clock samples (unused in this body)
 *
 * Dynamic shared memory is declared through `shared`; its size is whatever
 * the launch passes as the third execution-configuration argument.
 * Every thread of the block must execute the loop, because each iteration
 * contains a block-wide barrier.
 */
__global__ static void reduction(const float* __restrict__ input, float *output, clock_t *timer)
{
    // __shared__ float shared[2 * blockDim.x];
    extern __shared__ float shared[];

    const int threadInBlock = threadIdx.x;
    const int blockInGrid = blockIdx.x;

    // Only the first thread of each block issues the memory fence.
    if (threadInBlock == 0)
        __threadfence();

    // Reduction passes: start at the block width and halve each time until it
    // reaches zero. Each pass currently consists of a barrier only.
    int width = blockDim.x;
    while (width > 0) {
        __syncthreads();
        width /= 2;
    }
}
29 
// Host entry point: launches the kernel once, waits for it to finish, and
// resets the device.
//
// Returns 0 on success and 1 if the launch, the kernel execution, or the
// device reset reports a CUDA error (the error string is printed to stderr).
//
// NOTE(review): `timedReduction`, `dinput`, `doutput` and `dtimer` are not
// declared in this function; they must be provided elsewhere. The only kernel
// visible in this file with a matching parameter list is named `reduction` --
// confirm which kernel is meant to be launched.
int main(int argc, char **argv)
{
    // Passed as the FIRST execution-configuration argument below, so despite
    // its name this is the grid shape: 8 x 8 x 1 blocks.
    // NOTE(review): confirm the grid/block argument order is intended.
    dim3 dimBlock(8, 8, 1);

    // Execution configuration: grid = dimBlock, 256 threads per block,
    // 256 bytes of dynamic shared memory, stream 0.
    timedReduction<<<dimBlock, 256, 256, 0>>>(dinput, doutput, dtimer);

    // A kernel launch returns nothing; configuration errors are only visible
    // through cudaGetLastError().
    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(status));
        cudaDeviceReset();
        return 1;
    }

    // Launches are asynchronous: wait here so that faults raised while the
    // kernel runs are reported before the device is torn down.
    status = cudaDeviceSynchronize();
    if (status != cudaSuccess) {
        fprintf(stderr, "kernel execution failed: %s\n", cudaGetErrorString(status));
        cudaDeviceReset();
        return 1;
    }

    status = cudaDeviceReset();
    if (status != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    return 0;
}
37