#include <unittest/unittest.h>
#include <thrust/for_each.h>
#include <thrust/execution_policy.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <algorithm>

static const size_t NUM_REGISTERS = 64;

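// Recursive copy through a chain of stack temporaries: each level of f<N> keeps
// its own temp live across the recursive call, so instantiating f<NUM_REGISTERS>
// forces a large register footprint. The f<0> specialization ends the recursion.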
template <size_t N> __host__ __device__ void f   (int * x) { int temp = *x; f<N - 1>(x + 1); *x = temp; }
template <>         __host__ __device__ void f<0>(int * /*x*/) { }

template <size_t N>
struct CopyFunctorWithManyRegisters
{
  __host__ __device__
  void operator()(int * ptr)
  {
    f<N>(ptr);
  }
};


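// Verify that thrust::for_each can launch a functor whose per-thread register
// footprint is large (NUM_REGISTERS live values) without failing.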
void TestForEachLargeRegisterFootprint()
{
  int current_device = -1;
  cudaGetDevice(&current_device);
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, current_device);

  thrust::device_vector<int> data(NUM_REGISTERS, 12345);

  thrust::device_vector<int *> input(1, thrust::raw_pointer_cast(&data[0])); // length is irrelevant

  thrust::for_each(input.begin(), input.end(), CopyFunctorWithManyRegisters<NUM_REGISTERS>());
}
DECLARE_UNITTEST(TestForEachLargeRegisterFootprint);


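// Same register-footprint check for thrust::for_each_n.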
void TestForEachNLargeRegisterFootprint()
{
  int current_device = -1;
  cudaGetDevice(&current_device);
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, current_device);

  thrust::device_vector<int> data(NUM_REGISTERS, 12345);

  thrust::device_vector<int *> input(1, thrust::raw_pointer_cast(&data[0])); // length is irrelevant

  thrust::for_each_n(input.begin(), input.size(), CopyFunctorWithManyRegisters<NUM_REGISTERS>());
}
DECLARE_UNITTEST(TestForEachNLargeRegisterFootprint);


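// Marks which input values were visited by writing 1 at index x of ptr.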
template <typename T>
struct mark_present_for_each
{
  T * ptr;
  __host__ __device__ void
  operator()(T x){ ptr[(int) x] = 1; }
};


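// Kernel wrapper that invokes thrust::for_each from device code with the
// supplied execution policy.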
template<typename ExecutionPolicy, typename Iterator, typename Function>
__global__ void for_each_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Function f)
{
  thrust::for_each(exec, first, last, f);
}


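// Run for_each on the host and, via a single-thread kernel, with thrust::seq on
// the device, then compare the marked outputs.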
template<typename T>
void TestForEachDeviceSeq(const size_t n)
{
  const size_t output_size = std::min((size_t) 10, 2 * n);

  thrust::host_vector<T> h_input = unittest::random_integers<T>(n);

  for(size_t i = 0; i < n; i++)
    h_input[i] = static_cast<T>(((size_t) h_input[i]) % output_size);

  thrust::device_vector<T> d_input = h_input;

  thrust::host_vector<T>   h_output(output_size, (T) 0);
  thrust::device_vector<T> d_output(output_size, (T) 0);

  mark_present_for_each<T> h_f;
  mark_present_for_each<T> d_f;
  h_f.ptr = &h_output[0];
  d_f.ptr = (&d_output[0]).get();

  thrust::for_each(h_input.begin(), h_input.end(), h_f);

  for_each_kernel<<<1,1>>>(thrust::seq, d_input.begin(), d_input.end(), d_f);
  cudaError_t const err = cudaDeviceSynchronize();
  ASSERT_EQUAL(cudaSuccess, err);

  ASSERT_EQUAL(h_output, d_output);
}
DECLARE_VARIABLE_UNITTEST(TestForEachDeviceSeq);


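// As above, but the in-kernel call uses thrust::device as the execution policy.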
template<typename T>
void TestForEachDeviceDevice(const size_t n)
{
  const size_t output_size = std::min((size_t) 10, 2 * n);

  thrust::host_vector<T> h_input = unittest::random_integers<T>(n);

  for(size_t i = 0; i < n; i++)
    h_input[i] = static_cast<T>(((size_t) h_input[i]) % output_size);

  thrust::device_vector<T> d_input = h_input;

  thrust::host_vector<T>   h_output(output_size, (T) 0);
  thrust::device_vector<T> d_output(output_size, (T) 0);

  mark_present_for_each<T> h_f;
  mark_present_for_each<T> d_f;
  h_f.ptr = &h_output[0];
  d_f.ptr = (&d_output[0]).get();

  thrust::for_each(h_input.begin(), h_input.end(), h_f);

  for_each_kernel<<<1,1>>>(thrust::device, d_input.begin(), d_input.end(), d_f);
  {
    cudaError_t const err = cudaGetLastError();
    ASSERT_EQUAL(cudaSuccess, err);
  }
  {
    cudaError_t const err = cudaDeviceSynchronize();
    ASSERT_EQUAL(cudaSuccess, err);
  }

  ASSERT_EQUAL(h_output, d_output);
}
DECLARE_VARIABLE_UNITTEST(TestForEachDeviceDevice);


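// Kernel wrapper that invokes thrust::for_each_n from device code with the
// supplied execution policy.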
template<typename ExecutionPolicy, typename Iterator, typename Size, typename Function>
__global__
void for_each_n_kernel(ExecutionPolicy exec, Iterator first, Size n, Function f)
{
  thrust::for_each_n(exec, first, n, f);
}


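// for_each_n counterpart of TestForEachDeviceSeq: host for_each_n versus an
// in-kernel for_each_n(thrust::seq, ...).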
template<typename T>
void TestForEachNDeviceSeq(const size_t n)
{
  const size_t output_size = std::min((size_t) 10, 2 * n);

  thrust::host_vector<T> h_input = unittest::random_integers<T>(n);

  for(size_t i = 0; i < n; i++)
    h_input[i] = static_cast<T>(((size_t) h_input[i]) % output_size);

  thrust::device_vector<T> d_input = h_input;

  thrust::host_vector<T>   h_output(output_size, (T) 0);
  thrust::device_vector<T> d_output(output_size, (T) 0);

  mark_present_for_each<T> h_f;
  mark_present_for_each<T> d_f;
  h_f.ptr = &h_output[0];
  d_f.ptr = (&d_output[0]).get();

  thrust::for_each_n(h_input.begin(), h_input.size(), h_f);

  for_each_n_kernel<<<1,1>>>(thrust::seq, d_input.begin(), d_input.size(), d_f);
  cudaError_t const err = cudaDeviceSynchronize();
  ASSERT_EQUAL(cudaSuccess, err);

  ASSERT_EQUAL(h_output, d_output);
}
DECLARE_VARIABLE_UNITTEST(TestForEachNDeviceSeq);


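// As above, but the in-kernel for_each_n uses thrust::device as the execution policy.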
template<typename T>
void TestForEachNDeviceDevice(const size_t n)
{
  const size_t output_size = std::min((size_t) 10, 2 * n);

  thrust::host_vector<T> h_input = unittest::random_integers<T>(n);

  for(size_t i = 0; i < n; i++)
    h_input[i] = static_cast<T>(((size_t) h_input[i]) % output_size);

  thrust::device_vector<T> d_input = h_input;

  thrust::host_vector<T>   h_output(output_size, (T) 0);
  thrust::device_vector<T> d_output(output_size, (T) 0);

  mark_present_for_each<T> h_f;
  mark_present_for_each<T> d_f;
  h_f.ptr = &h_output[0];
  d_f.ptr = (&d_output[0]).get();

  thrust::for_each_n(h_input.begin(), h_input.size(), h_f);

  for_each_n_kernel<<<1,1>>>(thrust::device, d_input.begin(), d_input.size(), d_f);
  cudaError_t const err = cudaDeviceSynchronize();
  ASSERT_EQUAL(cudaSuccess, err);

  ASSERT_EQUAL(h_output, d_output);
}
DECLARE_VARIABLE_UNITTEST(TestForEachNDeviceDevice);


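// Run for_each on an explicitly created CUDA stream via thrust::cuda::par.on(s)
// and check the marked output after synchronizing the stream.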
void TestForEachCudaStreams()
{
  cudaStream_t s;
  cudaStreamCreate(&s);

  thrust::device_vector<int> input(5);
  thrust::device_vector<int> output(7, 0);

  input[0] = 3; input[1] = 2; input[2] = 3; input[3] = 4; input[4] = 6;

  mark_present_for_each<int> f;
  f.ptr = thrust::raw_pointer_cast(output.data());

  thrust::for_each(thrust::cuda::par.on(s), input.begin(), input.end(), f);

  cudaStreamSynchronize(s);

  ASSERT_EQUAL(output[0], 0);
  ASSERT_EQUAL(output[1], 0);
  ASSERT_EQUAL(output[2], 1);
  ASSERT_EQUAL(output[3], 1);
  ASSERT_EQUAL(output[4], 1);
  ASSERT_EQUAL(output[5], 0);
  ASSERT_EQUAL(output[6], 1);

  cudaStreamDestroy(s);
}
DECLARE_UNITTEST(TestForEachCudaStreams);