1 #include <unittest/unittest.h>
2 #include <thrust/for_each.h>
3 #include <thrust/execution_policy.h>
4 #include <algorithm>
5
// Number of ints shuffled by the large-register-footprint functor below.
static const size_t NUM_REGISTERS = 64;

// Recursively copies N ints through per-level local temporaries. Instantiating
// with a large N forces the compiler to keep many live values at once, giving
// the calling kernel a large per-thread register footprint. The net effect on
// the array contents is a no-op: each element is read, the tail is processed,
// then the original value is written back.
template <size_t N> __host__ __device__ void f(int * x) { int temp = *x; f<N - 1>(x + 1); *x = temp; }

// Base case: no elements left, nothing to do.
template <> __host__ __device__ void f<0>(int * /*x*/) { }
// Functor whose call operator instantiates f<N>, deliberately consuming many
// registers per thread; used below to stress for_each / for_each_n with
// kernels that have a large register footprint.
template <size_t N>
struct CopyFunctorWithManyRegisters
{
  __host__ __device__
  void operator()(int * first)
  {
    f<N>(first);
  }
};
19
20
// Regression test: for_each must work with a functor that requires many
// registers per thread (NUM_REGISTERS live ints via f<N>).
void TestForEachLargeRegisterFootprint()
{
  // Query the current device/properties; result is intentionally unused, the
  // calls simply ensure a CUDA context exists before the launch.
  int current_device = -1;
  cudaGetDevice(&current_device);
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, current_device);

  thrust::device_vector<int> data(NUM_REGISTERS, 12345);

  thrust::device_vector<int *> input(1, thrust::raw_pointer_cast(&data[0])); // length is irrelevant

  thrust::for_each(input.begin(), input.end(), CopyFunctorWithManyRegisters<NUM_REGISTERS>());
}
DECLARE_UNITTEST(TestForEachLargeRegisterFootprint);
35
36
// Regression test: for_each_n must work with a functor that requires many
// registers per thread (NUM_REGISTERS live ints via f<N>).
void TestForEachNLargeRegisterFootprint()
{
  // Query the current device/properties; result is intentionally unused, the
  // calls simply ensure a CUDA context exists before the launch.
  int current_device = -1;
  cudaGetDevice(&current_device);
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, current_device);

  thrust::device_vector<int> data(NUM_REGISTERS, 12345);

  thrust::device_vector<int *> input(1, thrust::raw_pointer_cast(&data[0])); // length is irrelevant

  thrust::for_each_n(input.begin(), input.size(), CopyFunctorWithManyRegisters<NUM_REGISTERS>());
}
DECLARE_UNITTEST(TestForEachNLargeRegisterFootprint);
51
52
// Records which values occur in a sequence: invoking the functor with value x
// sets ptr[(int) x] to 1.
template <typename T>
struct mark_present_for_each
{
  T * ptr; // flag array; must be long enough to index with any cast value

  __host__ __device__
  void operator()(T x)
  {
    ptr[(int) x] = 1;
  }
};
60
61
// Single-launch kernel wrapper: runs thrust::for_each from device code with
// the supplied execution policy (e.g. thrust::seq or thrust::device).
template<typename ExecutionPolicy, typename Iterator, typename Function>
__global__ void for_each_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Function fn)
{
  thrust::for_each(exec, first, last, fn);
}
67
68
// Checks that thrust::for_each run with thrust::seq inside a kernel marks the
// same output elements as a host-side thrust::for_each over identical input.
template<typename T>
void TestForEachDeviceSeq(const size_t n)
{
  const size_t output_size = std::min((size_t) 10, 2 * n);

  thrust::host_vector<T> h_input = unittest::random_integers<T>(n);

  // Clamp inputs so every value is a valid index into the output arrays;
  // static_cast matches the other tests in this file and avoids narrowing warnings.
  for(size_t i = 0; i < n; i++)
    h_input[i] = static_cast<T>(((size_t) h_input[i]) % output_size);

  thrust::device_vector<T> d_input = h_input;

  thrust::host_vector<T> h_output(output_size, (T) 0);
  thrust::device_vector<T> d_output(output_size, (T) 0);

  mark_present_for_each<T> h_f;
  mark_present_for_each<T> d_f;
  h_f.ptr = &h_output[0];
  d_f.ptr = (&d_output[0]).get();

  thrust::for_each(h_input.begin(), h_input.end(), h_f);

  for_each_kernel<<<1,1>>>(thrust::seq, d_input.begin(), d_input.end(), d_f);
  {
    // Catch launch-configuration errors immediately.
    cudaError_t const err = cudaGetLastError();
    ASSERT_EQUAL(cudaSuccess, err);
  }
  {
    // Catch asynchronous execution errors from the kernel itself.
    cudaError_t const err = cudaDeviceSynchronize();
    ASSERT_EQUAL(cudaSuccess, err);
  }

  ASSERT_EQUAL(h_output, d_output);
}
DECLARE_VARIABLE_UNITTEST(TestForEachDeviceSeq);
98
99
// Checks that thrust::for_each run with thrust::device inside a kernel marks
// the same output elements as a host-side thrust::for_each over identical input.
template<typename T>
void TestForEachDeviceDevice(const size_t n)
{
  const size_t output_size = std::min((size_t) 10, 2 * n);

  thrust::host_vector<T> h_input = unittest::random_integers<T>(n);

  // Clamp inputs so every value is a valid index into the output arrays;
  // static_cast matches the other tests in this file and avoids narrowing warnings.
  for(size_t i = 0; i < n; i++)
    h_input[i] = static_cast<T>(((size_t) h_input[i]) % output_size);

  thrust::device_vector<T> d_input = h_input;

  thrust::host_vector<T> h_output(output_size, (T) 0);
  thrust::device_vector<T> d_output(output_size, (T) 0);

  mark_present_for_each<T> h_f;
  mark_present_for_each<T> d_f;
  h_f.ptr = &h_output[0];
  d_f.ptr = (&d_output[0]).get();

  thrust::for_each(h_input.begin(), h_input.end(), h_f);

  for_each_kernel<<<1,1>>>(thrust::device, d_input.begin(), d_input.end(), d_f);
  {
    // Catch launch-configuration errors immediately.
    cudaError_t const err = cudaGetLastError();
    ASSERT_EQUAL(cudaSuccess, err);
  }
  {
    // Catch asynchronous execution errors from the kernel itself.
    cudaError_t const err = cudaDeviceSynchronize();
    ASSERT_EQUAL(cudaSuccess, err);
  }

  ASSERT_EQUAL(h_output, d_output);
}
DECLARE_VARIABLE_UNITTEST(TestForEachDeviceDevice);
135
136
// Single-launch kernel wrapper: runs thrust::for_each_n from device code with
// the supplied execution policy (e.g. thrust::seq or thrust::device).
template<typename ExecutionPolicy, typename Iterator, typename Size, typename Function>
__global__
void for_each_n_kernel(ExecutionPolicy exec, Iterator first, Size count, Function fn)
{
  thrust::for_each_n(exec, first, count, fn);
}
143
144
// Checks that thrust::for_each_n run with thrust::seq inside a kernel marks the
// same output elements as a host-side thrust::for_each_n over identical input.
template<typename T>
void TestForEachNDeviceSeq(const size_t n)
{
  const size_t output_size = std::min((size_t) 10, 2 * n);

  thrust::host_vector<T> h_input = unittest::random_integers<T>(n);

  // Clamp inputs so every value is a valid index into the output arrays.
  for(size_t i = 0; i < n; i++)
    h_input[i] = static_cast<T>(((size_t) h_input[i]) % output_size);

  thrust::device_vector<T> d_input = h_input;

  thrust::host_vector<T> h_output(output_size, (T) 0);
  thrust::device_vector<T> d_output(output_size, (T) 0);

  mark_present_for_each<T> h_f;
  mark_present_for_each<T> d_f;
  h_f.ptr = &h_output[0];
  d_f.ptr = (&d_output[0]).get();

  thrust::for_each_n(h_input.begin(), h_input.size(), h_f);

  for_each_n_kernel<<<1,1>>>(thrust::seq, d_input.begin(), d_input.size(), d_f);
  {
    // Catch launch-configuration errors immediately (previously unchecked).
    cudaError_t const err = cudaGetLastError();
    ASSERT_EQUAL(cudaSuccess, err);
  }
  {
    // Catch asynchronous execution errors from the kernel itself.
    cudaError_t const err = cudaDeviceSynchronize();
    ASSERT_EQUAL(cudaSuccess, err);
  }

  ASSERT_EQUAL(h_output, d_output);
}
DECLARE_VARIABLE_UNITTEST(TestForEachNDeviceSeq);
174
175
// Checks that thrust::for_each_n run with thrust::device inside a kernel marks
// the same output elements as a host-side thrust::for_each_n over identical input.
template<typename T>
void TestForEachNDeviceDevice(const size_t n)
{
  const size_t output_size = std::min((size_t) 10, 2 * n);

  thrust::host_vector<T> h_input = unittest::random_integers<T>(n);

  // Clamp inputs so every value is a valid index into the output arrays.
  for(size_t i = 0; i < n; i++)
    h_input[i] = static_cast<T>(((size_t) h_input[i]) % output_size);

  thrust::device_vector<T> d_input = h_input;

  thrust::host_vector<T> h_output(output_size, (T) 0);
  thrust::device_vector<T> d_output(output_size, (T) 0);

  mark_present_for_each<T> h_f;
  mark_present_for_each<T> d_f;
  h_f.ptr = &h_output[0];
  d_f.ptr = (&d_output[0]).get();

  thrust::for_each_n(h_input.begin(), h_input.size(), h_f);

  for_each_n_kernel<<<1,1>>>(thrust::device, d_input.begin(), d_input.size(), d_f);
  {
    // Catch launch-configuration errors immediately (previously unchecked).
    cudaError_t const err = cudaGetLastError();
    ASSERT_EQUAL(cudaSuccess, err);
  }
  {
    // Catch asynchronous execution errors from the kernel itself.
    cudaError_t const err = cudaDeviceSynchronize();
    ASSERT_EQUAL(cudaSuccess, err);
  }

  ASSERT_EQUAL(h_output, d_output);
}
DECLARE_VARIABLE_UNITTEST(TestForEachNDeviceDevice);
205
206
// Verifies that thrust::for_each executes correctly on a user-provided CUDA
// stream via thrust::cuda::par.on(s): values {3,2,3,4,6} mark exactly the
// output slots 2, 3, 4 and 6.
void TestForEachCudaStreams()
{
  cudaStream_t s;
  cudaStreamCreate(&s);

  thrust::device_vector<int> input(5);
  thrust::device_vector<int> output(7, 0);

  input[0] = 3; input[1] = 2; input[2] = 3; input[3] = 4; input[4] = 6;

  mark_present_for_each<int> marker;
  marker.ptr = thrust::raw_pointer_cast(output.data());

  thrust::for_each(thrust::cuda::par.on(s), input.begin(), input.end(), marker);

  // Make sure the stream-ordered work has finished before reading results.
  cudaStreamSynchronize(s);

  const int expected[7] = {0, 0, 1, 1, 1, 0, 1};
  for(int i = 0; i < 7; i++)
    ASSERT_EQUAL(output[i], expected[i]);

  cudaStreamDestroy(s);
}
DECLARE_UNITTEST(TestForEachCudaStreams);
235
236