#include #include #include #include static const size_t NUM_REGISTERS = 64; template __host__ __device__ void f (int * x) { int temp = *x; f(x + 1); *x = temp;}; template <> __host__ __device__ void f<0>(int * /*x*/) { } template struct CopyFunctorWithManyRegisters { __host__ __device__ void operator()(int * ptr) { f(ptr); } }; void TestForEachLargeRegisterFootprint() { int current_device = -1; cudaGetDevice(¤t_device); cudaDeviceProp prop; cudaGetDeviceProperties(&prop, current_device); thrust::device_vector data(NUM_REGISTERS, 12345); thrust::device_vector input(1, thrust::raw_pointer_cast(&data[0])); // length is irrelevant thrust::for_each(input.begin(), input.end(), CopyFunctorWithManyRegisters()); } DECLARE_UNITTEST(TestForEachLargeRegisterFootprint); void TestForEachNLargeRegisterFootprint() { int current_device = -1; cudaGetDevice(¤t_device); cudaDeviceProp prop; cudaGetDeviceProperties(&prop, current_device); thrust::device_vector data(NUM_REGISTERS, 12345); thrust::device_vector input(1, thrust::raw_pointer_cast(&data[0])); // length is irrelevant thrust::for_each_n(input.begin(), input.size(), CopyFunctorWithManyRegisters()); } DECLARE_UNITTEST(TestForEachNLargeRegisterFootprint); template struct mark_present_for_each { T * ptr; __host__ __device__ void operator()(T x){ ptr[(int) x] = 1; } }; template __global__ void for_each_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Function f) { thrust::for_each(exec, first, last, f); } template void TestForEachDeviceSeq(const size_t n) { const size_t output_size = std::min((size_t) 10, 2 * n); thrust::host_vector h_input = unittest::random_integers(n); for(size_t i = 0; i < n; i++) h_input[i] = ((size_t) h_input[i]) % output_size; thrust::device_vector d_input = h_input; thrust::host_vector h_output(output_size, (T) 0); thrust::device_vector d_output(output_size, (T) 0); mark_present_for_each h_f; mark_present_for_each d_f; h_f.ptr = &h_output[0]; d_f.ptr = (&d_output[0]).get(); thrust::for_each(h_input.begin(), h_input.end(), h_f); for_each_kernel<<<1,1>>>(thrust::seq, d_input.begin(), d_input.end(), d_f); cudaError_t const err = cudaDeviceSynchronize(); ASSERT_EQUAL(cudaSuccess, err); ASSERT_EQUAL(h_output, d_output); } DECLARE_VARIABLE_UNITTEST(TestForEachDeviceSeq); template void TestForEachDeviceDevice(const size_t n) { const size_t output_size = std::min((size_t) 10, 2 * n); thrust::host_vector h_input = unittest::random_integers(n); for(size_t i = 0; i < n; i++) h_input[i] = ((size_t) h_input[i]) % output_size; thrust::device_vector d_input = h_input; thrust::host_vector h_output(output_size, (T) 0); thrust::device_vector d_output(output_size, (T) 0); mark_present_for_each h_f; mark_present_for_each d_f; h_f.ptr = &h_output[0]; d_f.ptr = (&d_output[0]).get(); thrust::for_each(h_input.begin(), h_input.end(), h_f); for_each_kernel<<<1,1>>>(thrust::device, d_input.begin(), d_input.end(), d_f); { cudaError_t const err = cudaGetLastError(); ASSERT_EQUAL(cudaSuccess, err); } { cudaError_t const err = cudaDeviceSynchronize(); ASSERT_EQUAL(cudaSuccess, err); } ASSERT_EQUAL(h_output, d_output); } DECLARE_VARIABLE_UNITTEST(TestForEachDeviceDevice); template __global__ void for_each_n_kernel(ExecutionPolicy exec, Iterator first, Size n, Function f) { thrust::for_each_n(exec, first, n, f); } template void TestForEachNDeviceSeq(const size_t n) { const size_t output_size = std::min((size_t) 10, 2 * n); thrust::host_vector h_input = unittest::random_integers(n); for(size_t i = 0; i < n; i++) h_input[i] = static_cast(((size_t) h_input[i]) % output_size); thrust::device_vector d_input = h_input; thrust::host_vector h_output(output_size, (T) 0); thrust::device_vector d_output(output_size, (T) 0); mark_present_for_each h_f; mark_present_for_each d_f; h_f.ptr = &h_output[0]; d_f.ptr = (&d_output[0]).get(); thrust::for_each_n(h_input.begin(), h_input.size(), h_f); for_each_n_kernel<<<1,1>>>(thrust::seq, d_input.begin(), d_input.size(), d_f); cudaError_t const err = cudaDeviceSynchronize(); ASSERT_EQUAL(cudaSuccess, err); ASSERT_EQUAL(h_output, d_output); } DECLARE_VARIABLE_UNITTEST(TestForEachNDeviceSeq); template void TestForEachNDeviceDevice(const size_t n) { const size_t output_size = std::min((size_t) 10, 2 * n); thrust::host_vector h_input = unittest::random_integers(n); for(size_t i = 0; i < n; i++) h_input[i] = static_cast(((size_t) h_input[i]) % output_size); thrust::device_vector d_input = h_input; thrust::host_vector h_output(output_size, (T) 0); thrust::device_vector d_output(output_size, (T) 0); mark_present_for_each h_f; mark_present_for_each d_f; h_f.ptr = &h_output[0]; d_f.ptr = (&d_output[0]).get(); thrust::for_each_n(h_input.begin(), h_input.size(), h_f); for_each_n_kernel<<<1,1>>>(thrust::device, d_input.begin(), d_input.size(), d_f); cudaError_t const err = cudaDeviceSynchronize(); ASSERT_EQUAL(cudaSuccess, err); ASSERT_EQUAL(h_output, d_output); } DECLARE_VARIABLE_UNITTEST(TestForEachNDeviceDevice); void TestForEachCudaStreams() { cudaStream_t s; cudaStreamCreate(&s); thrust::device_vector input(5); thrust::device_vector output(7, 0); input[0] = 3; input[1] = 2; input[2] = 3; input[3] = 4; input[4] = 6; mark_present_for_each f; f.ptr = thrust::raw_pointer_cast(output.data()); thrust::for_each(thrust::cuda::par.on(s), input.begin(), input.end(), f); cudaStreamSynchronize(s); ASSERT_EQUAL(output[0], 0); ASSERT_EQUAL(output[1], 0); ASSERT_EQUAL(output[2], 1); ASSERT_EQUAL(output[3], 1); ASSERT_EQUAL(output[4], 1); ASSERT_EQUAL(output[5], 0); ASSERT_EQUAL(output[6], 1); cudaStreamDestroy(s); } DECLARE_UNITTEST(TestForEachCudaStreams);