#include <unittest/unittest.h>
#include <thrust/for_each.h>
#include <thrust/execution_policy.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <algorithm>

static const size_t NUM_REGISTERS = 64;

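// Recursive copy through a chain of stack temporaries: each level of f<N> keeps
// its own temp live across the recursive call, so instantiating f<NUM_REGISTERS>
// forces a large register footprint. The f<0> specialization ends the recursion.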
template <size_t N> __host__ __device__ void f   (int * x) { int temp = *x; f<N - 1>(x + 1); *x = temp; }
template <>         __host__ __device__ void f<0>(int * /*x*/) { }

template <size_t N>
struct CopyFunctorWithManyRegisters
{
  __host__ __device__
  void operator()(int * ptr)
  {
    f<N>(ptr);
  }
};


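// Verify that thrust::for_each can launch a functor whose per-thread register
// footprint is large (NUM_REGISTERS live values) without failing.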
void TestForEachLargeRegisterFootprint()
{
  int current_device = -1;
  cudaGetDevice(&current_device);
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, current_device);

  thrust::device_vector<int> data(NUM_REGISTERS, 12345);

  thrust::device_vector<int *> input(1, thrust::raw_pointer_cast(&data[0])); // length is irrelevant

  thrust::for_each(input.begin(), input.end(), CopyFunctorWithManyRegisters<NUM_REGISTERS>());
}
DECLARE_UNITTEST(TestForEachLargeRegisterFootprint);


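// Same register-footprint check for thrust::for_each_n.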
void TestForEachNLargeRegisterFootprint()
{
  int current_device = -1;
  cudaGetDevice(&current_device);
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, current_device);

  thrust::device_vector<int> data(NUM_REGISTERS, 12345);

  thrust::device_vector<int *> input(1, thrust::raw_pointer_cast(&data[0])); // length is irrelevant

  thrust::for_each_n(input.begin(), input.size(), CopyFunctorWithManyRegisters<NUM_REGISTERS>());
}
DECLARE_UNITTEST(TestForEachNLargeRegisterFootprint);


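// Marks which input values were visited by writing 1 at index x of ptr.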
template <typename T>
struct mark_present_for_each
{
  T * ptr;
  __host__ __device__ void
  operator()(T x){ ptr[(int) x] = 1; }
};


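// Kernel wrapper that invokes thrust::for_each from device code with the
// supplied execution policy.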
template<typename ExecutionPolicy, typename Iterator, typename Function>
__global__ void for_each_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Function f)
{
  thrust::for_each(exec, first, last, f);
}


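// Run for_each on the host and, via a single-thread kernel, with thrust::seq on
// the device, then compare the marked outputs.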
template<typename T>
void TestForEachDeviceSeq(const size_t n)
{
  const size_t output_size = std::min((size_t) 10, 2 * n);

  thrust::host_vector<T> h_input = unittest::random_integers<T>(n);

  for(size_t i = 0; i < n; i++)
    h_input[i] = static_cast<T>(((size_t) h_input[i]) % output_size);

  thrust::device_vector<T> d_input = h_input;

  thrust::host_vector<T>   h_output(output_size, (T) 0);
  thrust::device_vector<T> d_output(output_size, (T) 0);

  mark_present_for_each<T> h_f;
  mark_present_for_each<T> d_f;
  h_f.ptr = &h_output[0];
  d_f.ptr = (&d_output[0]).get();

  thrust::for_each(h_input.begin(), h_input.end(), h_f);

  for_each_kernel<<<1,1>>>(thrust::seq, d_input.begin(), d_input.end(), d_f);
  cudaError_t const err = cudaDeviceSynchronize();
  ASSERT_EQUAL(cudaSuccess, err);

  ASSERT_EQUAL(h_output, d_output);
}
DECLARE_VARIABLE_UNITTEST(TestForEachDeviceSeq);


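// As above, but the in-kernel call uses thrust::device as the execution policy.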
template<typename T>
void TestForEachDeviceDevice(const size_t n)
{
  const size_t output_size = std::min((size_t) 10, 2 * n);

  thrust::host_vector<T> h_input = unittest::random_integers<T>(n);

  for(size_t i = 0; i < n; i++)
    h_input[i] = static_cast<T>(((size_t) h_input[i]) % output_size);

  thrust::device_vector<T> d_input = h_input;

  thrust::host_vector<T>   h_output(output_size, (T) 0);
  thrust::device_vector<T> d_output(output_size, (T) 0);

  mark_present_for_each<T> h_f;
  mark_present_for_each<T> d_f;
  h_f.ptr = &h_output[0];
  d_f.ptr = (&d_output[0]).get();

  thrust::for_each(h_input.begin(), h_input.end(), h_f);

  for_each_kernel<<<1,1>>>(thrust::device, d_input.begin(), d_input.end(), d_f);
  {
    cudaError_t const err = cudaGetLastError();
    ASSERT_EQUAL(cudaSuccess, err);
  }
  {
    cudaError_t const err = cudaDeviceSynchronize();
    ASSERT_EQUAL(cudaSuccess, err);
  }

  ASSERT_EQUAL(h_output, d_output);
}
DECLARE_VARIABLE_UNITTEST(TestForEachDeviceDevice);


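// Kernel wrapper that invokes thrust::for_each_n from device code with the
// supplied execution policy.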
template<typename ExecutionPolicy, typename Iterator, typename Size, typename Function>
__global__
void for_each_n_kernel(ExecutionPolicy exec, Iterator first, Size n, Function f)
{
  thrust::for_each_n(exec, first, n, f);
}


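// for_each_n counterpart of TestForEachDeviceSeq: host for_each_n versus an
// in-kernel for_each_n(thrust::seq, ...).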
template<typename T>
void TestForEachNDeviceSeq(const size_t n)
{
  const size_t output_size = std::min((size_t) 10, 2 * n);

  thrust::host_vector<T> h_input = unittest::random_integers<T>(n);

  for(size_t i = 0; i < n; i++)
    h_input[i] = static_cast<T>(((size_t) h_input[i]) % output_size);

  thrust::device_vector<T> d_input = h_input;

  thrust::host_vector<T>   h_output(output_size, (T) 0);
  thrust::device_vector<T> d_output(output_size, (T) 0);

  mark_present_for_each<T> h_f;
  mark_present_for_each<T> d_f;
  h_f.ptr = &h_output[0];
  d_f.ptr = (&d_output[0]).get();

  thrust::for_each_n(h_input.begin(), h_input.size(), h_f);

  for_each_n_kernel<<<1,1>>>(thrust::seq, d_input.begin(), d_input.size(), d_f);
  cudaError_t const err = cudaDeviceSynchronize();
  ASSERT_EQUAL(cudaSuccess, err);

  ASSERT_EQUAL(h_output, d_output);
}
DECLARE_VARIABLE_UNITTEST(TestForEachNDeviceSeq);


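// As above, but the in-kernel for_each_n uses thrust::device as the execution policy.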
template<typename T>
void TestForEachNDeviceDevice(const size_t n)
{
  const size_t output_size = std::min((size_t) 10, 2 * n);

  thrust::host_vector<T> h_input = unittest::random_integers<T>(n);

  for(size_t i = 0; i < n; i++)
    h_input[i] = static_cast<T>(((size_t) h_input[i]) % output_size);

  thrust::device_vector<T> d_input = h_input;

  thrust::host_vector<T>   h_output(output_size, (T) 0);
  thrust::device_vector<T> d_output(output_size, (T) 0);

  mark_present_for_each<T> h_f;
  mark_present_for_each<T> d_f;
  h_f.ptr = &h_output[0];
  d_f.ptr = (&d_output[0]).get();

  thrust::for_each_n(h_input.begin(), h_input.size(), h_f);

  for_each_n_kernel<<<1,1>>>(thrust::device, d_input.begin(), d_input.size(), d_f);
  cudaError_t const err = cudaDeviceSynchronize();
  ASSERT_EQUAL(cudaSuccess, err);

  ASSERT_EQUAL(h_output, d_output);
}
DECLARE_VARIABLE_UNITTEST(TestForEachNDeviceDevice);


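// Run for_each on an explicitly created CUDA stream via thrust::cuda::par.on(s)
// and check the marked output after synchronizing the stream.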
void TestForEachCudaStreams()
{
  cudaStream_t s;
  cudaStreamCreate(&s);

  thrust::device_vector<int> input(5);
  thrust::device_vector<int> output(7, 0);

  input[0] = 3; input[1] = 2; input[2] = 3; input[3] = 4; input[4] = 6;

  mark_present_for_each<int> f;
  f.ptr = thrust::raw_pointer_cast(output.data());

  thrust::for_each(thrust::cuda::par.on(s), input.begin(), input.end(), f);

  cudaStreamSynchronize(s);

  ASSERT_EQUAL(output[0], 0);
  ASSERT_EQUAL(output[1], 0);
  ASSERT_EQUAL(output[2], 1);
  ASSERT_EQUAL(output[3], 1);
  ASSERT_EQUAL(output[4], 1);
  ASSERT_EQUAL(output[5], 0);
  ASSERT_EQUAL(output[6], 1);

  cudaStreamDestroy(s);
}
DECLARE_UNITTEST(TestForEachCudaStreams);