// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_reduction_cuda
#define EIGEN_USE_GPU

#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>


template<typename Type, int DataLayout>
static void test_full_reductions() {

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  const int num_rows = internal::random<int>(1024, 5*1024);
  const int num_cols = internal::random<int>(1024, 5*1024);

  Tensor<Type, 2, DataLayout> in(num_rows, num_cols);
  in.setRandom();

  Tensor<Type, 0, DataLayout> full_redux;
  full_redux = in.sum();

  std::size_t in_bytes = in.size() * sizeof(Type);
  std::size_t out_bytes = full_redux.size() * sizeof(Type);
  Type* gpu_in_ptr = static_cast<Type*>(gpu_device.allocate(in_bytes));
  Type* gpu_out_ptr = static_cast<Type*>(gpu_device.allocate(out_bytes));
  gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);

  TensorMap<Tensor<Type, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols);
  TensorMap<Tensor<Type, 0, DataLayout> > out_gpu(gpu_out_ptr);

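  // Run the same reduction on the GPU.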
  out_gpu.device(gpu_device) = in_gpu.sum();

  Tensor<Type, 0, DataLayout> full_redux_gpu;
  gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
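  // Wait for the device-to-host copy to finish before reading the result on the host.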
  gpu_device.synchronize();

  // Check that the CPU and GPU reductions return the same result.
  VERIFY_IS_APPROX(full_redux(), full_redux_gpu());

  gpu_device.deallocate(gpu_in_ptr);
  gpu_device.deallocate(gpu_out_ptr);
}

template<typename Type, int DataLayout>
static void test_first_dim_reductions() {
  int dim_x = 33;
  int dim_y = 1;
  int dim_z = 128;

  Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z);
  in.setRandom();

  Eigen::array<int, 1> red_axis;
  red_axis[0] = 0;
  Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);

  // Create device
  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice dev(&stream);

  // Create data
  Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type));
  Type* out_data = (Type*)dev.allocate(dim_z*dim_y*sizeof(Type));
  Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z);
  Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_y, dim_z);

  // Perform operation
  dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type));
  gpu_out.device(dev) = gpu_in.sum(red_axis);
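  // Accumulate a second identical reduction, so the GPU result should be twice the CPU sum.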
  gpu_out.device(dev) += gpu_in.sum(red_axis);
  Tensor<Type, 2, DataLayout> redux_gpu(dim_y, dim_z);
  dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type));
  dev.synchronize();

  // Check that the CPU and GPU reductions return the same result.
  for (int i = 0; i < gpu_out.size(); ++i) {
    VERIFY_IS_APPROX(2*redux(i), redux_gpu(i));
  }

  dev.deallocate(in_data);
  dev.deallocate(out_data);
}

template<typename Type, int DataLayout>
static void test_last_dim_reductions() {
  int dim_x = 128;
  int dim_y = 1;
  int dim_z = 33;

  Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z);
  in.setRandom();

  Eigen::array<int, 1> red_axis;
  red_axis[0] = 2;
  Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);

  // Create device
  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice dev(&stream);

  // Create data
  Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type));
  Type* out_data = (Type*)dev.allocate(dim_x*dim_y*sizeof(Type));
  Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z);
  Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_x, dim_y);

  // Perform operation
  dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type));
  gpu_out.device(dev) = gpu_in.sum(red_axis);
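  // As above, accumulate a second reduction; the expected result is twice the CPU sum.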
  gpu_out.device(dev) += gpu_in.sum(red_axis);
  Tensor<Type, 2, DataLayout> redux_gpu(dim_x, dim_y);
  dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type));
  dev.synchronize();

  // Check that the CPU and GPU reductions return the same result.
  for (int i = 0; i < gpu_out.size(); ++i) {
    VERIFY_IS_APPROX(2*redux(i), redux_gpu(i));
  }

  dev.deallocate(in_data);
  dev.deallocate(out_data);
}


void test_cxx11_tensor_reduction_cuda() {
  CALL_SUBTEST_1((test_full_reductions<float, ColMajor>()));
  CALL_SUBTEST_1((test_full_reductions<double, ColMajor>()));
  CALL_SUBTEST_2((test_full_reductions<float, RowMajor>()));
  CALL_SUBTEST_2((test_full_reductions<double, RowMajor>()));

  CALL_SUBTEST_3((test_first_dim_reductions<float, ColMajor>()));
  CALL_SUBTEST_3((test_first_dim_reductions<double, ColMajor>()));
  CALL_SUBTEST_4((test_first_dim_reductions<float, RowMajor>()));
// Outer reductions of doubles aren't supported just yet.
//  CALL_SUBTEST_4((test_first_dim_reductions<double, RowMajor>()))

  CALL_SUBTEST_5((test_last_dim_reductions<float, ColMajor>()));
// Outer reductions of doubles aren't supported just yet.
//  CALL_SUBTEST_5((test_last_dim_reductions<double, ColMajor>()));
  CALL_SUBTEST_6((test_last_dim_reductions<float, RowMajor>()));
  CALL_SUBTEST_6((test_last_dim_reductions<double, RowMajor>()));
}