1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
10 #define EIGEN_TEST_NO_LONGDOUBLE
11 #define EIGEN_TEST_NO_COMPLEX
12 #define EIGEN_TEST_FUNC cxx11_tensor_device
13 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
14 #define EIGEN_USE_GPU
15
16 #include "main.h"
17 #include <unsupported/Eigen/CXX11/Tensor>
18
19 using Eigen::Tensor;
20 using Eigen::RowMajor;
21
22 // Context for evaluation on cpu
23 struct CPUContext {
CPUContextCPUContext24 CPUContext(const Eigen::Tensor<float, 3>& in1, Eigen::Tensor<float, 3>& in2, Eigen::Tensor<float, 3>& out) : in1_(in1), in2_(in2), out_(out), kernel_1d_(2), kernel_2d_(2,2), kernel_3d_(2,2,2) {
25 kernel_1d_(0) = 3.14f;
26 kernel_1d_(1) = 2.7f;
27
28 kernel_2d_(0,0) = 3.14f;
29 kernel_2d_(1,0) = 2.7f;
30 kernel_2d_(0,1) = 0.2f;
31 kernel_2d_(1,1) = 7.0f;
32
33 kernel_3d_(0,0,0) = 3.14f;
34 kernel_3d_(0,1,0) = 2.7f;
35 kernel_3d_(0,0,1) = 0.2f;
36 kernel_3d_(0,1,1) = 7.0f;
37 kernel_3d_(1,0,0) = -1.0f;
38 kernel_3d_(1,1,0) = -0.3f;
39 kernel_3d_(1,0,1) = -0.7f;
40 kernel_3d_(1,1,1) = -0.5f;
41 }
42
deviceCPUContext43 const Eigen::DefaultDevice& device() const { return cpu_device_; }
44
in1CPUContext45 const Eigen::Tensor<float, 3>& in1() const { return in1_; }
in2CPUContext46 const Eigen::Tensor<float, 3>& in2() const { return in2_; }
outCPUContext47 Eigen::Tensor<float, 3>& out() { return out_; }
kernel1dCPUContext48 const Eigen::Tensor<float, 1>& kernel1d() const { return kernel_1d_; }
kernel2dCPUContext49 const Eigen::Tensor<float, 2>& kernel2d() const { return kernel_2d_; }
kernel3dCPUContext50 const Eigen::Tensor<float, 3>& kernel3d() const { return kernel_3d_; }
51
52 private:
53 const Eigen::Tensor<float, 3>& in1_;
54 const Eigen::Tensor<float, 3>& in2_;
55 Eigen::Tensor<float, 3>& out_;
56
57 Eigen::Tensor<float, 1> kernel_1d_;
58 Eigen::Tensor<float, 2> kernel_2d_;
59 Eigen::Tensor<float, 3> kernel_3d_;
60
61 Eigen::DefaultDevice cpu_device_;
62 };
63
64
65 // Context for evaluation on GPU
66 struct GPUContext {
GPUContextGPUContext67 GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) {
68 assert(cudaMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == cudaSuccess);
69 float kernel_1d_val[] = {3.14f, 2.7f};
70 assert(cudaMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);
71
72 assert(cudaMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == cudaSuccess);
73 float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f};
74 assert(cudaMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);
75
76 assert(cudaMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == cudaSuccess);
77 float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f};
78 assert(cudaMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);
79 }
~GPUContextGPUContext80 ~GPUContext() {
81 assert(cudaFree(kernel_1d_) == cudaSuccess);
82 assert(cudaFree(kernel_2d_) == cudaSuccess);
83 assert(cudaFree(kernel_3d_) == cudaSuccess);
84 }
85
deviceGPUContext86 const Eigen::GpuDevice& device() const { return gpu_device_; }
87
in1GPUContext88 const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1() const { return in1_; }
in2GPUContext89 const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2() const { return in2_; }
outGPUContext90 Eigen::TensorMap<Eigen::Tensor<float, 3> >& out() { return out_; }
kernel1dGPUContext91 Eigen::TensorMap<Eigen::Tensor<float, 1> > kernel1d() const { return Eigen::TensorMap<Eigen::Tensor<float, 1> >(kernel_1d_, 2); }
kernel2dGPUContext92 Eigen::TensorMap<Eigen::Tensor<float, 2> > kernel2d() const { return Eigen::TensorMap<Eigen::Tensor<float, 2> >(kernel_2d_, 2, 2); }
kernel3dGPUContext93 Eigen::TensorMap<Eigen::Tensor<float, 3> > kernel3d() const { return Eigen::TensorMap<Eigen::Tensor<float, 3> >(kernel_3d_, 2, 2, 2); }
94
95 private:
96 const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1_;
97 const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2_;
98 Eigen::TensorMap<Eigen::Tensor<float, 3> >& out_;
99
100 float* kernel_1d_;
101 float* kernel_2d_;
102 float* kernel_3d_;
103
104 Eigen::CudaStreamDevice stream_;
105 Eigen::GpuDevice gpu_device_;
106 };
107
108
// Evaluate out = in1 + in2 * 3.14 + 2.718 on the context's device.
template <typename Context>
void test_contextual_eval(Context* context)
{
  context->out().device(context->device()) =
      context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f);
}
115
// Same computation as test_contextual_eval, but .eval() materializes the
// (in1 + in2) sum first, so out = (in1 + in2) * 3.14 + 2.718.
template <typename Context>
void test_forced_contextual_eval(Context* context)
{
  context->out().device(context->device()) =
      (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f);
}
121
// Build the same result as test_contextual_eval in two steps: assign the
// constant term, then accumulate the rest with operator+=.
template <typename Context>
void test_compound_assignment(Context* context)
{
  context->out().device(context->device()) = context->in1().constant(2.718f);
  context->out().device(context->device()) += context->in1() + context->in2() * 3.14f;
}
128
129
130 template <typename Context>
test_contraction(Context * context)131 void test_contraction(Context* context)
132 {
133 Eigen::array<std::pair<int, int>, 2> dims;
134 dims[0] = std::make_pair(1, 1);
135 dims[1] = std::make_pair(2, 2);
136
137 Eigen::array<int, 2> shape(40, 50*70);
138
139 Eigen::DSizes<int, 2> indices(0,0);
140 Eigen::DSizes<int, 2> sizes(40,40);
141
142 context->out().reshape(shape).slice(indices, sizes).device(context->device()) = context->in1().contract(context->in2(), dims);
143 }
144
145
146 template <typename Context>
test_1d_convolution(Context * context)147 void test_1d_convolution(Context* context)
148 {
149 Eigen::DSizes<int, 3> indices(0,0,0);
150 Eigen::DSizes<int, 3> sizes(40,49,70);
151
152 Eigen::array<int, 1> dims(1);
153 context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel1d(), dims);
154 }
155
156 template <typename Context>
test_2d_convolution(Context * context)157 void test_2d_convolution(Context* context)
158 {
159 Eigen::DSizes<int, 3> indices(0,0,0);
160 Eigen::DSizes<int, 3> sizes(40,49,69);
161
162 Eigen::array<int, 2> dims(1,2);
163 context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel2d(), dims);
164 }
165
166 template <typename Context>
test_3d_convolution(Context * context)167 void test_3d_convolution(Context* context)
168 {
169 Eigen::DSizes<int, 3> indices(0,0,0);
170 Eigen::DSizes<int, 3> sizes(39,49,69);
171
172 Eigen::array<int, 3> dims(0,1,2);
173 context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims);
174 }
175
176
test_cpu()177 void test_cpu() {
178 Eigen::Tensor<float, 3> in1(40,50,70);
179 Eigen::Tensor<float, 3> in2(40,50,70);
180 Eigen::Tensor<float, 3> out(40,50,70);
181
182 in1 = in1.random() + in1.constant(10.0f);
183 in2 = in2.random() + in2.constant(10.0f);
184
185 CPUContext context(in1, in2, out);
186 test_contextual_eval(&context);
187 for (int i = 0; i < 40; ++i) {
188 for (int j = 0; j < 50; ++j) {
189 for (int k = 0; k < 70; ++k) {
190 VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
191 }
192 }
193 }
194
195 test_forced_contextual_eval(&context);
196 for (int i = 0; i < 40; ++i) {
197 for (int j = 0; j < 50; ++j) {
198 for (int k = 0; k < 70; ++k) {
199 VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
200 }
201 }
202 }
203
204 test_compound_assignment(&context);
205 for (int i = 0; i < 40; ++i) {
206 for (int j = 0; j < 50; ++j) {
207 for (int k = 0; k < 70; ++k) {
208 VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
209 }
210 }
211 }
212
213 test_contraction(&context);
214 for (int i = 0; i < 40; ++i) {
215 for (int j = 0; j < 40; ++j) {
216 const float result = out(i,j,0);
217 float expected = 0;
218 for (int k = 0; k < 50; ++k) {
219 for (int l = 0; l < 70; ++l) {
220 expected += in1(i, k, l) * in2(j, k, l);
221 }
222 }
223 VERIFY_IS_APPROX(expected, result);
224 }
225 }
226
227 test_1d_convolution(&context);
228 for (int i = 0; i < 40; ++i) {
229 for (int j = 0; j < 49; ++j) {
230 for (int k = 0; k < 70; ++k) {
231 VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
232 }
233 }
234 }
235
236 test_2d_convolution(&context);
237 for (int i = 0; i < 40; ++i) {
238 for (int j = 0; j < 49; ++j) {
239 for (int k = 0; k < 69; ++k) {
240 const float result = out(i,j,k);
241 const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f) +
242 (in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
243 if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
244 continue;
245 }
246 VERIFY_IS_APPROX(expected, result);
247 }
248 }
249 }
250
251 test_3d_convolution(&context);
252 for (int i = 0; i < 39; ++i) {
253 for (int j = 0; j < 49; ++j) {
254 for (int k = 0; k < 69; ++k) {
255 const float result = out(i,j,k);
256 const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
257 in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f) +
258 (in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
259 in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
260 if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
261 continue;
262 }
263 VERIFY_IS_APPROX(expected, result);
264 }
265 }
266 }
267 }
268
test_gpu()269 void test_gpu() {
270 Eigen::Tensor<float, 3> in1(40,50,70);
271 Eigen::Tensor<float, 3> in2(40,50,70);
272 Eigen::Tensor<float, 3> out(40,50,70);
273 in1 = in1.random() + in1.constant(10.0f);
274 in2 = in2.random() + in2.constant(10.0f);
275
276 std::size_t in1_bytes = in1.size() * sizeof(float);
277 std::size_t in2_bytes = in2.size() * sizeof(float);
278 std::size_t out_bytes = out.size() * sizeof(float);
279
280 float* d_in1;
281 float* d_in2;
282 float* d_out;
283 cudaMalloc((void**)(&d_in1), in1_bytes);
284 cudaMalloc((void**)(&d_in2), in2_bytes);
285 cudaMalloc((void**)(&d_out), out_bytes);
286
287 cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
288 cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
289
290 Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70);
291 Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70);
292 Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, 40,50,70);
293
294 GPUContext context(gpu_in1, gpu_in2, gpu_out);
295 test_contextual_eval(&context);
296 assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
297 for (int i = 0; i < 40; ++i) {
298 for (int j = 0; j < 50; ++j) {
299 for (int k = 0; k < 70; ++k) {
300 VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
301 }
302 }
303 }
304
305 test_forced_contextual_eval(&context);
306 assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
307 for (int i = 0; i < 40; ++i) {
308 for (int j = 0; j < 50; ++j) {
309 for (int k = 0; k < 70; ++k) {
310 VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
311 }
312 }
313 }
314
315 test_compound_assignment(&context);
316 assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
317 for (int i = 0; i < 40; ++i) {
318 for (int j = 0; j < 50; ++j) {
319 for (int k = 0; k < 70; ++k) {
320 VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
321 }
322 }
323 }
324
325 test_contraction(&context);
326 assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess);
327 for (int i = 0; i < 40; ++i) {
328 for (int j = 0; j < 40; ++j) {
329 const float result = out(i,j,0);
330 float expected = 0;
331 for (int k = 0; k < 50; ++k) {
332 for (int l = 0; l < 70; ++l) {
333 expected += in1(i, k, l) * in2(j, k, l);
334 }
335 }
336 VERIFY_IS_APPROX(expected, result);
337 }
338 }
339
340 test_1d_convolution(&context);
341 assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess);
342 assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess);
343 for (int i = 0; i < 40; ++i) {
344 for (int j = 0; j < 49; ++j) {
345 for (int k = 0; k < 70; ++k) {
346 VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
347 }
348 }
349 }
350
351 test_2d_convolution(&context);
352 assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess);
353 assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess);
354 for (int i = 0; i < 40; ++i) {
355 for (int j = 0; j < 49; ++j) {
356 for (int k = 0; k < 69; ++k) {
357 const float result = out(i,j,k);
358 const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
359 in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
360 VERIFY_IS_APPROX(expected, result);
361 }
362 }
363 }
364
365 test_3d_convolution(&context);
366 assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess);
367 assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess);
368 for (int i = 0; i < 39; ++i) {
369 for (int j = 0; j < 49; ++j) {
370 for (int k = 0; k < 69; ++k) {
371 const float result = out(i,j,k);
372 const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
373 in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f +
374 in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
375 in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
376 VERIFY_IS_APPROX(expected, result);
377 }
378 }
379 }
380 }
381
382
test_cxx11_tensor_device()383 void test_cxx11_tensor_device()
384 {
385 CALL_SUBTEST_1(test_cpu());
386 CALL_SUBTEST_2(test_gpu());
387 }
388