1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
10 #define EIGEN_TEST_NO_LONGDOUBLE
11 #define EIGEN_TEST_NO_COMPLEX
12 #define EIGEN_TEST_FUNC cxx11_tensor_cuda
13 #define EIGEN_USE_GPU
14
15 #include "main.h"
16 #include <unsupported/Eigen/CXX11/Tensor>
17
18 using Eigen::Tensor;
19
test_cuda_nullary()20 void test_cuda_nullary() {
21 Tensor<float, 1, 0, int> in1(2);
22 Tensor<float, 1, 0, int> in2(2);
23 in1.setRandom();
24 in2.setRandom();
25
26 std::size_t tensor_bytes = in1.size() * sizeof(float);
27
28 float* d_in1;
29 float* d_in2;
30 cudaMalloc((void**)(&d_in1), tensor_bytes);
31 cudaMalloc((void**)(&d_in2), tensor_bytes);
32 cudaMemcpy(d_in1, in1.data(), tensor_bytes, cudaMemcpyHostToDevice);
33 cudaMemcpy(d_in2, in2.data(), tensor_bytes, cudaMemcpyHostToDevice);
34
35 Eigen::CudaStreamDevice stream;
36 Eigen::GpuDevice gpu_device(&stream);
37
38 Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in1(
39 d_in1, 2);
40 Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in2(
41 d_in2, 2);
42
43 gpu_in1.device(gpu_device) = gpu_in1.constant(3.14f);
44 gpu_in2.device(gpu_device) = gpu_in2.random();
45
46 Tensor<float, 1, 0, int> new1(2);
47 Tensor<float, 1, 0, int> new2(2);
48
49 assert(cudaMemcpyAsync(new1.data(), d_in1, tensor_bytes, cudaMemcpyDeviceToHost,
50 gpu_device.stream()) == cudaSuccess);
51 assert(cudaMemcpyAsync(new2.data(), d_in2, tensor_bytes, cudaMemcpyDeviceToHost,
52 gpu_device.stream()) == cudaSuccess);
53
54 assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
55
56 for (int i = 0; i < 2; ++i) {
57 VERIFY_IS_APPROX(new1(i), 3.14f);
58 VERIFY_IS_NOT_EQUAL(new2(i), in2(i));
59 }
60
61 cudaFree(d_in1);
62 cudaFree(d_in2);
63 }
64
test_cuda_elementwise_small()65 void test_cuda_elementwise_small() {
66 Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>(2));
67 Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>(2));
68 Tensor<float, 1> out(Eigen::array<Eigen::DenseIndex, 1>(2));
69 in1.setRandom();
70 in2.setRandom();
71
72 std::size_t in1_bytes = in1.size() * sizeof(float);
73 std::size_t in2_bytes = in2.size() * sizeof(float);
74 std::size_t out_bytes = out.size() * sizeof(float);
75
76 float* d_in1;
77 float* d_in2;
78 float* d_out;
79 cudaMalloc((void**)(&d_in1), in1_bytes);
80 cudaMalloc((void**)(&d_in2), in2_bytes);
81 cudaMalloc((void**)(&d_out), out_bytes);
82
83 cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
84 cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
85
86 Eigen::CudaStreamDevice stream;
87 Eigen::GpuDevice gpu_device(&stream);
88
89 Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
90 d_in1, Eigen::array<Eigen::DenseIndex, 1>(2));
91 Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in2(
92 d_in2, Eigen::array<Eigen::DenseIndex, 1>(2));
93 Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_out(
94 d_out, Eigen::array<Eigen::DenseIndex, 1>(2));
95
96 gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
97
98 assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost,
99 gpu_device.stream()) == cudaSuccess);
100 assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
101
102 for (int i = 0; i < 2; ++i) {
103 VERIFY_IS_APPROX(
104 out(Eigen::array<Eigen::DenseIndex, 1>(i)),
105 in1(Eigen::array<Eigen::DenseIndex, 1>(i)) + in2(Eigen::array<Eigen::DenseIndex, 1>(i)));
106 }
107
108 cudaFree(d_in1);
109 cudaFree(d_in2);
110 cudaFree(d_out);
111 }
112
test_cuda_elementwise()113 void test_cuda_elementwise()
114 {
115 Tensor<float, 3> in1(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
116 Tensor<float, 3> in2(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
117 Tensor<float, 3> in3(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
118 Tensor<float, 3> out(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
119 in1.setRandom();
120 in2.setRandom();
121 in3.setRandom();
122
123 std::size_t in1_bytes = in1.size() * sizeof(float);
124 std::size_t in2_bytes = in2.size() * sizeof(float);
125 std::size_t in3_bytes = in3.size() * sizeof(float);
126 std::size_t out_bytes = out.size() * sizeof(float);
127
128 float* d_in1;
129 float* d_in2;
130 float* d_in3;
131 float* d_out;
132 cudaMalloc((void**)(&d_in1), in1_bytes);
133 cudaMalloc((void**)(&d_in2), in2_bytes);
134 cudaMalloc((void**)(&d_in3), in3_bytes);
135 cudaMalloc((void**)(&d_out), out_bytes);
136
137 cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
138 cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
139 cudaMemcpy(d_in3, in3.data(), in3_bytes, cudaMemcpyHostToDevice);
140
141 Eigen::CudaStreamDevice stream;
142 Eigen::GpuDevice gpu_device(&stream);
143
144 Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
145 Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
146 Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
147 Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
148
149 gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3;
150
151 assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
152 assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
153
154 for (int i = 0; i < 72; ++i) {
155 for (int j = 0; j < 53; ++j) {
156 for (int k = 0; k < 97; ++k) {
157 VERIFY_IS_APPROX(out(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)), in1(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) + in2(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) * in3(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)));
158 }
159 }
160 }
161
162 cudaFree(d_in1);
163 cudaFree(d_in2);
164 cudaFree(d_in3);
165 cudaFree(d_out);
166 }
167
test_cuda_props()168 void test_cuda_props() {
169 Tensor<float, 1> in1(200);
170 Tensor<bool, 1> out(200);
171 in1.setRandom();
172
173 std::size_t in1_bytes = in1.size() * sizeof(float);
174 std::size_t out_bytes = out.size() * sizeof(bool);
175
176 float* d_in1;
177 bool* d_out;
178 cudaMalloc((void**)(&d_in1), in1_bytes);
179 cudaMalloc((void**)(&d_out), out_bytes);
180
181 cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
182
183 Eigen::CudaStreamDevice stream;
184 Eigen::GpuDevice gpu_device(&stream);
185
186 Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
187 d_in1, 200);
188 Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_out(
189 d_out, 200);
190
191 gpu_out.device(gpu_device) = (gpu_in1.isnan)();
192
193 assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost,
194 gpu_device.stream()) == cudaSuccess);
195 assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
196
197 for (int i = 0; i < 200; ++i) {
198 VERIFY_IS_EQUAL(out(i), (std::isnan)(in1(i)));
199 }
200
201 cudaFree(d_in1);
202 cudaFree(d_out);
203 }
204
test_cuda_reduction()205 void test_cuda_reduction()
206 {
207 Tensor<float, 4> in1(72,53,97,113);
208 Tensor<float, 2> out(72,97);
209 in1.setRandom();
210
211 std::size_t in1_bytes = in1.size() * sizeof(float);
212 std::size_t out_bytes = out.size() * sizeof(float);
213
214 float* d_in1;
215 float* d_out;
216 cudaMalloc((void**)(&d_in1), in1_bytes);
217 cudaMalloc((void**)(&d_out), out_bytes);
218
219 cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
220
221 Eigen::CudaStreamDevice stream;
222 Eigen::GpuDevice gpu_device(&stream);
223
224 Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113);
225 Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
226
227 array<Eigen::DenseIndex, 2> reduction_axis;
228 reduction_axis[0] = 1;
229 reduction_axis[1] = 3;
230
231 gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis);
232
233 assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
234 assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
235
236 for (int i = 0; i < 72; ++i) {
237 for (int j = 0; j < 97; ++j) {
238 float expected = 0;
239 for (int k = 0; k < 53; ++k) {
240 for (int l = 0; l < 113; ++l) {
241 expected =
242 std::max<float>(expected, in1(i, k, j, l));
243 }
244 }
245 VERIFY_IS_APPROX(out(i,j), expected);
246 }
247 }
248
249 cudaFree(d_in1);
250 cudaFree(d_out);
251 }
252
253 template<int DataLayout>
test_cuda_contraction()254 void test_cuda_contraction()
255 {
256 // with these dimensions, the output has 300 * 140 elements, which is
257 // more than 30 * 1024, which is the number of threads in blocks on
258 // a 15 SM GK110 GPU
259 Tensor<float, 4, DataLayout> t_left(6, 50, 3, 31);
260 Tensor<float, 5, DataLayout> t_right(Eigen::array<Eigen::DenseIndex, 5>(3, 31, 7, 20, 1));
261 Tensor<float, 5, DataLayout> t_result(Eigen::array<Eigen::DenseIndex, 5>(6, 50, 7, 20, 1));
262
263 t_left.setRandom();
264 t_right.setRandom();
265
266 std::size_t t_left_bytes = t_left.size() * sizeof(float);
267 std::size_t t_right_bytes = t_right.size() * sizeof(float);
268 std::size_t t_result_bytes = t_result.size() * sizeof(float);
269
270 float* d_t_left;
271 float* d_t_right;
272 float* d_t_result;
273
274 cudaMalloc((void**)(&d_t_left), t_left_bytes);
275 cudaMalloc((void**)(&d_t_right), t_right_bytes);
276 cudaMalloc((void**)(&d_t_result), t_result_bytes);
277
278 cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
279 cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);
280
281 Eigen::CudaStreamDevice stream;
282 Eigen::GpuDevice gpu_device(&stream);
283
284 Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31);
285 Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_right(d_t_right, 3, 31, 7, 20, 1);
286 Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_result(d_t_result, 6, 50, 7, 20, 1);
287
288 typedef Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> > MapXf;
289 MapXf m_left(t_left.data(), 300, 93);
290 MapXf m_right(t_right.data(), 93, 140);
291 Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(300, 140);
292
293 typedef Tensor<float, 1>::DimensionPair DimPair;
294 Eigen::array<DimPair, 2> dims;
295 dims[0] = DimPair(2, 0);
296 dims[1] = DimPair(3, 1);
297
298 m_result = m_left * m_right;
299 gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
300
301 cudaMemcpy(t_result.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
302
303 for (DenseIndex i = 0; i < t_result.size(); i++) {
304 if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
305 std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl;
306 assert(false);
307 }
308 }
309
310 cudaFree(d_t_left);
311 cudaFree(d_t_right);
312 cudaFree(d_t_result);
313 }
314
315 template<int DataLayout>
test_cuda_convolution_1d()316 void test_cuda_convolution_1d()
317 {
318 Tensor<float, 4, DataLayout> input(74,37,11,137);
319 Tensor<float, 1, DataLayout> kernel(4);
320 Tensor<float, 4, DataLayout> out(74,34,11,137);
321 input = input.constant(10.0f) + input.random();
322 kernel = kernel.constant(7.0f) + kernel.random();
323
324 std::size_t input_bytes = input.size() * sizeof(float);
325 std::size_t kernel_bytes = kernel.size() * sizeof(float);
326 std::size_t out_bytes = out.size() * sizeof(float);
327
328 float* d_input;
329 float* d_kernel;
330 float* d_out;
331 cudaMalloc((void**)(&d_input), input_bytes);
332 cudaMalloc((void**)(&d_kernel), kernel_bytes);
333 cudaMalloc((void**)(&d_out), out_bytes);
334
335 cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
336 cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
337
338 Eigen::CudaStreamDevice stream;
339 Eigen::GpuDevice gpu_device(&stream);
340
341 Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137);
342 Eigen::TensorMap<Eigen::Tensor<float, 1, DataLayout> > gpu_kernel(d_kernel, 4);
343 Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out, 74,34,11,137);
344
345 Eigen::array<Eigen::DenseIndex, 1> dims(1);
346 gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
347
348 assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
349 assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
350
351 for (int i = 0; i < 74; ++i) {
352 for (int j = 0; j < 34; ++j) {
353 for (int k = 0; k < 11; ++k) {
354 for (int l = 0; l < 137; ++l) {
355 const float result = out(i,j,k,l);
356 const float expected = input(i,j+0,k,l) * kernel(0) + input(i,j+1,k,l) * kernel(1) +
357 input(i,j+2,k,l) * kernel(2) + input(i,j+3,k,l) * kernel(3);
358 VERIFY_IS_APPROX(result, expected);
359 }
360 }
361 }
362 }
363
364 cudaFree(d_input);
365 cudaFree(d_kernel);
366 cudaFree(d_out);
367 }
368
test_cuda_convolution_inner_dim_col_major_1d()369 void test_cuda_convolution_inner_dim_col_major_1d()
370 {
371 Tensor<float, 4, ColMajor> input(74,9,11,7);
372 Tensor<float, 1, ColMajor> kernel(4);
373 Tensor<float, 4, ColMajor> out(71,9,11,7);
374 input = input.constant(10.0f) + input.random();
375 kernel = kernel.constant(7.0f) + kernel.random();
376
377 std::size_t input_bytes = input.size() * sizeof(float);
378 std::size_t kernel_bytes = kernel.size() * sizeof(float);
379 std::size_t out_bytes = out.size() * sizeof(float);
380
381 float* d_input;
382 float* d_kernel;
383 float* d_out;
384 cudaMalloc((void**)(&d_input), input_bytes);
385 cudaMalloc((void**)(&d_kernel), kernel_bytes);
386 cudaMalloc((void**)(&d_out), out_bytes);
387
388 cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
389 cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
390
391 Eigen::CudaStreamDevice stream;
392 Eigen::GpuDevice gpu_device(&stream);
393
394 Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7);
395 Eigen::TensorMap<Eigen::Tensor<float, 1, ColMajor> > gpu_kernel(d_kernel,4);
396 Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_out(d_out,71,9,11,7);
397
398 Eigen::array<Eigen::DenseIndex, 1> dims(0);
399 gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
400
401 assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
402 assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
403
404 for (int i = 0; i < 71; ++i) {
405 for (int j = 0; j < 9; ++j) {
406 for (int k = 0; k < 11; ++k) {
407 for (int l = 0; l < 7; ++l) {
408 const float result = out(i,j,k,l);
409 const float expected = input(i+0,j,k,l) * kernel(0) + input(i+1,j,k,l) * kernel(1) +
410 input(i+2,j,k,l) * kernel(2) + input(i+3,j,k,l) * kernel(3);
411 VERIFY_IS_APPROX(result, expected);
412 }
413 }
414 }
415 }
416
417 cudaFree(d_input);
418 cudaFree(d_kernel);
419 cudaFree(d_out);
420 }
421
test_cuda_convolution_inner_dim_row_major_1d()422 void test_cuda_convolution_inner_dim_row_major_1d()
423 {
424 Tensor<float, 4, RowMajor> input(7,9,11,74);
425 Tensor<float, 1, RowMajor> kernel(4);
426 Tensor<float, 4, RowMajor> out(7,9,11,71);
427 input = input.constant(10.0f) + input.random();
428 kernel = kernel.constant(7.0f) + kernel.random();
429
430 std::size_t input_bytes = input.size() * sizeof(float);
431 std::size_t kernel_bytes = kernel.size() * sizeof(float);
432 std::size_t out_bytes = out.size() * sizeof(float);
433
434 float* d_input;
435 float* d_kernel;
436 float* d_out;
437 cudaMalloc((void**)(&d_input), input_bytes);
438 cudaMalloc((void**)(&d_kernel), kernel_bytes);
439 cudaMalloc((void**)(&d_out), out_bytes);
440
441 cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
442 cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
443
444 Eigen::CudaStreamDevice stream;
445 Eigen::GpuDevice gpu_device(&stream);
446
447 Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74);
448 Eigen::TensorMap<Eigen::Tensor<float, 1, RowMajor> > gpu_kernel(d_kernel, 4);
449 Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_out(d_out, 7,9,11,71);
450
451 Eigen::array<Eigen::DenseIndex, 1> dims(3);
452 gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
453
454 assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
455 assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
456
457 for (int i = 0; i < 7; ++i) {
458 for (int j = 0; j < 9; ++j) {
459 for (int k = 0; k < 11; ++k) {
460 for (int l = 0; l < 71; ++l) {
461 const float result = out(i,j,k,l);
462 const float expected = input(i,j,k,l+0) * kernel(0) + input(i,j,k,l+1) * kernel(1) +
463 input(i,j,k,l+2) * kernel(2) + input(i,j,k,l+3) * kernel(3);
464 VERIFY_IS_APPROX(result, expected);
465 }
466 }
467 }
468 }
469
470 cudaFree(d_input);
471 cudaFree(d_kernel);
472 cudaFree(d_out);
473 }
474
475 template<int DataLayout>
test_cuda_convolution_2d()476 void test_cuda_convolution_2d()
477 {
478 Tensor<float, 4, DataLayout> input(74,37,11,137);
479 Tensor<float, 2, DataLayout> kernel(3,4);
480 Tensor<float, 4, DataLayout> out(74,35,8,137);
481 input = input.constant(10.0f) + input.random();
482 kernel = kernel.constant(7.0f) + kernel.random();
483
484 std::size_t input_bytes = input.size() * sizeof(float);
485 std::size_t kernel_bytes = kernel.size() * sizeof(float);
486 std::size_t out_bytes = out.size() * sizeof(float);
487
488 float* d_input;
489 float* d_kernel;
490 float* d_out;
491 cudaMalloc((void**)(&d_input), input_bytes);
492 cudaMalloc((void**)(&d_kernel), kernel_bytes);
493 cudaMalloc((void**)(&d_out), out_bytes);
494
495 cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
496 cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
497
498 Eigen::CudaStreamDevice stream;
499 Eigen::GpuDevice gpu_device(&stream);
500
501 Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137);
502 Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_kernel(d_kernel,3,4);
503 Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out,74,35,8,137);
504
505 Eigen::array<Eigen::DenseIndex, 2> dims(1,2);
506 gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
507
508 assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
509 assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
510
511 for (int i = 0; i < 74; ++i) {
512 for (int j = 0; j < 35; ++j) {
513 for (int k = 0; k < 8; ++k) {
514 for (int l = 0; l < 137; ++l) {
515 const float result = out(i,j,k,l);
516 const float expected = input(i,j+0,k+0,l) * kernel(0,0) +
517 input(i,j+1,k+0,l) * kernel(1,0) +
518 input(i,j+2,k+0,l) * kernel(2,0) +
519 input(i,j+0,k+1,l) * kernel(0,1) +
520 input(i,j+1,k+1,l) * kernel(1,1) +
521 input(i,j+2,k+1,l) * kernel(2,1) +
522 input(i,j+0,k+2,l) * kernel(0,2) +
523 input(i,j+1,k+2,l) * kernel(1,2) +
524 input(i,j+2,k+2,l) * kernel(2,2) +
525 input(i,j+0,k+3,l) * kernel(0,3) +
526 input(i,j+1,k+3,l) * kernel(1,3) +
527 input(i,j+2,k+3,l) * kernel(2,3);
528 VERIFY_IS_APPROX(result, expected);
529 }
530 }
531 }
532 }
533
534 cudaFree(d_input);
535 cudaFree(d_kernel);
536 cudaFree(d_out);
537 }
538
539 template<int DataLayout>
test_cuda_convolution_3d()540 void test_cuda_convolution_3d()
541 {
542 Tensor<float, 5, DataLayout> input(Eigen::array<Eigen::DenseIndex, 5>(74,37,11,137,17));
543 Tensor<float, 3, DataLayout> kernel(3,4,2);
544 Tensor<float, 5, DataLayout> out(Eigen::array<Eigen::DenseIndex, 5>(74,35,8,136,17));
545 input = input.constant(10.0f) + input.random();
546 kernel = kernel.constant(7.0f) + kernel.random();
547
548 std::size_t input_bytes = input.size() * sizeof(float);
549 std::size_t kernel_bytes = kernel.size() * sizeof(float);
550 std::size_t out_bytes = out.size() * sizeof(float);
551
552 float* d_input;
553 float* d_kernel;
554 float* d_out;
555 cudaMalloc((void**)(&d_input), input_bytes);
556 cudaMalloc((void**)(&d_kernel), kernel_bytes);
557 cudaMalloc((void**)(&d_out), out_bytes);
558
559 cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
560 cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
561
562 Eigen::CudaStreamDevice stream;
563 Eigen::GpuDevice gpu_device(&stream);
564
565 Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17);
566 Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > gpu_kernel(d_kernel,3,4,2);
567 Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_out(d_out,74,35,8,136,17);
568
569 Eigen::array<Eigen::DenseIndex, 3> dims(1,2,3);
570 gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
571
572 assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
573 assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
574
575 for (int i = 0; i < 74; ++i) {
576 for (int j = 0; j < 35; ++j) {
577 for (int k = 0; k < 8; ++k) {
578 for (int l = 0; l < 136; ++l) {
579 for (int m = 0; m < 17; ++m) {
580 const float result = out(i,j,k,l,m);
581 const float expected = input(i,j+0,k+0,l+0,m) * kernel(0,0,0) +
582 input(i,j+1,k+0,l+0,m) * kernel(1,0,0) +
583 input(i,j+2,k+0,l+0,m) * kernel(2,0,0) +
584 input(i,j+0,k+1,l+0,m) * kernel(0,1,0) +
585 input(i,j+1,k+1,l+0,m) * kernel(1,1,0) +
586 input(i,j+2,k+1,l+0,m) * kernel(2,1,0) +
587 input(i,j+0,k+2,l+0,m) * kernel(0,2,0) +
588 input(i,j+1,k+2,l+0,m) * kernel(1,2,0) +
589 input(i,j+2,k+2,l+0,m) * kernel(2,2,0) +
590 input(i,j+0,k+3,l+0,m) * kernel(0,3,0) +
591 input(i,j+1,k+3,l+0,m) * kernel(1,3,0) +
592 input(i,j+2,k+3,l+0,m) * kernel(2,3,0) +
593 input(i,j+0,k+0,l+1,m) * kernel(0,0,1) +
594 input(i,j+1,k+0,l+1,m) * kernel(1,0,1) +
595 input(i,j+2,k+0,l+1,m) * kernel(2,0,1) +
596 input(i,j+0,k+1,l+1,m) * kernel(0,1,1) +
597 input(i,j+1,k+1,l+1,m) * kernel(1,1,1) +
598 input(i,j+2,k+1,l+1,m) * kernel(2,1,1) +
599 input(i,j+0,k+2,l+1,m) * kernel(0,2,1) +
600 input(i,j+1,k+2,l+1,m) * kernel(1,2,1) +
601 input(i,j+2,k+2,l+1,m) * kernel(2,2,1) +
602 input(i,j+0,k+3,l+1,m) * kernel(0,3,1) +
603 input(i,j+1,k+3,l+1,m) * kernel(1,3,1) +
604 input(i,j+2,k+3,l+1,m) * kernel(2,3,1);
605 VERIFY_IS_APPROX(result, expected);
606 }
607 }
608 }
609 }
610 }
611
612 cudaFree(d_input);
613 cudaFree(d_kernel);
614 cudaFree(d_out);
615 }
616
617
618 template <typename Scalar>
test_cuda_lgamma(const Scalar stddev)619 void test_cuda_lgamma(const Scalar stddev)
620 {
621 Tensor<Scalar, 2> in(72,97);
622 in.setRandom();
623 in *= in.constant(stddev);
624 Tensor<Scalar, 2> out(72,97);
625 out.setZero();
626
627 std::size_t bytes = in.size() * sizeof(Scalar);
628
629 Scalar* d_in;
630 Scalar* d_out;
631 cudaMalloc((void**)(&d_in), bytes);
632 cudaMalloc((void**)(&d_out), bytes);
633
634 cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
635
636 Eigen::CudaStreamDevice stream;
637 Eigen::GpuDevice gpu_device(&stream);
638
639 Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
640 Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
641
642 gpu_out.device(gpu_device) = gpu_in.lgamma();
643
644 assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
645 assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
646
647 for (int i = 0; i < 72; ++i) {
648 for (int j = 0; j < 97; ++j) {
649 VERIFY_IS_APPROX(out(i,j), (std::lgamma)(in(i,j)));
650 }
651 }
652
653 cudaFree(d_in);
654 cudaFree(d_out);
655 }
656
657 template <typename Scalar>
test_cuda_digamma()658 void test_cuda_digamma()
659 {
660 Tensor<Scalar, 1> in(7);
661 Tensor<Scalar, 1> out(7);
662 Tensor<Scalar, 1> expected_out(7);
663 out.setZero();
664
665 in(0) = Scalar(1);
666 in(1) = Scalar(1.5);
667 in(2) = Scalar(4);
668 in(3) = Scalar(-10.5);
669 in(4) = Scalar(10000.5);
670 in(5) = Scalar(0);
671 in(6) = Scalar(-1);
672
673 expected_out(0) = Scalar(-0.5772156649015329);
674 expected_out(1) = Scalar(0.03648997397857645);
675 expected_out(2) = Scalar(1.2561176684318);
676 expected_out(3) = Scalar(2.398239129535781);
677 expected_out(4) = Scalar(9.210340372392849);
678 expected_out(5) = std::numeric_limits<Scalar>::infinity();
679 expected_out(6) = std::numeric_limits<Scalar>::infinity();
680
681 std::size_t bytes = in.size() * sizeof(Scalar);
682
683 Scalar* d_in;
684 Scalar* d_out;
685 cudaMalloc((void**)(&d_in), bytes);
686 cudaMalloc((void**)(&d_out), bytes);
687
688 cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
689
690 Eigen::CudaStreamDevice stream;
691 Eigen::GpuDevice gpu_device(&stream);
692
693 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 7);
694 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);
695
696 gpu_out.device(gpu_device) = gpu_in.digamma();
697
698 assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
699 assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
700
701 for (int i = 0; i < 5; ++i) {
702 VERIFY_IS_APPROX(out(i), expected_out(i));
703 }
704 for (int i = 5; i < 7; ++i) {
705 VERIFY_IS_EQUAL(out(i), expected_out(i));
706 }
707
708 cudaFree(d_in);
709 cudaFree(d_out);
710 }
711
712 template <typename Scalar>
test_cuda_zeta()713 void test_cuda_zeta()
714 {
715 Tensor<Scalar, 1> in_x(6);
716 Tensor<Scalar, 1> in_q(6);
717 Tensor<Scalar, 1> out(6);
718 Tensor<Scalar, 1> expected_out(6);
719 out.setZero();
720
721 in_x(0) = Scalar(1);
722 in_x(1) = Scalar(1.5);
723 in_x(2) = Scalar(4);
724 in_x(3) = Scalar(-10.5);
725 in_x(4) = Scalar(10000.5);
726 in_x(5) = Scalar(3);
727
728 in_q(0) = Scalar(1.2345);
729 in_q(1) = Scalar(2);
730 in_q(2) = Scalar(1.5);
731 in_q(3) = Scalar(3);
732 in_q(4) = Scalar(1.0001);
733 in_q(5) = Scalar(-2.5);
734
735 expected_out(0) = std::numeric_limits<Scalar>::infinity();
736 expected_out(1) = Scalar(1.61237534869);
737 expected_out(2) = Scalar(0.234848505667);
738 expected_out(3) = Scalar(1.03086757337e-5);
739 expected_out(4) = Scalar(0.367879440865);
740 expected_out(5) = Scalar(0.054102025820864097);
741
742 std::size_t bytes = in_x.size() * sizeof(Scalar);
743
744 Scalar* d_in_x;
745 Scalar* d_in_q;
746 Scalar* d_out;
747 cudaMalloc((void**)(&d_in_x), bytes);
748 cudaMalloc((void**)(&d_in_q), bytes);
749 cudaMalloc((void**)(&d_out), bytes);
750
751 cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
752 cudaMemcpy(d_in_q, in_q.data(), bytes, cudaMemcpyHostToDevice);
753
754 Eigen::CudaStreamDevice stream;
755 Eigen::GpuDevice gpu_device(&stream);
756
757 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6);
758 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_q(d_in_q, 6);
759 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6);
760
761 gpu_out.device(gpu_device) = gpu_in_x.zeta(gpu_in_q);
762
763 assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
764 assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
765
766 VERIFY_IS_EQUAL(out(0), expected_out(0));
767 VERIFY((std::isnan)(out(3)));
768
769 for (int i = 1; i < 6; ++i) {
770 if (i != 3) {
771 VERIFY_IS_APPROX(out(i), expected_out(i));
772 }
773 }
774
775 cudaFree(d_in_x);
776 cudaFree(d_in_q);
777 cudaFree(d_out);
778 }
779
780 template <typename Scalar>
test_cuda_polygamma()781 void test_cuda_polygamma()
782 {
783 Tensor<Scalar, 1> in_x(7);
784 Tensor<Scalar, 1> in_n(7);
785 Tensor<Scalar, 1> out(7);
786 Tensor<Scalar, 1> expected_out(7);
787 out.setZero();
788
789 in_n(0) = Scalar(1);
790 in_n(1) = Scalar(1);
791 in_n(2) = Scalar(1);
792 in_n(3) = Scalar(17);
793 in_n(4) = Scalar(31);
794 in_n(5) = Scalar(28);
795 in_n(6) = Scalar(8);
796
797 in_x(0) = Scalar(2);
798 in_x(1) = Scalar(3);
799 in_x(2) = Scalar(25.5);
800 in_x(3) = Scalar(4.7);
801 in_x(4) = Scalar(11.8);
802 in_x(5) = Scalar(17.7);
803 in_x(6) = Scalar(30.2);
804
805 expected_out(0) = Scalar(0.644934066848);
806 expected_out(1) = Scalar(0.394934066848);
807 expected_out(2) = Scalar(0.0399946696496);
808 expected_out(3) = Scalar(293.334565435);
809 expected_out(4) = Scalar(0.445487887616);
810 expected_out(5) = Scalar(-2.47810300902e-07);
811 expected_out(6) = Scalar(-8.29668781082e-09);
812
813 std::size_t bytes = in_x.size() * sizeof(Scalar);
814
815 Scalar* d_in_x;
816 Scalar* d_in_n;
817 Scalar* d_out;
818 cudaMalloc((void**)(&d_in_x), bytes);
819 cudaMalloc((void**)(&d_in_n), bytes);
820 cudaMalloc((void**)(&d_out), bytes);
821
822 cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
823 cudaMemcpy(d_in_n, in_n.data(), bytes, cudaMemcpyHostToDevice);
824
825 Eigen::CudaStreamDevice stream;
826 Eigen::GpuDevice gpu_device(&stream);
827
828 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 7);
829 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_n(d_in_n, 7);
830 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);
831
832 gpu_out.device(gpu_device) = gpu_in_n.polygamma(gpu_in_x);
833
834 assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
835 assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
836
837 for (int i = 0; i < 7; ++i) {
838 VERIFY_IS_APPROX(out(i), expected_out(i));
839 }
840
841 cudaFree(d_in_x);
842 cudaFree(d_in_n);
843 cudaFree(d_out);
844 }
845
846 template <typename Scalar>
test_cuda_igamma()847 void test_cuda_igamma()
848 {
849 Tensor<Scalar, 2> a(6, 6);
850 Tensor<Scalar, 2> x(6, 6);
851 Tensor<Scalar, 2> out(6, 6);
852 out.setZero();
853
854 Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
855 Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
856
857 for (int i = 0; i < 6; ++i) {
858 for (int j = 0; j < 6; ++j) {
859 a(i, j) = a_s[i];
860 x(i, j) = x_s[j];
861 }
862 }
863
864 Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
865 Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan},
866 {0.0, 0.6321205588285578, 0.7768698398515702,
867 0.9816843611112658, 9.999500016666262e-05, 1.0},
868 {0.0, 0.4275932955291202, 0.608374823728911,
869 0.9539882943107686, 7.522076445089201e-07, 1.0},
870 {0.0, 0.01898815687615381, 0.06564245437845008,
871 0.5665298796332909, 4.166333347221828e-18, 1.0},
872 {0.0, 0.9999780593618628, 0.9999899967080838,
873 0.9999996219837988, 0.9991370418689945, 1.0},
874 {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};
875
876
877
878 std::size_t bytes = a.size() * sizeof(Scalar);
879
880 Scalar* d_a;
881 Scalar* d_x;
882 Scalar* d_out;
883 assert(cudaMalloc((void**)(&d_a), bytes) == cudaSuccess);
884 assert(cudaMalloc((void**)(&d_x), bytes) == cudaSuccess);
885 assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess);
886
887 cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice);
888 cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice);
889
890 Eigen::CudaStreamDevice stream;
891 Eigen::GpuDevice gpu_device(&stream);
892
893 Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
894 Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
895 Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);
896
897 gpu_out.device(gpu_device) = gpu_a.igamma(gpu_x);
898
899 assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
900 assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
901
902 for (int i = 0; i < 6; ++i) {
903 for (int j = 0; j < 6; ++j) {
904 if ((std::isnan)(igamma_s[i][j])) {
905 VERIFY((std::isnan)(out(i, j)));
906 } else {
907 VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]);
908 }
909 }
910 }
911
912 cudaFree(d_a);
913 cudaFree(d_x);
914 cudaFree(d_out);
915 }
916
917 template <typename Scalar>
test_cuda_igammac()918 void test_cuda_igammac()
919 {
920 Tensor<Scalar, 2> a(6, 6);
921 Tensor<Scalar, 2> x(6, 6);
922 Tensor<Scalar, 2> out(6, 6);
923 out.setZero();
924
925 Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
926 Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
927
928 for (int i = 0; i < 6; ++i) {
929 for (int j = 0; j < 6; ++j) {
930 a(i, j) = a_s[i];
931 x(i, j) = x_s[j];
932 }
933 }
934
935 Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
936 Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan},
937 {1.0, 0.36787944117144233, 0.22313016014842982,
938 0.018315638888734182, 0.9999000049998333, 0.0},
939 {1.0, 0.5724067044708798, 0.3916251762710878,
940 0.04601170568923136, 0.9999992477923555, 0.0},
941 {1.0, 0.9810118431238462, 0.9343575456215499,
942 0.4334701203667089, 1.0, 0.0},
943 {1.0, 2.1940638138146658e-05, 1.0003291916285e-05,
944 3.7801620118431334e-07, 0.0008629581310054535,
945 0.0},
946 {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};
947
948 std::size_t bytes = a.size() * sizeof(Scalar);
949
950 Scalar* d_a;
951 Scalar* d_x;
952 Scalar* d_out;
953 cudaMalloc((void**)(&d_a), bytes);
954 cudaMalloc((void**)(&d_x), bytes);
955 cudaMalloc((void**)(&d_out), bytes);
956
957 cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice);
958 cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice);
959
960 Eigen::CudaStreamDevice stream;
961 Eigen::GpuDevice gpu_device(&stream);
962
963 Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
964 Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
965 Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);
966
967 gpu_out.device(gpu_device) = gpu_a.igammac(gpu_x);
968
969 assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
970 assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
971
972 for (int i = 0; i < 6; ++i) {
973 for (int j = 0; j < 6; ++j) {
974 if ((std::isnan)(igammac_s[i][j])) {
975 VERIFY((std::isnan)(out(i, j)));
976 } else {
977 VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]);
978 }
979 }
980 }
981
982 cudaFree(d_a);
983 cudaFree(d_x);
984 cudaFree(d_out);
985 }
986
987 template <typename Scalar>
test_cuda_erf(const Scalar stddev)988 void test_cuda_erf(const Scalar stddev)
989 {
990 Tensor<Scalar, 2> in(72,97);
991 in.setRandom();
992 in *= in.constant(stddev);
993 Tensor<Scalar, 2> out(72,97);
994 out.setZero();
995
996 std::size_t bytes = in.size() * sizeof(Scalar);
997
998 Scalar* d_in;
999 Scalar* d_out;
1000 assert(cudaMalloc((void**)(&d_in), bytes) == cudaSuccess);
1001 assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess);
1002
1003 cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
1004
1005 Eigen::CudaStreamDevice stream;
1006 Eigen::GpuDevice gpu_device(&stream);
1007
1008 Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
1009 Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
1010
1011 gpu_out.device(gpu_device) = gpu_in.erf();
1012
1013 assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
1014 assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
1015
1016 for (int i = 0; i < 72; ++i) {
1017 for (int j = 0; j < 97; ++j) {
1018 VERIFY_IS_APPROX(out(i,j), (std::erf)(in(i,j)));
1019 }
1020 }
1021
1022 cudaFree(d_in);
1023 cudaFree(d_out);
1024 }
1025
1026 template <typename Scalar>
test_cuda_erfc(const Scalar stddev)1027 void test_cuda_erfc(const Scalar stddev)
1028 {
1029 Tensor<Scalar, 2> in(72,97);
1030 in.setRandom();
1031 in *= in.constant(stddev);
1032 Tensor<Scalar, 2> out(72,97);
1033 out.setZero();
1034
1035 std::size_t bytes = in.size() * sizeof(Scalar);
1036
1037 Scalar* d_in;
1038 Scalar* d_out;
1039 cudaMalloc((void**)(&d_in), bytes);
1040 cudaMalloc((void**)(&d_out), bytes);
1041
1042 cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
1043
1044 Eigen::CudaStreamDevice stream;
1045 Eigen::GpuDevice gpu_device(&stream);
1046
1047 Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
1048 Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
1049
1050 gpu_out.device(gpu_device) = gpu_in.erfc();
1051
1052 assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
1053 assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
1054
1055 for (int i = 0; i < 72; ++i) {
1056 for (int j = 0; j < 97; ++j) {
1057 VERIFY_IS_APPROX(out(i,j), (std::erfc)(in(i,j)));
1058 }
1059 }
1060
1061 cudaFree(d_in);
1062 cudaFree(d_out);
1063 }
1064
1065 template <typename Scalar>
test_cuda_betainc()1066 void test_cuda_betainc()
1067 {
1068 Tensor<Scalar, 1> in_x(125);
1069 Tensor<Scalar, 1> in_a(125);
1070 Tensor<Scalar, 1> in_b(125);
1071 Tensor<Scalar, 1> out(125);
1072 Tensor<Scalar, 1> expected_out(125);
1073 out.setZero();
1074
1075 Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
1076
1077 Array<Scalar, 1, Dynamic> x(125);
1078 Array<Scalar, 1, Dynamic> a(125);
1079 Array<Scalar, 1, Dynamic> b(125);
1080 Array<Scalar, 1, Dynamic> v(125);
1081
1082 a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
1083 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
1084 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
1085 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
1086 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
1087 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
1088 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
1089 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
1090 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
1091 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
1092 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
1093 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
1094 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
1095 31.62177660168379, 31.62177660168379, 31.62177660168379,
1096 31.62177660168379, 31.62177660168379, 31.62177660168379,
1097 31.62177660168379, 31.62177660168379, 31.62177660168379,
1098 31.62177660168379, 31.62177660168379, 31.62177660168379,
1099 31.62177660168379, 31.62177660168379, 31.62177660168379,
1100 31.62177660168379, 31.62177660168379, 31.62177660168379,
1101 31.62177660168379, 31.62177660168379, 31.62177660168379,
1102 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
1103 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
1104 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
1105 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999;
1106
1107 b << 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
1108 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
1109 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
1110 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
1111 999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
1112 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
1113 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
1114 31.62177660168379, 31.62177660168379, 31.62177660168379,
1115 31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
1116 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
1117 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
1118 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
1119 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
1120 999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
1121 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
1122 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
1123 31.62177660168379, 31.62177660168379, 31.62177660168379,
1124 31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
1125 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
1126 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
1127 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
1128 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
1129 999.999, 999.999, 999.999;
1130
1131 x << -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
1132 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
1133 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
1134 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1,
1135 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1,
1136 -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
1137 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
1138 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
1139 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1;
1140
1141 v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
1142 nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
1143 nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan,
1144 0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan,
1145 0.999995949033062, 0.9999999999993698, 0.9999999999999999, nan, nan,
1146 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan,
1147 nan, nan, nan, nan, 0.006827081192655869, 0.0210336989586256,
1148 0.04813160422599567, nan, nan, 0.20014344256217678, 0.5000000000000001,
1149 0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403,
1150 0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999,
1151 0.9999999999999999, nan, nan, nan, nan, nan, nan, nan,
1152 1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, nan,
1153 nan, 7.864342668429763e-23, 3.015969667594166e-10, 0.0008598571564165444,
1154 nan, nan, 6.031987710123844e-08, 0.5000000000000007, 0.9999999396801229,
1155 nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan,
1156 nan, nan, nan, nan, nan, nan, 0.0, 7.029920380986636e-306,
1157 2.2450728208591345e-101, nan, nan, 0.0, 9.275871147869727e-302,
1158 1.2232913026152827e-97, nan, nan, 0.0, 3.0891393081932924e-252,
1159 2.9303043666183996e-60, nan, nan, 2.248913486879199e-196,
1160 0.5000000000004947, 0.9999999999999999, nan;
1161
1162 for (int i = 0; i < 125; ++i) {
1163 in_x(i) = x(i);
1164 in_a(i) = a(i);
1165 in_b(i) = b(i);
1166 expected_out(i) = v(i);
1167 }
1168
1169 std::size_t bytes = in_x.size() * sizeof(Scalar);
1170
1171 Scalar* d_in_x;
1172 Scalar* d_in_a;
1173 Scalar* d_in_b;
1174 Scalar* d_out;
1175 cudaMalloc((void**)(&d_in_x), bytes);
1176 cudaMalloc((void**)(&d_in_a), bytes);
1177 cudaMalloc((void**)(&d_in_b), bytes);
1178 cudaMalloc((void**)(&d_out), bytes);
1179
1180 cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
1181 cudaMemcpy(d_in_a, in_a.data(), bytes, cudaMemcpyHostToDevice);
1182 cudaMemcpy(d_in_b, in_b.data(), bytes, cudaMemcpyHostToDevice);
1183
1184 Eigen::CudaStreamDevice stream;
1185 Eigen::GpuDevice gpu_device(&stream);
1186
1187 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 125);
1188 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_a(d_in_a, 125);
1189 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_b(d_in_b, 125);
1190 Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 125);
1191
1192 gpu_out.device(gpu_device) = betainc(gpu_in_a, gpu_in_b, gpu_in_x);
1193
1194 assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
1195 assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
1196
1197 for (int i = 1; i < 125; ++i) {
1198 if ((std::isnan)(expected_out(i))) {
1199 VERIFY((std::isnan)(out(i)));
1200 } else {
1201 VERIFY_IS_APPROX(out(i), expected_out(i));
1202 }
1203 }
1204
1205 cudaFree(d_in_x);
1206 cudaFree(d_in_a);
1207 cudaFree(d_in_b);
1208 cudaFree(d_out);
1209 }
1210
1211
test_cxx11_tensor_cuda()1212 void test_cxx11_tensor_cuda()
1213 {
1214 CALL_SUBTEST_1(test_cuda_nullary());
1215 CALL_SUBTEST_1(test_cuda_elementwise_small());
1216 CALL_SUBTEST_1(test_cuda_elementwise());
1217 CALL_SUBTEST_1(test_cuda_props());
1218 CALL_SUBTEST_1(test_cuda_reduction());
1219 CALL_SUBTEST_2(test_cuda_contraction<ColMajor>());
1220 CALL_SUBTEST_2(test_cuda_contraction<RowMajor>());
1221 CALL_SUBTEST_3(test_cuda_convolution_1d<ColMajor>());
1222 CALL_SUBTEST_3(test_cuda_convolution_1d<RowMajor>());
1223 CALL_SUBTEST_3(test_cuda_convolution_inner_dim_col_major_1d());
1224 CALL_SUBTEST_3(test_cuda_convolution_inner_dim_row_major_1d());
1225 CALL_SUBTEST_3(test_cuda_convolution_2d<ColMajor>());
1226 CALL_SUBTEST_3(test_cuda_convolution_2d<RowMajor>());
1227 CALL_SUBTEST_3(test_cuda_convolution_3d<ColMajor>());
1228 CALL_SUBTEST_3(test_cuda_convolution_3d<RowMajor>());
1229
1230 #if __cplusplus > 199711L
1231 // std::erf, std::erfc, and so on where only added in c++11. We use them
1232 // as a golden reference to validate the results produced by Eigen. Therefore
1233 // we can only run these tests if we use a c++11 compiler.
1234 CALL_SUBTEST_4(test_cuda_lgamma<float>(1.0f));
1235 CALL_SUBTEST_4(test_cuda_lgamma<float>(100.0f));
1236 CALL_SUBTEST_4(test_cuda_lgamma<float>(0.01f));
1237 CALL_SUBTEST_4(test_cuda_lgamma<float>(0.001f));
1238
1239 CALL_SUBTEST_4(test_cuda_lgamma<double>(1.0));
1240 CALL_SUBTEST_4(test_cuda_lgamma<double>(100.0));
1241 CALL_SUBTEST_4(test_cuda_lgamma<double>(0.01));
1242 CALL_SUBTEST_4(test_cuda_lgamma<double>(0.001));
1243
1244 CALL_SUBTEST_4(test_cuda_erf<float>(1.0f));
1245 CALL_SUBTEST_4(test_cuda_erf<float>(100.0f));
1246 CALL_SUBTEST_4(test_cuda_erf<float>(0.01f));
1247 CALL_SUBTEST_4(test_cuda_erf<float>(0.001f));
1248
1249 CALL_SUBTEST_4(test_cuda_erfc<float>(1.0f));
1250 // CALL_SUBTEST(test_cuda_erfc<float>(100.0f));
1251 CALL_SUBTEST_4(test_cuda_erfc<float>(5.0f)); // CUDA erfc lacks precision for large inputs
1252 CALL_SUBTEST_4(test_cuda_erfc<float>(0.01f));
1253 CALL_SUBTEST_4(test_cuda_erfc<float>(0.001f));
1254
1255 CALL_SUBTEST_4(test_cuda_erf<double>(1.0));
1256 CALL_SUBTEST_4(test_cuda_erf<double>(100.0));
1257 CALL_SUBTEST_4(test_cuda_erf<double>(0.01));
1258 CALL_SUBTEST_4(test_cuda_erf<double>(0.001));
1259
1260 CALL_SUBTEST_4(test_cuda_erfc<double>(1.0));
1261 // CALL_SUBTEST(test_cuda_erfc<double>(100.0));
1262 CALL_SUBTEST_4(test_cuda_erfc<double>(5.0)); // CUDA erfc lacks precision for large inputs
1263 CALL_SUBTEST_4(test_cuda_erfc<double>(0.01));
1264 CALL_SUBTEST_4(test_cuda_erfc<double>(0.001));
1265
1266 CALL_SUBTEST_5(test_cuda_digamma<float>());
1267 CALL_SUBTEST_5(test_cuda_digamma<double>());
1268
1269 CALL_SUBTEST_5(test_cuda_polygamma<float>());
1270 CALL_SUBTEST_5(test_cuda_polygamma<double>());
1271
1272 CALL_SUBTEST_5(test_cuda_zeta<float>());
1273 CALL_SUBTEST_5(test_cuda_zeta<double>());
1274
1275 CALL_SUBTEST_5(test_cuda_igamma<float>());
1276 CALL_SUBTEST_5(test_cuda_igammac<float>());
1277
1278 CALL_SUBTEST_5(test_cuda_igamma<double>());
1279 CALL_SUBTEST_5(test_cuda_igammac<double>());
1280
1281 CALL_SUBTEST_6(test_cuda_betainc<float>());
1282 CALL_SUBTEST_6(test_cuda_betainc<double>());
1283 #endif
1284 }
1285