// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_cuda
#define EIGEN_USE_GPU

#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

using Eigen::Tensor;
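
// The raw cudaMalloc/cudaMemcpy/cudaFree calls below mirror the upstream test
// style and mostly go unchecked. In new code one would typically funnel every
// CUDA runtime call through a checking helper; a minimal sketch (the
// CUDA_CHECK name is our own, not part of Eigen or the CUDA toolkit):
//
//   #define CUDA_CHECK(expr)               \
//     do {                                 \
//       cudaError_t err_ = (expr);         \
//       assert(err_ == cudaSuccess);       \
//     } while (0)
//
//   CUDA_CHECK(cudaMalloc((void**)&d_in1, tensor_bytes));

// Evaluates nullary expressions on the device: one tensor is filled with a
// constant, another with random values. The constant must round-trip exactly,
// and the device-generated random data must differ from the host-side random
// fill.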
void test_cuda_nullary() {
  Tensor<float, 1, 0, int> in1(2);
  Tensor<float, 1, 0, int> in2(2);
  in1.setRandom();
  in2.setRandom();

  std::size_t tensor_bytes = in1.size() * sizeof(float);

  float* d_in1;
  float* d_in2;
  cudaMalloc((void**)(&d_in1), tensor_bytes);
  cudaMalloc((void**)(&d_in2), tensor_bytes);
  cudaMemcpy(d_in1, in1.data(), tensor_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in2, in2.data(), tensor_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in1(
      d_in1, 2);
  Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in2(
      d_in2, 2);

  gpu_in1.device(gpu_device) = gpu_in1.constant(3.14f);
  gpu_in2.device(gpu_device) = gpu_in2.random();

  Tensor<float, 1, 0, int> new1(2);
  Tensor<float, 1, 0, int> new2(2);

  assert(cudaMemcpyAsync(new1.data(), d_in1, tensor_bytes, cudaMemcpyDeviceToHost,
                         gpu_device.stream()) == cudaSuccess);
  assert(cudaMemcpyAsync(new2.data(), d_in2, tensor_bytes, cudaMemcpyDeviceToHost,
                         gpu_device.stream()) == cudaSuccess);

  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 2; ++i) {
    VERIFY_IS_APPROX(new1(i), 3.14f);
    VERIFY_IS_NOT_EQUAL(new2(i), in2(i));
  }

  cudaFree(d_in1);
  cudaFree(d_in2);
}
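
// Element-wise sum of two 2-element device tensors, checked against the same
// sum computed on the host.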
void test_cuda_elementwise_small() {
  Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>(2));
  Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>(2));
  Tensor<float, 1> out(Eigen::array<Eigen::DenseIndex, 1>(2));
  in1.setRandom();
  in2.setRandom();

  std::size_t in1_bytes = in1.size() * sizeof(float);
  std::size_t in2_bytes = in2.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_in1;
  float* d_in2;
  float* d_out;
  cudaMalloc((void**)(&d_in1), in1_bytes);
  cudaMalloc((void**)(&d_in2), in2_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
      d_in1, Eigen::array<Eigen::DenseIndex, 1>(2));
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in2(
      d_in2, Eigen::array<Eigen::DenseIndex, 1>(2));
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_out(
      d_out, Eigen::array<Eigen::DenseIndex, 1>(2));

  gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;

  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost,
                         gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 2; ++i) {
    VERIFY_IS_APPROX(
        out(Eigen::array<Eigen::DenseIndex, 1>(i)),
        in1(Eigen::array<Eigen::DenseIndex, 1>(i)) + in2(Eigen::array<Eigen::DenseIndex, 1>(i)));
  }

  cudaFree(d_in1);
  cudaFree(d_in2);
  cudaFree(d_out);
}
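
// Element-wise expression in1 + in2 * in3 on 72x53x97 tensors, verified
// coefficient by coefficient against the host computation.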
void test_cuda_elementwise()
{
  Tensor<float, 3> in1(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  Tensor<float, 3> in2(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  Tensor<float, 3> in3(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  Tensor<float, 3> out(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  in1.setRandom();
  in2.setRandom();
  in3.setRandom();

  std::size_t in1_bytes = in1.size() * sizeof(float);
  std::size_t in2_bytes = in2.size() * sizeof(float);
  std::size_t in3_bytes = in3.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_in1;
  float* d_in2;
  float* d_in3;
  float* d_out;
  cudaMalloc((void**)(&d_in1), in1_bytes);
  cudaMalloc((void**)(&d_in2), in2_bytes);
  cudaMalloc((void**)(&d_in3), in3_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in3, in3.data(), in3_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));

  gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3;

  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 72; ++i) {
    for (int j = 0; j < 53; ++j) {
      for (int k = 0; k < 97; ++k) {
        VERIFY_IS_APPROX(out(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)),
                         in1(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) +
                         in2(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) *
                         in3(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)));
      }
    }
  }

  cudaFree(d_in1);
  cudaFree(d_in2);
  cudaFree(d_in3);
  cudaFree(d_out);
}
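
// Applies the isnan() predicate on the device and compares the boolean result
// with std::isnan evaluated on the host for every coefficient.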
void test_cuda_props() {
  Tensor<float, 1> in1(200);
  Tensor<bool, 1> out(200);
  in1.setRandom();

  std::size_t in1_bytes = in1.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(bool);

  float* d_in1;
  bool* d_out;
  cudaMalloc((void**)(&d_in1), in1_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
      d_in1, 200);
  Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_out(
      d_out, 200);

  gpu_out.device(gpu_device) = (gpu_in1.isnan)();

  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost,
                         gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 200; ++i) {
    VERIFY_IS_EQUAL(out(i), (std::isnan)(in1(i)));
  }

  cudaFree(d_in1);
  cudaFree(d_out);
}
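
// Max-reduction of a 72x53x97x113 tensor over dimensions 1 and 3; the 72x97
// device result is checked against a brute-force host reduction.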
void test_cuda_reduction()
{
  Tensor<float, 4> in1(72,53,97,113);
  Tensor<float, 2> out(72,97);
  in1.setRandom();

  std::size_t in1_bytes = in1.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_in1;
  float* d_out;
  cudaMalloc((void**)(&d_in1), in1_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113);
  Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);

  array<Eigen::DenseIndex, 2> reduction_axis;
  reduction_axis[0] = 1;
  reduction_axis[1] = 3;

  gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis);

  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 72; ++i) {
    for (int j = 0; j < 97; ++j) {
      // Seed the max with the first reduced element rather than 0 so the
      // reference is correct even if every value in the slice is negative.
      float expected = in1(i, 0, j, 0);
      for (int k = 0; k < 53; ++k) {
        for (int l = 0; l < 113; ++l) {
          expected = std::max<float>(expected, in1(i, k, j, l));
        }
      }
      VERIFY_IS_APPROX(out(i,j), expected);
    }
  }

  cudaFree(d_in1);
  cudaFree(d_out);
}
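
// Contracts a rank-4 tensor with a rank-5 tensor over two dimension pairs.
// The contraction is equivalent to a (300x93) * (93x140) matrix product, so
// the device result is checked against an Eigen::Matrix product computed on
// the host from the same data.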
template<int DataLayout>
void test_cuda_contraction()
{
  // with these dimensions, the output has 300 * 140 elements, which is
  // more than 30 * 1024, which is the number of threads in blocks on
  // a 15 SM GK110 GPU
  Tensor<float, 4, DataLayout> t_left(6, 50, 3, 31);
  Tensor<float, 5, DataLayout> t_right(Eigen::array<Eigen::DenseIndex, 5>(3, 31, 7, 20, 1));
  Tensor<float, 5, DataLayout> t_result(Eigen::array<Eigen::DenseIndex, 5>(6, 50, 7, 20, 1));

  t_left.setRandom();
  t_right.setRandom();

  std::size_t t_left_bytes = t_left.size() * sizeof(float);
  std::size_t t_right_bytes = t_right.size() * sizeof(float);
  std::size_t t_result_bytes = t_result.size() * sizeof(float);

  float* d_t_left;
  float* d_t_right;
  float* d_t_result;

  cudaMalloc((void**)(&d_t_left), t_left_bytes);
  cudaMalloc((void**)(&d_t_right), t_right_bytes);
  cudaMalloc((void**)(&d_t_result), t_result_bytes);

  cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31);
  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_right(d_t_right, 3, 31, 7, 20, 1);
  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_result(d_t_result, 6, 50, 7, 20, 1);

  typedef Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> > MapXf;
  MapXf m_left(t_left.data(), 300, 93);
  MapXf m_right(t_right.data(), 93, 140);
  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(300, 140);

  typedef Tensor<float, 1>::DimensionPair DimPair;
  Eigen::array<DimPair, 2> dims;
  dims[0] = DimPair(2, 0);
  dims[1] = DimPair(3, 1);

  m_result = m_left * m_right;
  gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);

  cudaMemcpy(t_result.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);

  for (DenseIndex i = 0; i < t_result.size(); i++) {
    if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
      std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
      assert(false);
    }
  }

  cudaFree(d_t_left);
  cudaFree(d_t_right);
  cudaFree(d_t_result);
}
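
// 1D convolution of a rank-4 input along dimension 1 with a 4-tap kernel;
// every output coefficient is re-derived on the host.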
template<int DataLayout>
void test_cuda_convolution_1d()
{
  Tensor<float, 4, DataLayout> input(74,37,11,137);
  Tensor<float, 1, DataLayout> kernel(4);
  Tensor<float, 4, DataLayout> out(74,34,11,137);
  input = input.constant(10.0f) + input.random();
  kernel = kernel.constant(7.0f) + kernel.random();

  std::size_t input_bytes = input.size() * sizeof(float);
  std::size_t kernel_bytes = kernel.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_input;
  float* d_kernel;
  float* d_out;
  cudaMalloc((void**)(&d_input), input_bytes);
  cudaMalloc((void**)(&d_kernel), kernel_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137);
  Eigen::TensorMap<Eigen::Tensor<float, 1, DataLayout> > gpu_kernel(d_kernel, 4);
  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out, 74,34,11,137);

  Eigen::array<Eigen::DenseIndex, 1> dims(1);
  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);

  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 74; ++i) {
    for (int j = 0; j < 34; ++j) {
      for (int k = 0; k < 11; ++k) {
        for (int l = 0; l < 137; ++l) {
          const float result = out(i,j,k,l);
          const float expected = input(i,j+0,k,l) * kernel(0) + input(i,j+1,k,l) * kernel(1) +
                                 input(i,j+2,k,l) * kernel(2) + input(i,j+3,k,l) * kernel(3);
          VERIFY_IS_APPROX(result, expected);
        }
      }
    }
  }

  cudaFree(d_input);
  cudaFree(d_kernel);
  cudaFree(d_out);
}
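
// Same 1D convolution, but along the innermost (stride-1) dimension of a
// col-major tensor, i.e. dimension 0.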
void test_cuda_convolution_inner_dim_col_major_1d()
{
  Tensor<float, 4, ColMajor> input(74,9,11,7);
  Tensor<float, 1, ColMajor> kernel(4);
  Tensor<float, 4, ColMajor> out(71,9,11,7);
  input = input.constant(10.0f) + input.random();
  kernel = kernel.constant(7.0f) + kernel.random();

  std::size_t input_bytes = input.size() * sizeof(float);
  std::size_t kernel_bytes = kernel.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_input;
  float* d_kernel;
  float* d_out;
  cudaMalloc((void**)(&d_input), input_bytes);
  cudaMalloc((void**)(&d_kernel), kernel_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7);
  Eigen::TensorMap<Eigen::Tensor<float, 1, ColMajor> > gpu_kernel(d_kernel,4);
  Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_out(d_out,71,9,11,7);

  Eigen::array<Eigen::DenseIndex, 1> dims(0);
  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);

  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 71; ++i) {
    for (int j = 0; j < 9; ++j) {
      for (int k = 0; k < 11; ++k) {
        for (int l = 0; l < 7; ++l) {
          const float result = out(i,j,k,l);
          const float expected = input(i+0,j,k,l) * kernel(0) + input(i+1,j,k,l) * kernel(1) +
                                 input(i+2,j,k,l) * kernel(2) + input(i+3,j,k,l) * kernel(3);
          VERIFY_IS_APPROX(result, expected);
        }
      }
    }
  }

  cudaFree(d_input);
  cudaFree(d_kernel);
  cudaFree(d_out);
}
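
// Same 1D convolution along the innermost dimension of a row-major tensor,
// which for row-major storage is the last dimension (3).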
void test_cuda_convolution_inner_dim_row_major_1d()
{
  Tensor<float, 4, RowMajor> input(7,9,11,74);
  Tensor<float, 1, RowMajor> kernel(4);
  Tensor<float, 4, RowMajor> out(7,9,11,71);
  input = input.constant(10.0f) + input.random();
  kernel = kernel.constant(7.0f) + kernel.random();

  std::size_t input_bytes = input.size() * sizeof(float);
  std::size_t kernel_bytes = kernel.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_input;
  float* d_kernel;
  float* d_out;
  cudaMalloc((void**)(&d_input), input_bytes);
  cudaMalloc((void**)(&d_kernel), kernel_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74);
  Eigen::TensorMap<Eigen::Tensor<float, 1, RowMajor> > gpu_kernel(d_kernel, 4);
  Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_out(d_out, 7,9,11,71);

  Eigen::array<Eigen::DenseIndex, 1> dims(3);
  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);

  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 7; ++i) {
    for (int j = 0; j < 9; ++j) {
      for (int k = 0; k < 11; ++k) {
        for (int l = 0; l < 71; ++l) {
          const float result = out(i,j,k,l);
          const float expected = input(i,j,k,l+0) * kernel(0) + input(i,j,k,l+1) * kernel(1) +
                                 input(i,j,k,l+2) * kernel(2) + input(i,j,k,l+3) * kernel(3);
          VERIFY_IS_APPROX(result, expected);
        }
      }
    }
  }

  cudaFree(d_input);
  cudaFree(d_kernel);
  cudaFree(d_out);
}
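
// 2D convolution over dimensions 1 and 2 with a 3x4 kernel; the valid output
// extents shrink accordingly (37 -> 35 and 11 -> 8).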
template<int DataLayout>
void test_cuda_convolution_2d()
{
  Tensor<float, 4, DataLayout> input(74,37,11,137);
  Tensor<float, 2, DataLayout> kernel(3,4);
  Tensor<float, 4, DataLayout> out(74,35,8,137);
  input = input.constant(10.0f) + input.random();
  kernel = kernel.constant(7.0f) + kernel.random();

  std::size_t input_bytes = input.size() * sizeof(float);
  std::size_t kernel_bytes = kernel.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_input;
  float* d_kernel;
  float* d_out;
  cudaMalloc((void**)(&d_input), input_bytes);
  cudaMalloc((void**)(&d_kernel), kernel_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137);
  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_kernel(d_kernel,3,4);
  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out,74,35,8,137);

  Eigen::array<Eigen::DenseIndex, 2> dims(1,2);
  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);

  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 74; ++i) {
    for (int j = 0; j < 35; ++j) {
      for (int k = 0; k < 8; ++k) {
        for (int l = 0; l < 137; ++l) {
          const float result = out(i,j,k,l);
          const float expected = input(i,j+0,k+0,l) * kernel(0,0) +
                                 input(i,j+1,k+0,l) * kernel(1,0) +
                                 input(i,j+2,k+0,l) * kernel(2,0) +
                                 input(i,j+0,k+1,l) * kernel(0,1) +
                                 input(i,j+1,k+1,l) * kernel(1,1) +
                                 input(i,j+2,k+1,l) * kernel(2,1) +
                                 input(i,j+0,k+2,l) * kernel(0,2) +
                                 input(i,j+1,k+2,l) * kernel(1,2) +
                                 input(i,j+2,k+2,l) * kernel(2,2) +
                                 input(i,j+0,k+3,l) * kernel(0,3) +
                                 input(i,j+1,k+3,l) * kernel(1,3) +
                                 input(i,j+2,k+3,l) * kernel(2,3);
          VERIFY_IS_APPROX(result, expected);
        }
      }
    }
  }

  cudaFree(d_input);
  cudaFree(d_kernel);
  cudaFree(d_out);
}
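
// 3D convolution of a rank-5 input over dimensions 1, 2 and 3 with a 3x4x2
// kernel.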
template<int DataLayout>
void test_cuda_convolution_3d()
{
  Tensor<float, 5, DataLayout> input(Eigen::array<Eigen::DenseIndex, 5>(74,37,11,137,17));
  Tensor<float, 3, DataLayout> kernel(3,4,2);
  Tensor<float, 5, DataLayout> out(Eigen::array<Eigen::DenseIndex, 5>(74,35,8,136,17));
  input = input.constant(10.0f) + input.random();
  kernel = kernel.constant(7.0f) + kernel.random();

  std::size_t input_bytes = input.size() * sizeof(float);
  std::size_t kernel_bytes = kernel.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_input;
  float* d_kernel;
  float* d_out;
  cudaMalloc((void**)(&d_input), input_bytes);
  cudaMalloc((void**)(&d_kernel), kernel_bytes);
  cudaMalloc((void**)(&d_out), out_bytes);

  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17);
  Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > gpu_kernel(d_kernel,3,4,2);
  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_out(d_out,74,35,8,136,17);

  Eigen::array<Eigen::DenseIndex, 3> dims(1,2,3);
  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);

  assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 74; ++i) {
    for (int j = 0; j < 35; ++j) {
      for (int k = 0; k < 8; ++k) {
        for (int l = 0; l < 136; ++l) {
          for (int m = 0; m < 17; ++m) {
            const float result = out(i,j,k,l,m);
            const float expected = input(i,j+0,k+0,l+0,m) * kernel(0,0,0) +
                                   input(i,j+1,k+0,l+0,m) * kernel(1,0,0) +
                                   input(i,j+2,k+0,l+0,m) * kernel(2,0,0) +
                                   input(i,j+0,k+1,l+0,m) * kernel(0,1,0) +
                                   input(i,j+1,k+1,l+0,m) * kernel(1,1,0) +
                                   input(i,j+2,k+1,l+0,m) * kernel(2,1,0) +
                                   input(i,j+0,k+2,l+0,m) * kernel(0,2,0) +
                                   input(i,j+1,k+2,l+0,m) * kernel(1,2,0) +
                                   input(i,j+2,k+2,l+0,m) * kernel(2,2,0) +
                                   input(i,j+0,k+3,l+0,m) * kernel(0,3,0) +
                                   input(i,j+1,k+3,l+0,m) * kernel(1,3,0) +
                                   input(i,j+2,k+3,l+0,m) * kernel(2,3,0) +
                                   input(i,j+0,k+0,l+1,m) * kernel(0,0,1) +
                                   input(i,j+1,k+0,l+1,m) * kernel(1,0,1) +
                                   input(i,j+2,k+0,l+1,m) * kernel(2,0,1) +
                                   input(i,j+0,k+1,l+1,m) * kernel(0,1,1) +
                                   input(i,j+1,k+1,l+1,m) * kernel(1,1,1) +
                                   input(i,j+2,k+1,l+1,m) * kernel(2,1,1) +
                                   input(i,j+0,k+2,l+1,m) * kernel(0,2,1) +
                                   input(i,j+1,k+2,l+1,m) * kernel(1,2,1) +
                                   input(i,j+2,k+2,l+1,m) * kernel(2,2,1) +
                                   input(i,j+0,k+3,l+1,m) * kernel(0,3,1) +
                                   input(i,j+1,k+3,l+1,m) * kernel(1,3,1) +
                                   input(i,j+2,k+3,l+1,m) * kernel(2,3,1);
            VERIFY_IS_APPROX(result, expected);
          }
        }
      }
    }
  }

  cudaFree(d_input);
  cudaFree(d_kernel);
  cudaFree(d_out);
}
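
// Evaluates lgamma() on the device over random inputs scaled by stddev and
// compares against std::lgamma on the host.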
template <typename Scalar>
void test_cuda_lgamma(const Scalar stddev)
{
  Tensor<Scalar, 2> in(72,97);
  in.setRandom();
  in *= in.constant(stddev);
  Tensor<Scalar, 2> out(72,97);
  out.setZero();

  std::size_t bytes = in.size() * sizeof(Scalar);

  Scalar* d_in;
  Scalar* d_out;
  cudaMalloc((void**)(&d_in), bytes);
  cudaMalloc((void**)(&d_out), bytes);

  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);

  gpu_out.device(gpu_device) = gpu_in.lgamma();

  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 72; ++i) {
    for (int j = 0; j < 97; ++j) {
      VERIFY_IS_APPROX(out(i,j), (std::lgamma)(in(i,j)));
    }
  }

  cudaFree(d_in);
  cudaFree(d_out);
}
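
// Evaluates digamma() at a few reference points, including 0 and -1 where the
// test expects +infinity.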
template <typename Scalar>
void test_cuda_digamma()
{
  Tensor<Scalar, 1> in(7);
  Tensor<Scalar, 1> out(7);
  Tensor<Scalar, 1> expected_out(7);
  out.setZero();

  in(0) = Scalar(1);
  in(1) = Scalar(1.5);
  in(2) = Scalar(4);
  in(3) = Scalar(-10.5);
  in(4) = Scalar(10000.5);
  in(5) = Scalar(0);
  in(6) = Scalar(-1);

  expected_out(0) = Scalar(-0.5772156649015329);
  expected_out(1) = Scalar(0.03648997397857645);
  expected_out(2) = Scalar(1.2561176684318);
  expected_out(3) = Scalar(2.398239129535781);
  expected_out(4) = Scalar(9.210340372392849);
  expected_out(5) = std::numeric_limits<Scalar>::infinity();
  expected_out(6) = std::numeric_limits<Scalar>::infinity();

  std::size_t bytes = in.size() * sizeof(Scalar);

  Scalar* d_in;
  Scalar* d_out;
  cudaMalloc((void**)(&d_in), bytes);
  cudaMalloc((void**)(&d_out), bytes);

  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 7);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);

  gpu_out.device(gpu_device) = gpu_in.digamma();

  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 5; ++i) {
    VERIFY_IS_APPROX(out(i), expected_out(i));
  }
  for (int i = 5; i < 7; ++i) {
    VERIFY_IS_EQUAL(out(i), expected_out(i));
  }

  cudaFree(d_in);
  cudaFree(d_out);
}
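
// Evaluates the Hurwitz zeta function zeta(x, q) at reference points: out(0)
// must be +infinity, out(3) must be NaN, and the remaining entries must match
// the precomputed values.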
template <typename Scalar>
void test_cuda_zeta()
{
  Tensor<Scalar, 1> in_x(6);
  Tensor<Scalar, 1> in_q(6);
  Tensor<Scalar, 1> out(6);
  Tensor<Scalar, 1> expected_out(6);
  out.setZero();

  in_x(0) = Scalar(1);
  in_x(1) = Scalar(1.5);
  in_x(2) = Scalar(4);
  in_x(3) = Scalar(-10.5);
  in_x(4) = Scalar(10000.5);
  in_x(5) = Scalar(3);

  in_q(0) = Scalar(1.2345);
  in_q(1) = Scalar(2);
  in_q(2) = Scalar(1.5);
  in_q(3) = Scalar(3);
  in_q(4) = Scalar(1.0001);
  in_q(5) = Scalar(-2.5);

  expected_out(0) = std::numeric_limits<Scalar>::infinity();
  expected_out(1) = Scalar(1.61237534869);
  expected_out(2) = Scalar(0.234848505667);
  expected_out(3) = Scalar(1.03086757337e-5);
  expected_out(4) = Scalar(0.367879440865);
  expected_out(5) = Scalar(0.054102025820864097);

  std::size_t bytes = in_x.size() * sizeof(Scalar);

  Scalar* d_in_x;
  Scalar* d_in_q;
  Scalar* d_out;
  cudaMalloc((void**)(&d_in_x), bytes);
  cudaMalloc((void**)(&d_in_q), bytes);
  cudaMalloc((void**)(&d_out), bytes);

  cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in_q, in_q.data(), bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_q(d_in_q, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6);

  gpu_out.device(gpu_device) = gpu_in_x.zeta(gpu_in_q);

  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  VERIFY_IS_EQUAL(out(0), expected_out(0));
  VERIFY((std::isnan)(out(3)));

  for (int i = 1; i < 6; ++i) {
    if (i != 3) {
      VERIFY_IS_APPROX(out(i), expected_out(i));
    }
  }

  cudaFree(d_in_x);
  cudaFree(d_in_q);
  cudaFree(d_out);
}
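
// Evaluates polygamma(n, x) at reference points and compares against
// precomputed values.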
template <typename Scalar>
void test_cuda_polygamma()
{
  Tensor<Scalar, 1> in_x(7);
  Tensor<Scalar, 1> in_n(7);
  Tensor<Scalar, 1> out(7);
  Tensor<Scalar, 1> expected_out(7);
  out.setZero();

  in_n(0) = Scalar(1);
  in_n(1) = Scalar(1);
  in_n(2) = Scalar(1);
  in_n(3) = Scalar(17);
  in_n(4) = Scalar(31);
  in_n(5) = Scalar(28);
  in_n(6) = Scalar(8);

  in_x(0) = Scalar(2);
  in_x(1) = Scalar(3);
  in_x(2) = Scalar(25.5);
  in_x(3) = Scalar(4.7);
  in_x(4) = Scalar(11.8);
  in_x(5) = Scalar(17.7);
  in_x(6) = Scalar(30.2);

  expected_out(0) = Scalar(0.644934066848);
  expected_out(1) = Scalar(0.394934066848);
  expected_out(2) = Scalar(0.0399946696496);
  expected_out(3) = Scalar(293.334565435);
  expected_out(4) = Scalar(0.445487887616);
  expected_out(5) = Scalar(-2.47810300902e-07);
  expected_out(6) = Scalar(-8.29668781082e-09);

  std::size_t bytes = in_x.size() * sizeof(Scalar);

  Scalar* d_in_x;
  Scalar* d_in_n;
  Scalar* d_out;
  cudaMalloc((void**)(&d_in_x), bytes);
  cudaMalloc((void**)(&d_in_n), bytes);
  cudaMalloc((void**)(&d_out), bytes);

  cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in_n, in_n.data(), bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 7);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_n(d_in_n, 7);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);

  gpu_out.device(gpu_device) = gpu_in_n.polygamma(gpu_in_x);

  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 7; ++i) {
    VERIFY_IS_APPROX(out(i), expected_out(i));
  }

  cudaFree(d_in_x);
  cudaFree(d_in_n);
  cudaFree(d_out);
}
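
// Regularized lower incomplete gamma igamma(a, x) on a 6x6 grid of (a, x)
// combinations; NaN entries in the reference table are checked with
// std::isnan.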
template <typename Scalar>
void test_cuda_igamma()
{
  Tensor<Scalar, 2> a(6, 6);
  Tensor<Scalar, 2> x(6, 6);
  Tensor<Scalar, 2> out(6, 6);
  out.setZero();

  Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
  Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};

  for (int i = 0; i < 6; ++i) {
    for (int j = 0; j < 6; ++j) {
      a(i, j) = a_s[i];
      x(i, j) = x_s[j];
    }
  }

  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
  Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan},
                          {0.0, 0.6321205588285578, 0.7768698398515702,
                           0.9816843611112658, 9.999500016666262e-05, 1.0},
                          {0.0, 0.4275932955291202, 0.608374823728911,
                           0.9539882943107686, 7.522076445089201e-07, 1.0},
                          {0.0, 0.01898815687615381, 0.06564245437845008,
                           0.5665298796332909, 4.166333347221828e-18, 1.0},
                          {0.0, 0.9999780593618628, 0.9999899967080838,
                           0.9999996219837988, 0.9991370418689945, 1.0},
                          {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};

  std::size_t bytes = a.size() * sizeof(Scalar);

  Scalar* d_a;
  Scalar* d_x;
  Scalar* d_out;
  assert(cudaMalloc((void**)(&d_a), bytes) == cudaSuccess);
  assert(cudaMalloc((void**)(&d_x), bytes) == cudaSuccess);
  assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess);

  cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);

  gpu_out.device(gpu_device) = gpu_a.igamma(gpu_x);

  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 6; ++i) {
    for (int j = 0; j < 6; ++j) {
      if ((std::isnan)(igamma_s[i][j])) {
        VERIFY((std::isnan)(out(i, j)));
      } else {
        VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]);
      }
    }
  }

  cudaFree(d_a);
  cudaFree(d_x);
  cudaFree(d_out);
}
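
// Complementary incomplete gamma igammac(a, x) on the same 6x6 grid;
// igammac(a, x) = 1 - igamma(a, x).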
template <typename Scalar>
void test_cuda_igammac()
{
  Tensor<Scalar, 2> a(6, 6);
  Tensor<Scalar, 2> x(6, 6);
  Tensor<Scalar, 2> out(6, 6);
  out.setZero();

  Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
  Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};

  for (int i = 0; i < 6; ++i) {
    for (int j = 0; j < 6; ++j) {
      a(i, j) = a_s[i];
      x(i, j) = x_s[j];
    }
  }

  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
  Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan},
                           {1.0, 0.36787944117144233, 0.22313016014842982,
                            0.018315638888734182, 0.9999000049998333, 0.0},
                           {1.0, 0.5724067044708798, 0.3916251762710878,
                            0.04601170568923136, 0.9999992477923555, 0.0},
                           {1.0, 0.9810118431238462, 0.9343575456215499,
                            0.4334701203667089, 1.0, 0.0},
                           {1.0, 2.1940638138146658e-05, 1.0003291916285e-05,
                            3.7801620118431334e-07, 0.0008629581310054535,
                            0.0},
                           {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};

  std::size_t bytes = a.size() * sizeof(Scalar);

  Scalar* d_a;
  Scalar* d_x;
  Scalar* d_out;
  cudaMalloc((void**)(&d_a), bytes);
  cudaMalloc((void**)(&d_x), bytes);
  cudaMalloc((void**)(&d_out), bytes);

  cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);

  gpu_out.device(gpu_device) = gpu_a.igammac(gpu_x);

  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 6; ++i) {
    for (int j = 0; j < 6; ++j) {
      if ((std::isnan)(igammac_s[i][j])) {
        VERIFY((std::isnan)(out(i, j)));
      } else {
        VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]);
      }
    }
  }

  cudaFree(d_a);
  cudaFree(d_x);
  cudaFree(d_out);
}
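
// Evaluates erf() on the device over random inputs scaled by stddev and
// compares against std::erf on the host.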
template <typename Scalar>
void test_cuda_erf(const Scalar stddev)
{
  Tensor<Scalar, 2> in(72,97);
  in.setRandom();
  in *= in.constant(stddev);
  Tensor<Scalar, 2> out(72,97);
  out.setZero();

  std::size_t bytes = in.size() * sizeof(Scalar);

  Scalar* d_in;
  Scalar* d_out;
  assert(cudaMalloc((void**)(&d_in), bytes) == cudaSuccess);
  assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess);

  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);

  gpu_out.device(gpu_device) = gpu_in.erf();

  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 72; ++i) {
    for (int j = 0; j < 97; ++j) {
      VERIFY_IS_APPROX(out(i,j), (std::erf)(in(i,j)));
    }
  }

  cudaFree(d_in);
  cudaFree(d_out);
}
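
// Evaluates erfc() on the device and compares against std::erfc on the host.
// The callers avoid large scales because CUDA's erfc lacks precision for
// large inputs (see the commented-out subtests in the driver below).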
template <typename Scalar>
void test_cuda_erfc(const Scalar stddev)
{
  Tensor<Scalar, 2> in(72,97);
  in.setRandom();
  in *= in.constant(stddev);
  Tensor<Scalar, 2> out(72,97);
  out.setZero();

  std::size_t bytes = in.size() * sizeof(Scalar);

  Scalar* d_in;
  Scalar* d_out;
  cudaMalloc((void**)(&d_in), bytes);
  cudaMalloc((void**)(&d_out), bytes);

  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);

  gpu_out.device(gpu_device) = gpu_in.erfc();

  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 0; i < 72; ++i) {
    for (int j = 0; j < 97; ++j) {
      VERIFY_IS_APPROX(out(i,j), (std::erfc)(in(i,j)));
    }
  }

  cudaFree(d_in);
  cudaFree(d_out);
}
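
// Regularized incomplete beta function betainc(a, b, x) over a 5x5x5 grid of
// (a, b, x) combinations; invalid combinations are expected to produce NaN.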
template <typename Scalar>
void test_cuda_betainc()
{
  Tensor<Scalar, 1> in_x(125);
  Tensor<Scalar, 1> in_a(125);
  Tensor<Scalar, 1> in_b(125);
  Tensor<Scalar, 1> out(125);
  Tensor<Scalar, 1> expected_out(125);
  out.setZero();

  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();

  Array<Scalar, 1, Dynamic> x(125);
  Array<Scalar, 1, Dynamic> a(125);
  Array<Scalar, 1, Dynamic> b(125);
  Array<Scalar, 1, Dynamic> v(125);

  a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
      0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
      0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
      999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
      999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
      999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999;

  b << 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
      0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
      999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
      0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
      999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
      0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
      999.999, 999.999, 999.999;

  x << -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
      1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
      0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
      0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1,
      0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1,
      -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
      1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
      0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
      0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1;

  v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
      nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
      nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan,
      0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan,
      0.999995949033062, 0.9999999999993698, 0.9999999999999999, nan, nan,
      0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan,
      nan, nan, nan, nan, 0.006827081192655869, 0.0210336989586256,
      0.04813160422599567, nan, nan, 0.20014344256217678, 0.5000000000000001,
      0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403,
      0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999,
      0.9999999999999999, nan, nan, nan, nan, nan, nan, nan,
      1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, nan,
      nan, 7.864342668429763e-23, 3.015969667594166e-10, 0.0008598571564165444,
      nan, nan, 6.031987710123844e-08, 0.5000000000000007, 0.9999999396801229,
      nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan,
      nan, nan, nan, nan, nan, nan, 0.0, 7.029920380986636e-306,
      2.2450728208591345e-101, nan, nan, 0.0, 9.275871147869727e-302,
      1.2232913026152827e-97, nan, nan, 0.0, 3.0891393081932924e-252,
      2.9303043666183996e-60, nan, nan, 2.248913486879199e-196,
      0.5000000000004947, 0.9999999999999999, nan;

  for (int i = 0; i < 125; ++i) {
    in_x(i) = x(i);
    in_a(i) = a(i);
    in_b(i) = b(i);
    expected_out(i) = v(i);
  }

  std::size_t bytes = in_x.size() * sizeof(Scalar);

  Scalar* d_in_x;
  Scalar* d_in_a;
  Scalar* d_in_b;
  Scalar* d_out;
  cudaMalloc((void**)(&d_in_x), bytes);
  cudaMalloc((void**)(&d_in_a), bytes);
  cudaMalloc((void**)(&d_in_b), bytes);
  cudaMalloc((void**)(&d_out), bytes);

  cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in_a, in_a.data(), bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in_b, in_b.data(), bytes, cudaMemcpyHostToDevice);

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 125);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_a(d_in_a, 125);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_b(d_in_b, 125);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 125);

  gpu_out.device(gpu_device) = betainc(gpu_in_a, gpu_in_b, gpu_in_x);

  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);

  for (int i = 1; i < 125; ++i) {
    if ((std::isnan)(expected_out(i))) {
      VERIFY((std::isnan)(out(i)));
    } else {
      VERIFY_IS_APPROX(out(i), expected_out(i));
    }
  }

  cudaFree(d_in_x);
  cudaFree(d_in_a);
  cudaFree(d_in_b);
  cudaFree(d_out);
}
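
// Test entry point, invoked through the EIGEN_TEST_FUNC macro defined at the
// top of this file. The CALL_SUBTEST_N macros group the subtests so the Eigen
// test harness can build and run each group separately.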
void test_cxx11_tensor_cuda()
{
  CALL_SUBTEST_1(test_cuda_nullary());
  CALL_SUBTEST_1(test_cuda_elementwise_small());
  CALL_SUBTEST_1(test_cuda_elementwise());
  CALL_SUBTEST_1(test_cuda_props());
  CALL_SUBTEST_1(test_cuda_reduction());
  CALL_SUBTEST_2(test_cuda_contraction<ColMajor>());
  CALL_SUBTEST_2(test_cuda_contraction<RowMajor>());
  CALL_SUBTEST_3(test_cuda_convolution_1d<ColMajor>());
  CALL_SUBTEST_3(test_cuda_convolution_1d<RowMajor>());
  CALL_SUBTEST_3(test_cuda_convolution_inner_dim_col_major_1d());
  CALL_SUBTEST_3(test_cuda_convolution_inner_dim_row_major_1d());
  CALL_SUBTEST_3(test_cuda_convolution_2d<ColMajor>());
  CALL_SUBTEST_3(test_cuda_convolution_2d<RowMajor>());
  CALL_SUBTEST_3(test_cuda_convolution_3d<ColMajor>());
  CALL_SUBTEST_3(test_cuda_convolution_3d<RowMajor>());

#if __cplusplus > 199711L
  // std::erf, std::erfc, and so on were only added in C++11. We use them as a
  // golden reference to validate the results produced by Eigen. Therefore we
  // can only run these tests when compiling with a C++11 compiler.
  CALL_SUBTEST_4(test_cuda_lgamma<float>(1.0f));
  CALL_SUBTEST_4(test_cuda_lgamma<float>(100.0f));
  CALL_SUBTEST_4(test_cuda_lgamma<float>(0.01f));
  CALL_SUBTEST_4(test_cuda_lgamma<float>(0.001f));

  CALL_SUBTEST_4(test_cuda_lgamma<double>(1.0));
  CALL_SUBTEST_4(test_cuda_lgamma<double>(100.0));
  CALL_SUBTEST_4(test_cuda_lgamma<double>(0.01));
  CALL_SUBTEST_4(test_cuda_lgamma<double>(0.001));

  CALL_SUBTEST_4(test_cuda_erf<float>(1.0f));
  CALL_SUBTEST_4(test_cuda_erf<float>(100.0f));
  CALL_SUBTEST_4(test_cuda_erf<float>(0.01f));
  CALL_SUBTEST_4(test_cuda_erf<float>(0.001f));

  CALL_SUBTEST_4(test_cuda_erfc<float>(1.0f));
  // CALL_SUBTEST(test_cuda_erfc<float>(100.0f));
  CALL_SUBTEST_4(test_cuda_erfc<float>(5.0f)); // CUDA erfc lacks precision for large inputs
  CALL_SUBTEST_4(test_cuda_erfc<float>(0.01f));
  CALL_SUBTEST_4(test_cuda_erfc<float>(0.001f));

  CALL_SUBTEST_4(test_cuda_erf<double>(1.0));
  CALL_SUBTEST_4(test_cuda_erf<double>(100.0));
  CALL_SUBTEST_4(test_cuda_erf<double>(0.01));
  CALL_SUBTEST_4(test_cuda_erf<double>(0.001));

  CALL_SUBTEST_4(test_cuda_erfc<double>(1.0));
  // CALL_SUBTEST(test_cuda_erfc<double>(100.0));
  CALL_SUBTEST_4(test_cuda_erfc<double>(5.0)); // CUDA erfc lacks precision for large inputs
  CALL_SUBTEST_4(test_cuda_erfc<double>(0.01));
  CALL_SUBTEST_4(test_cuda_erfc<double>(0.001));

  CALL_SUBTEST_5(test_cuda_digamma<float>());
  CALL_SUBTEST_5(test_cuda_digamma<double>());

  CALL_SUBTEST_5(test_cuda_polygamma<float>());
  CALL_SUBTEST_5(test_cuda_polygamma<double>());

  CALL_SUBTEST_5(test_cuda_zeta<float>());
  CALL_SUBTEST_5(test_cuda_zeta<double>());

  CALL_SUBTEST_5(test_cuda_igamma<float>());
  CALL_SUBTEST_5(test_cuda_igammac<float>());

  CALL_SUBTEST_5(test_cuda_igamma<double>());
  CALL_SUBTEST_5(test_cuda_igammac<double>());

  CALL_SUBTEST_6(test_cuda_betainc<float>());
  CALL_SUBTEST_6(test_cuda_betainc<double>());
#endif
}