1 /* =========================================================================
2    Copyright (c) 2010-2016, Institute for Microelectronics,
3                             Institute for Analysis and Scientific Computing,
4                             TU Wien.
5    Portions of this software are copyright by UChicago Argonne, LLC.
6 
7                             -----------------
8                   ViennaCL - The Vienna Computing Library
9                             -----------------
10 
11    Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
12 
13    (A list of authors and contributors can be found in the PDF manual)
14 
15    License:         MIT (X11), see file LICENSE in the base directory
16 ============================================================================= */
17 
18 
19 /*
20 *   Benchmark:  Sparse matrix operations, i.e. matrix-vector products (sparse.cpp and sparse.cu are identical, the latter being required for compilation using CUDA nvcc)
21 *
22 */
23 
24 //#define VIENNACL_BUILD_INFO
25 #ifndef NDEBUG
26  #define NDEBUG
27 #endif
28 
29 #define VIENNACL_WITH_UBLAS 1
30 
31 #include <boost/numeric/ublas/triangular.hpp>
32 #include <boost/numeric/ublas/vector.hpp>
33 #include <boost/numeric/ublas/vector_proxy.hpp>
34 #include <boost/numeric/ublas/matrix_sparse.hpp>
35 #include <boost/numeric/ublas/operation_sparse.hpp>
36 #include <boost/numeric/ublas/lu.hpp>
37 
38 
39 #include "viennacl/scalar.hpp"
40 #include "viennacl/vector.hpp"
41 #include "viennacl/coordinate_matrix.hpp"
42 #include "viennacl/compressed_matrix.hpp"
43 #include "viennacl/ell_matrix.hpp"
44 #include "viennacl/hyb_matrix.hpp"
45 #include "viennacl/sliced_ell_matrix.hpp"
46 #include "viennacl/linalg/prod.hpp"
47 #include "viennacl/linalg/norm_2.hpp"
48 #include "viennacl/io/matrix_market.hpp"
49 #include "viennacl/linalg/ilu.hpp"
50 #include "viennacl/tools/timer.hpp"
51 
52 
53 #include <iostream>
54 #include <vector>
55 
56 
57 #define BENCHMARK_RUNS          10
58 
59 
/** @brief Prints the achieved throughput in GFLOPs.
*
* @param num_ops    Number of floating point operations performed
* @param exec_time  Wall-clock time in seconds the operations took
*/
inline void printOps(double num_ops, double exec_time)
{
  double const giga = 1.0e9;  // 10^9 operations per GFLOP
  std::cout << "GFLOPs: " << num_ops / (exec_time * giga) << std::endl;
}
64 
65 
66 template<typename ScalarType>
run_benchmark()67 int run_benchmark()
68 {
69   viennacl::tools::timer timer;
70   double exec_time;
71 
72   ScalarType std_factor1 = ScalarType(3.1415);
73   ScalarType std_factor2 = ScalarType(42.0);
74   viennacl::scalar<ScalarType> vcl_factor1(std_factor1);
75   viennacl::scalar<ScalarType> vcl_factor2(std_factor2);
76 
77   boost::numeric::ublas::vector<ScalarType> ublas_vec1;
78   boost::numeric::ublas::vector<ScalarType> ublas_vec2;
79 
80   boost::numeric::ublas::compressed_matrix<ScalarType> ublas_matrix;
81   if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../examples/testdata/mat65k.mtx"))
82   {
83     std::cout << "Error reading Matrix file" << std::endl;
84     return 0;
85   }
86   //unsigned int cg_mat_size = cg_mat.size();
87   std::cout << "done reading matrix" << std::endl;
88 
89   ublas_vec1 = boost::numeric::ublas::scalar_vector<ScalarType>(ublas_matrix.size1(), ScalarType(1.0));
90   ublas_vec2 = ublas_vec1;
91 
92   viennacl::compressed_matrix<ScalarType, 1> vcl_compressed_matrix_1;
93   viennacl::compressed_matrix<ScalarType, 4> vcl_compressed_matrix_4;
94   viennacl::compressed_matrix<ScalarType, 8> vcl_compressed_matrix_8;
95 
96   viennacl::coordinate_matrix<ScalarType> vcl_coordinate_matrix_128;
97 
98   viennacl::ell_matrix<ScalarType, 1> vcl_ell_matrix_1;
99   viennacl::hyb_matrix<ScalarType, 1> vcl_hyb_matrix_1;
100   viennacl::sliced_ell_matrix<ScalarType> vcl_sliced_ell_matrix_1;
101 
102   viennacl::vector<ScalarType> vcl_vec1(ublas_vec1.size());
103   viennacl::vector<ScalarType> vcl_vec2(ublas_vec1.size());
104 
105   //cpu to gpu:
106   viennacl::copy(ublas_matrix, vcl_compressed_matrix_1);
107   #ifndef VIENNACL_EXPERIMENTAL_DOUBLE_PRECISION_WITH_STREAM_SDK_ON_GPU
108   viennacl::copy(ublas_matrix, vcl_compressed_matrix_4);
109   viennacl::copy(ublas_matrix, vcl_compressed_matrix_8);
110   #endif
111   viennacl::copy(ublas_matrix, vcl_coordinate_matrix_128);
112   viennacl::copy(ublas_matrix, vcl_ell_matrix_1);
113   viennacl::copy(ublas_matrix, vcl_hyb_matrix_1);
114   viennacl::copy(ublas_matrix, vcl_sliced_ell_matrix_1);
115   viennacl::copy(ublas_vec1, vcl_vec1);
116   viennacl::copy(ublas_vec2, vcl_vec2);
117 
118 
119   ///////////// Matrix operations /////////////////
120 
121   std::cout << "------- Matrix-Vector product on CPU ----------" << std::endl;
122   timer.start();
123   for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
124   {
125     //ublas_vec1 = boost::numeric::ublas::prod(ublas_matrix, ublas_vec2);
126     boost::numeric::ublas::axpy_prod(ublas_matrix, ublas_vec2, ublas_vec1, true);
127   }
128   exec_time = timer.get();
129   std::cout << "CPU time: " << exec_time << std::endl;
130   std::cout << "CPU "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
131   std::cout << ublas_vec1[0] << std::endl;
132 
133 
134   std::cout << "------- Matrix-Vector product with compressed_matrix ----------" << std::endl;
135 
136 
137   vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_1, vcl_vec2); //startup calculation
138   vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_4, vcl_vec2); //startup calculation
139   vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_8, vcl_vec2); //startup calculation
140   //std_result = 0.0;
141 
142   viennacl::backend::finish();
143   timer.start();
144   for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
145   {
146     vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_1, vcl_vec2);
147   }
148   viennacl::backend::finish();
149   exec_time = timer.get();
150   std::cout << "GPU time align1: " << exec_time << std::endl;
151   std::cout << "GPU align1 "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
152   std::cout << vcl_vec1[0] << std::endl;
153 
154   std::cout << "Testing triangular solves: compressed_matrix" << std::endl;
155 
156   viennacl::copy(ublas_vec1, vcl_vec1);
157   viennacl::linalg::inplace_solve(trans(vcl_compressed_matrix_1), vcl_vec1, viennacl::linalg::unit_lower_tag());
158   viennacl::copy(ublas_vec1, vcl_vec1);
159   std::cout << "ublas..." << std::endl;
160   timer.start();
161   boost::numeric::ublas::inplace_solve(trans(ublas_matrix), ublas_vec1, boost::numeric::ublas::unit_lower_tag());
162   std::cout << "Time elapsed: " << timer.get() << std::endl;
163   std::cout << "ViennaCL..." << std::endl;
164   viennacl::backend::finish();
165   timer.start();
166   viennacl::linalg::inplace_solve(trans(vcl_compressed_matrix_1), vcl_vec1, viennacl::linalg::unit_lower_tag());
167   viennacl::backend::finish();
168   std::cout << "Time elapsed: " << timer.get() << std::endl;
169 
170   ublas_vec1 = boost::numeric::ublas::prod(ublas_matrix, ublas_vec2);
171 
172   viennacl::backend::finish();
173   timer.start();
174   for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
175   {
176     vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_4, vcl_vec2);
177   }
178   viennacl::backend::finish();
179   exec_time = timer.get();
180   std::cout << "GPU time align4: " << exec_time << std::endl;
181   std::cout << "GPU align4 "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
182   std::cout << vcl_vec1[0] << std::endl;
183 
184   viennacl::backend::finish();
185   timer.start();
186   for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
187   {
188     vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_8, vcl_vec2);
189   }
190   viennacl::backend::finish();
191   exec_time = timer.get();
192   std::cout << "GPU time align8: " << exec_time << std::endl;
193   std::cout << "GPU align8 "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
194   std::cout << vcl_vec1[0] << std::endl;
195 
196 
197   std::cout << "------- Matrix-Vector product with coordinate_matrix ----------" << std::endl;
198   vcl_vec1 = viennacl::linalg::prod(vcl_coordinate_matrix_128, vcl_vec2); //startup calculation
199   viennacl::backend::finish();
200 
201   viennacl::copy(vcl_vec1, ublas_vec2);
202   long err_cnt = 0;
203   for (std::size_t i=0; i<ublas_vec1.size(); ++i)
204   {
205     if ( fabs(ublas_vec1[i] - ublas_vec2[i]) / std::max(fabs(ublas_vec1[i]), fabs(ublas_vec2[i])) > 1e-2)
206     {
207       std::cout << "Error at index " << i << ": Should: " << ublas_vec1[i] << ", Is: " << ublas_vec2[i] << std::endl;
208       ++err_cnt;
209       if (err_cnt > 5)
210         break;
211     }
212   }
213 
214   viennacl::backend::finish();
215   timer.start();
216   for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
217   {
218     vcl_vec1 = viennacl::linalg::prod(vcl_coordinate_matrix_128, vcl_vec2);
219   }
220   viennacl::backend::finish();
221   exec_time = timer.get();
222   std::cout << "GPU time: " << exec_time << std::endl;
223   std::cout << "GPU "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
224   std::cout << vcl_vec1[0] << std::endl;
225 
226 
227   std::cout << "------- Matrix-Vector product with ell_matrix ----------" << std::endl;
228   vcl_vec1 = viennacl::linalg::prod(vcl_ell_matrix_1, vcl_vec2); //startup calculation
229   viennacl::backend::finish();
230 
231   viennacl::copy(vcl_vec1, ublas_vec2);
232   err_cnt = 0;
233   for (std::size_t i=0; i<ublas_vec1.size(); ++i)
234   {
235     if ( fabs(ublas_vec1[i] - ublas_vec2[i]) / std::max(fabs(ublas_vec1[i]), fabs(ublas_vec2[i])) > 1e-2)
236     {
237       std::cout << "Error at index " << i << ": Should: " << ublas_vec1[i] << ", Is: " << ublas_vec2[i] << std::endl;
238       ++err_cnt;
239       if (err_cnt > 5)
240         break;
241     }
242   }
243 
244   viennacl::backend::finish();
245   timer.start();
246   for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
247   {
248     vcl_vec1 = viennacl::linalg::prod(vcl_ell_matrix_1, vcl_vec2);
249   }
250   viennacl::backend::finish();
251   exec_time = timer.get();
252   std::cout << "GPU time: " << exec_time << std::endl;
253   std::cout << "GPU "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
254   std::cout << vcl_vec1[0] << std::endl;
255 
256 
257   std::cout << "------- Matrix-Vector product with hyb_matrix ----------" << std::endl;
258   vcl_vec1 = viennacl::linalg::prod(vcl_hyb_matrix_1, vcl_vec2); //startup calculation
259   viennacl::backend::finish();
260 
261   viennacl::copy(vcl_vec1, ublas_vec2);
262   err_cnt = 0;
263   for (std::size_t i=0; i<ublas_vec1.size(); ++i)
264   {
265     if ( fabs(ublas_vec1[i] - ublas_vec2[i]) / std::max(fabs(ublas_vec1[i]), fabs(ublas_vec2[i])) > 1e-2)
266     {
267       std::cout << "Error at index " << i << ": Should: " << ublas_vec1[i] << ", Is: " << ublas_vec2[i] << std::endl;
268       ++err_cnt;
269       if (err_cnt > 5)
270         break;
271     }
272   }
273 
274   viennacl::backend::finish();
275   timer.start();
276   for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
277   {
278     vcl_vec1 = viennacl::linalg::prod(vcl_hyb_matrix_1, vcl_vec2);
279   }
280   viennacl::backend::finish();
281   exec_time = timer.get();
282   std::cout << "GPU time: " << exec_time << std::endl;
283   std::cout << "GPU "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
284   std::cout << vcl_vec1[0] << std::endl;
285 
286 
287   std::cout << "------- Matrix-Vector product with sliced_ell_matrix ----------" << std::endl;
288   vcl_vec1 = viennacl::linalg::prod(vcl_sliced_ell_matrix_1, vcl_vec2); //startup calculation
289   viennacl::backend::finish();
290 
291   viennacl::copy(vcl_vec1, ublas_vec2);
292   err_cnt = 0;
293   for (std::size_t i=0; i<ublas_vec1.size(); ++i)
294   {
295     if ( fabs(ublas_vec1[i] - ublas_vec2[i]) / std::max(fabs(ublas_vec1[i]), fabs(ublas_vec2[i])) > 1e-2)
296     {
297       std::cout << "Error at index " << i << ": Should: " << ublas_vec1[i] << ", Is: " << ublas_vec2[i] << std::endl;
298       ++err_cnt;
299       if (err_cnt > 5)
300         break;
301     }
302   }
303 
304   viennacl::backend::finish();
305   timer.start();
306   for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
307   {
308     vcl_vec1 = viennacl::linalg::prod(vcl_sliced_ell_matrix_1, vcl_vec2);
309   }
310   viennacl::backend::finish();
311   exec_time = timer.get();
312   std::cout << "GPU time: " << exec_time << std::endl;
313   std::cout << "GPU "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
314   std::cout << vcl_vec1[0] << std::endl;
315 
316   return EXIT_SUCCESS;
317 }
318 
319 
main()320 int main()
321 {
322   std::cout << std::endl;
323   std::cout << "----------------------------------------------" << std::endl;
324   std::cout << "               Device Info" << std::endl;
325   std::cout << "----------------------------------------------" << std::endl;
326 
327 #ifdef VIENNACL_WITH_OPENCL
328   std::cout << viennacl::ocl::current_device().info() << std::endl;
329 #endif
330   std::cout << std::endl;
331   std::cout << "----------------------------------------------" << std::endl;
332   std::cout << "----------------------------------------------" << std::endl;
333   std::cout << "## Benchmark :: Sparse" << std::endl;
334   std::cout << "----------------------------------------------" << std::endl;
335   std::cout << std::endl;
336   std::cout << "   -------------------------------" << std::endl;
337   std::cout << "   # benchmarking single-precision" << std::endl;
338   std::cout << "   -------------------------------" << std::endl;
339   run_benchmark<float>();
340 #ifdef VIENNACL_WITH_OPENCL
341   if ( viennacl::ocl::current_device().double_support() )
342 #endif
343   {
344     std::cout << std::endl;
345     std::cout << "   -------------------------------" << std::endl;
346     std::cout << "   # benchmarking double-precision" << std::endl;
347     std::cout << "   -------------------------------" << std::endl;
348     run_benchmark<double>();
349   }
350   return 0;
351 }
352 
353