1 /* =========================================================================
2 Copyright (c) 2010-2016, Institute for Microelectronics,
3 Institute for Analysis and Scientific Computing,
4 TU Wien.
5 Portions of this software are copyright by UChicago Argonne, LLC.
6
7 -----------------
8 ViennaCL - The Vienna Computing Library
9 -----------------
10
11 Project Head: Karl Rupp rupp@iue.tuwien.ac.at
12
13 (A list of authors and contributors can be found in the PDF manual)
14
15 License: MIT (X11), see file LICENSE in the base directory
16 ============================================================================= */
17
18
19 /*
20 * Benchmark: Sparse matrix operations, i.e. matrix-vector products (sparse.cpp and sparse.cu are identical, the latter being required for compilation using CUDA nvcc)
21 *
22 */
23
24 //#define VIENNACL_BUILD_INFO
25 #ifndef NDEBUG
26 #define NDEBUG
27 #endif
28
29 #define VIENNACL_WITH_UBLAS 1
30
#include <boost/numeric/ublas/triangular.hpp>
#include <boost/numeric/ublas/vector.hpp>
#include <boost/numeric/ublas/vector_proxy.hpp>
#include <boost/numeric/ublas/matrix_sparse.hpp>
#include <boost/numeric/ublas/operation_sparse.hpp>
#include <boost/numeric/ublas/lu.hpp>


#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/coordinate_matrix.hpp"
#include "viennacl/compressed_matrix.hpp"
#include "viennacl/ell_matrix.hpp"
#include "viennacl/hyb_matrix.hpp"
#include "viennacl/sliced_ell_matrix.hpp"
#include "viennacl/linalg/prod.hpp"
#include "viennacl/linalg/norm_2.hpp"
#include "viennacl/io/matrix_market.hpp"
#include "viennacl/linalg/ilu.hpp"
#include "viennacl/tools/timer.hpp"


#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <vector>
55
56
57 #define BENCHMARK_RUNS 10
58
59
/** @brief Prints the achieved GFLOP rate for a given operation count and execution time.
 *
 * @param num_ops    Number of floating point operations carried out
 * @param exec_time  Wall clock execution time in seconds
 */
inline void printOps(double num_ops, double exec_time)
{
  double const flops_per_gflop = 1000000.0 * 1000.0;  // 1e9: convert FLOPs to GFLOPs
  std::cout << "GFLOPs: " << num_ops / (flops_per_gflop * exec_time) << std::endl;
}
64
65
66 template<typename ScalarType>
run_benchmark()67 int run_benchmark()
68 {
69 viennacl::tools::timer timer;
70 double exec_time;
71
72 ScalarType std_factor1 = ScalarType(3.1415);
73 ScalarType std_factor2 = ScalarType(42.0);
74 viennacl::scalar<ScalarType> vcl_factor1(std_factor1);
75 viennacl::scalar<ScalarType> vcl_factor2(std_factor2);
76
77 boost::numeric::ublas::vector<ScalarType> ublas_vec1;
78 boost::numeric::ublas::vector<ScalarType> ublas_vec2;
79
80 boost::numeric::ublas::compressed_matrix<ScalarType> ublas_matrix;
81 if (!viennacl::io::read_matrix_market_file(ublas_matrix, "../examples/testdata/mat65k.mtx"))
82 {
83 std::cout << "Error reading Matrix file" << std::endl;
84 return 0;
85 }
86 //unsigned int cg_mat_size = cg_mat.size();
87 std::cout << "done reading matrix" << std::endl;
88
89 ublas_vec1 = boost::numeric::ublas::scalar_vector<ScalarType>(ublas_matrix.size1(), ScalarType(1.0));
90 ublas_vec2 = ublas_vec1;
91
92 viennacl::compressed_matrix<ScalarType, 1> vcl_compressed_matrix_1;
93 viennacl::compressed_matrix<ScalarType, 4> vcl_compressed_matrix_4;
94 viennacl::compressed_matrix<ScalarType, 8> vcl_compressed_matrix_8;
95
96 viennacl::coordinate_matrix<ScalarType> vcl_coordinate_matrix_128;
97
98 viennacl::ell_matrix<ScalarType, 1> vcl_ell_matrix_1;
99 viennacl::hyb_matrix<ScalarType, 1> vcl_hyb_matrix_1;
100 viennacl::sliced_ell_matrix<ScalarType> vcl_sliced_ell_matrix_1;
101
102 viennacl::vector<ScalarType> vcl_vec1(ublas_vec1.size());
103 viennacl::vector<ScalarType> vcl_vec2(ublas_vec1.size());
104
105 //cpu to gpu:
106 viennacl::copy(ublas_matrix, vcl_compressed_matrix_1);
107 #ifndef VIENNACL_EXPERIMENTAL_DOUBLE_PRECISION_WITH_STREAM_SDK_ON_GPU
108 viennacl::copy(ublas_matrix, vcl_compressed_matrix_4);
109 viennacl::copy(ublas_matrix, vcl_compressed_matrix_8);
110 #endif
111 viennacl::copy(ublas_matrix, vcl_coordinate_matrix_128);
112 viennacl::copy(ublas_matrix, vcl_ell_matrix_1);
113 viennacl::copy(ublas_matrix, vcl_hyb_matrix_1);
114 viennacl::copy(ublas_matrix, vcl_sliced_ell_matrix_1);
115 viennacl::copy(ublas_vec1, vcl_vec1);
116 viennacl::copy(ublas_vec2, vcl_vec2);
117
118
119 ///////////// Matrix operations /////////////////
120
121 std::cout << "------- Matrix-Vector product on CPU ----------" << std::endl;
122 timer.start();
123 for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
124 {
125 //ublas_vec1 = boost::numeric::ublas::prod(ublas_matrix, ublas_vec2);
126 boost::numeric::ublas::axpy_prod(ublas_matrix, ublas_vec2, ublas_vec1, true);
127 }
128 exec_time = timer.get();
129 std::cout << "CPU time: " << exec_time << std::endl;
130 std::cout << "CPU "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
131 std::cout << ublas_vec1[0] << std::endl;
132
133
134 std::cout << "------- Matrix-Vector product with compressed_matrix ----------" << std::endl;
135
136
137 vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_1, vcl_vec2); //startup calculation
138 vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_4, vcl_vec2); //startup calculation
139 vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_8, vcl_vec2); //startup calculation
140 //std_result = 0.0;
141
142 viennacl::backend::finish();
143 timer.start();
144 for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
145 {
146 vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_1, vcl_vec2);
147 }
148 viennacl::backend::finish();
149 exec_time = timer.get();
150 std::cout << "GPU time align1: " << exec_time << std::endl;
151 std::cout << "GPU align1 "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
152 std::cout << vcl_vec1[0] << std::endl;
153
154 std::cout << "Testing triangular solves: compressed_matrix" << std::endl;
155
156 viennacl::copy(ublas_vec1, vcl_vec1);
157 viennacl::linalg::inplace_solve(trans(vcl_compressed_matrix_1), vcl_vec1, viennacl::linalg::unit_lower_tag());
158 viennacl::copy(ublas_vec1, vcl_vec1);
159 std::cout << "ublas..." << std::endl;
160 timer.start();
161 boost::numeric::ublas::inplace_solve(trans(ublas_matrix), ublas_vec1, boost::numeric::ublas::unit_lower_tag());
162 std::cout << "Time elapsed: " << timer.get() << std::endl;
163 std::cout << "ViennaCL..." << std::endl;
164 viennacl::backend::finish();
165 timer.start();
166 viennacl::linalg::inplace_solve(trans(vcl_compressed_matrix_1), vcl_vec1, viennacl::linalg::unit_lower_tag());
167 viennacl::backend::finish();
168 std::cout << "Time elapsed: " << timer.get() << std::endl;
169
170 ublas_vec1 = boost::numeric::ublas::prod(ublas_matrix, ublas_vec2);
171
172 viennacl::backend::finish();
173 timer.start();
174 for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
175 {
176 vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_4, vcl_vec2);
177 }
178 viennacl::backend::finish();
179 exec_time = timer.get();
180 std::cout << "GPU time align4: " << exec_time << std::endl;
181 std::cout << "GPU align4 "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
182 std::cout << vcl_vec1[0] << std::endl;
183
184 viennacl::backend::finish();
185 timer.start();
186 for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
187 {
188 vcl_vec1 = viennacl::linalg::prod(vcl_compressed_matrix_8, vcl_vec2);
189 }
190 viennacl::backend::finish();
191 exec_time = timer.get();
192 std::cout << "GPU time align8: " << exec_time << std::endl;
193 std::cout << "GPU align8 "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
194 std::cout << vcl_vec1[0] << std::endl;
195
196
197 std::cout << "------- Matrix-Vector product with coordinate_matrix ----------" << std::endl;
198 vcl_vec1 = viennacl::linalg::prod(vcl_coordinate_matrix_128, vcl_vec2); //startup calculation
199 viennacl::backend::finish();
200
201 viennacl::copy(vcl_vec1, ublas_vec2);
202 long err_cnt = 0;
203 for (std::size_t i=0; i<ublas_vec1.size(); ++i)
204 {
205 if ( fabs(ublas_vec1[i] - ublas_vec2[i]) / std::max(fabs(ublas_vec1[i]), fabs(ublas_vec2[i])) > 1e-2)
206 {
207 std::cout << "Error at index " << i << ": Should: " << ublas_vec1[i] << ", Is: " << ublas_vec2[i] << std::endl;
208 ++err_cnt;
209 if (err_cnt > 5)
210 break;
211 }
212 }
213
214 viennacl::backend::finish();
215 timer.start();
216 for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
217 {
218 vcl_vec1 = viennacl::linalg::prod(vcl_coordinate_matrix_128, vcl_vec2);
219 }
220 viennacl::backend::finish();
221 exec_time = timer.get();
222 std::cout << "GPU time: " << exec_time << std::endl;
223 std::cout << "GPU "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
224 std::cout << vcl_vec1[0] << std::endl;
225
226
227 std::cout << "------- Matrix-Vector product with ell_matrix ----------" << std::endl;
228 vcl_vec1 = viennacl::linalg::prod(vcl_ell_matrix_1, vcl_vec2); //startup calculation
229 viennacl::backend::finish();
230
231 viennacl::copy(vcl_vec1, ublas_vec2);
232 err_cnt = 0;
233 for (std::size_t i=0; i<ublas_vec1.size(); ++i)
234 {
235 if ( fabs(ublas_vec1[i] - ublas_vec2[i]) / std::max(fabs(ublas_vec1[i]), fabs(ublas_vec2[i])) > 1e-2)
236 {
237 std::cout << "Error at index " << i << ": Should: " << ublas_vec1[i] << ", Is: " << ublas_vec2[i] << std::endl;
238 ++err_cnt;
239 if (err_cnt > 5)
240 break;
241 }
242 }
243
244 viennacl::backend::finish();
245 timer.start();
246 for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
247 {
248 vcl_vec1 = viennacl::linalg::prod(vcl_ell_matrix_1, vcl_vec2);
249 }
250 viennacl::backend::finish();
251 exec_time = timer.get();
252 std::cout << "GPU time: " << exec_time << std::endl;
253 std::cout << "GPU "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
254 std::cout << vcl_vec1[0] << std::endl;
255
256
257 std::cout << "------- Matrix-Vector product with hyb_matrix ----------" << std::endl;
258 vcl_vec1 = viennacl::linalg::prod(vcl_hyb_matrix_1, vcl_vec2); //startup calculation
259 viennacl::backend::finish();
260
261 viennacl::copy(vcl_vec1, ublas_vec2);
262 err_cnt = 0;
263 for (std::size_t i=0; i<ublas_vec1.size(); ++i)
264 {
265 if ( fabs(ublas_vec1[i] - ublas_vec2[i]) / std::max(fabs(ublas_vec1[i]), fabs(ublas_vec2[i])) > 1e-2)
266 {
267 std::cout << "Error at index " << i << ": Should: " << ublas_vec1[i] << ", Is: " << ublas_vec2[i] << std::endl;
268 ++err_cnt;
269 if (err_cnt > 5)
270 break;
271 }
272 }
273
274 viennacl::backend::finish();
275 timer.start();
276 for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
277 {
278 vcl_vec1 = viennacl::linalg::prod(vcl_hyb_matrix_1, vcl_vec2);
279 }
280 viennacl::backend::finish();
281 exec_time = timer.get();
282 std::cout << "GPU time: " << exec_time << std::endl;
283 std::cout << "GPU "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
284 std::cout << vcl_vec1[0] << std::endl;
285
286
287 std::cout << "------- Matrix-Vector product with sliced_ell_matrix ----------" << std::endl;
288 vcl_vec1 = viennacl::linalg::prod(vcl_sliced_ell_matrix_1, vcl_vec2); //startup calculation
289 viennacl::backend::finish();
290
291 viennacl::copy(vcl_vec1, ublas_vec2);
292 err_cnt = 0;
293 for (std::size_t i=0; i<ublas_vec1.size(); ++i)
294 {
295 if ( fabs(ublas_vec1[i] - ublas_vec2[i]) / std::max(fabs(ublas_vec1[i]), fabs(ublas_vec2[i])) > 1e-2)
296 {
297 std::cout << "Error at index " << i << ": Should: " << ublas_vec1[i] << ", Is: " << ublas_vec2[i] << std::endl;
298 ++err_cnt;
299 if (err_cnt > 5)
300 break;
301 }
302 }
303
304 viennacl::backend::finish();
305 timer.start();
306 for (int runs=0; runs<BENCHMARK_RUNS; ++runs)
307 {
308 vcl_vec1 = viennacl::linalg::prod(vcl_sliced_ell_matrix_1, vcl_vec2);
309 }
310 viennacl::backend::finish();
311 exec_time = timer.get();
312 std::cout << "GPU time: " << exec_time << std::endl;
313 std::cout << "GPU "; printOps(2.0 * static_cast<double>(ublas_matrix.nnz()), static_cast<double>(exec_time) / static_cast<double>(BENCHMARK_RUNS));
314 std::cout << vcl_vec1[0] << std::endl;
315
316 return EXIT_SUCCESS;
317 }
318
319
main()320 int main()
321 {
322 std::cout << std::endl;
323 std::cout << "----------------------------------------------" << std::endl;
324 std::cout << " Device Info" << std::endl;
325 std::cout << "----------------------------------------------" << std::endl;
326
327 #ifdef VIENNACL_WITH_OPENCL
328 std::cout << viennacl::ocl::current_device().info() << std::endl;
329 #endif
330 std::cout << std::endl;
331 std::cout << "----------------------------------------------" << std::endl;
332 std::cout << "----------------------------------------------" << std::endl;
333 std::cout << "## Benchmark :: Sparse" << std::endl;
334 std::cout << "----------------------------------------------" << std::endl;
335 std::cout << std::endl;
336 std::cout << " -------------------------------" << std::endl;
337 std::cout << " # benchmarking single-precision" << std::endl;
338 std::cout << " -------------------------------" << std::endl;
339 run_benchmark<float>();
340 #ifdef VIENNACL_WITH_OPENCL
341 if ( viennacl::ocl::current_device().double_support() )
342 #endif
343 {
344 std::cout << std::endl;
345 std::cout << " -------------------------------" << std::endl;
346 std::cout << " # benchmarking double-precision" << std::endl;
347 std::cout << " -------------------------------" << std::endl;
348 run_benchmark<double>();
349 }
350 return 0;
351 }
352
353