1 #ifndef VIENNACL_LINALG_DETAIL_SPAI_SPAI_HPP
2 #define VIENNACL_LINALG_DETAIL_SPAI_SPAI_HPP
3
4 /* =========================================================================
5 Copyright (c) 2010-2016, Institute for Microelectronics,
6 Institute for Analysis and Scientific Computing,
7 TU Wien.
8 Portions of this software are copyright by UChicago Argonne, LLC.
9
10 -----------------
11 ViennaCL - The Vienna Computing Library
12 -----------------
13
14 Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15
16 (A list of authors and contributors can be found in the manual)
17
18 License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20
21 /** @file viennacl/linalg/detail/spai/spai.hpp
22 @brief Main implementation of SPAI (not FSPAI). Experimental.
23 */
24
25 #include <utility>
26 #include <iostream>
27 #include <fstream>
28 #include <string>
29 #include <algorithm>
30 #include <vector>
31 #include <math.h>
32 #include <map>
33
34 //local includes
35 #include "viennacl/linalg/detail/spai/spai_tag.hpp"
36 #include "viennacl/linalg/qr.hpp"
37 #include "viennacl/linalg/detail/spai/spai-dynamic.hpp"
38 #include "viennacl/linalg/detail/spai/spai-static.hpp"
39 #include "viennacl/linalg/detail/spai/sparse_vector.hpp"
40 #include "viennacl/linalg/detail/spai/block_matrix.hpp"
41 #include "viennacl/linalg/detail/spai/block_vector.hpp"
42
43 //boost includes
44 #include "boost/numeric/ublas/vector.hpp"
45 #include "boost/numeric/ublas/matrix.hpp"
46 #include "boost/numeric/ublas/matrix_proxy.hpp"
47 #include "boost/numeric/ublas/vector_proxy.hpp"
48 #include "boost/numeric/ublas/storage.hpp"
49 #include "boost/numeric/ublas/io.hpp"
50 #include "boost/numeric/ublas/lu.hpp"
51 #include "boost/numeric/ublas/triangular.hpp"
52 #include "boost/numeric/ublas/matrix_expression.hpp"
53
54 // ViennaCL includes
55 #include "viennacl/linalg/prod.hpp"
56 #include "viennacl/matrix.hpp"
57 #include "viennacl/compressed_matrix.hpp"
58 #include "viennacl/linalg/sparse_matrix_operations.hpp"
59 #include "viennacl/linalg/matrix_operations.hpp"
60 #include "viennacl/scalar.hpp"
61 #include "viennacl/linalg/inner_prod.hpp"
62 #include "viennacl/linalg/ilu.hpp"
63 #include "viennacl/ocl/backend.hpp"
64 #include "viennacl/linalg/opencl/kernels/spai.hpp"
65
66
67
68 #define VIENNACL_SPAI_K_b 20
69
70 namespace viennacl
71 {
72 namespace linalg
73 {
74 namespace detail
75 {
76 namespace spai
77 {
78
79 //debug function for print
80 template<typename SparseVectorT>
print_sparse_vector(SparseVectorT const & v)81 void print_sparse_vector(SparseVectorT const & v)
82 {
83 for (typename SparseVectorT::const_iterator vec_it = v.begin(); vec_it!= v.end(); ++vec_it)
84 std::cout << "[ " << vec_it->first << " ]:" << vec_it->second << std::endl;
85 }
86
87 template<typename DenseMatrixT>
print_matrix(DenseMatrixT & m)88 void print_matrix(DenseMatrixT & m)
89 {
90 for (int i = 0; i < m.size2(); ++i)
91 {
92 for (int j = 0; j < m.size1(); ++j)
93 std::cout<<m(j, i)<<" ";
94 std::cout<<std::endl;
95 }
96 }
97
98 /** @brief Add two sparse vectors res_v = b*v
99 *
100 * @param v initial sparse vector
101 * @param b scalar
102 * @param res_v output vector
103 */
104 template<typename SparseVectorT, typename NumericT>
add_sparse_vectors(SparseVectorT const & v,NumericT b,SparseVectorT & res_v)105 void add_sparse_vectors(SparseVectorT const & v, NumericT b, SparseVectorT & res_v)
106 {
107 for (typename SparseVectorT::const_iterator v_it = v.begin(); v_it != v.end(); ++v_it)
108 res_v[v_it->first] += b*v_it->second;
109 }
110
111 //sparse-matrix - vector product
112 /** @brief Computation of residual res = A*v - e
113 *
114 * @param A_v_c column major vectorized input sparse matrix
115 * @param v sparse vector, in this case new column of preconditioner matrix
116 * @param ind index for current column
117 * @param res residual
118 */
119 template<typename SparseVectorT, typename NumericT>
compute_spai_residual(std::vector<SparseVectorT> const & A_v_c,SparseVectorT const & v,unsigned int ind,SparseVectorT & res)120 void compute_spai_residual(std::vector<SparseVectorT> const & A_v_c,
121 SparseVectorT const & v,
122 unsigned int ind,
123 SparseVectorT & res)
124 {
125 for (typename SparseVectorT::const_iterator v_it = v.begin(); v_it != v.end(); ++v_it)
126 add_sparse_vectors(A_v_c[v_it->first], v_it->second, res);
127
128 res[ind] -= NumericT(1);
129 }
130
131 /** @brief Setting up index set of columns and rows for certain column
132 *
133 * @param A_v_c column major vectorized initial sparse matrix
134 * @param v current column of preconditioner matrix
135 * @param J set of column indices
136 * @param I set of row indices
137 */
138 template<typename SparseVectorT>
build_index_set(std::vector<SparseVectorT> const & A_v_c,SparseVectorT const & v,std::vector<unsigned int> & J,std::vector<unsigned int> & I)139 void build_index_set(std::vector<SparseVectorT> const & A_v_c,
140 SparseVectorT const & v,
141 std::vector<unsigned int> & J,
142 std::vector<unsigned int> & I)
143 {
144 buildColumnIndexSet(v, J);
145 projectRows(A_v_c, J, I);
146 }
147
148 /** @brief Initializes a dense matrix from a sparse one
149 *
150 * @param A_in Oiginal sparse matrix
151 * @param J Set of column indices
152 * @param I Set of row indices
153 * @param A_out dense matrix output
154 */
155 template<typename SparseMatrixT, typename DenseMatrixT>
initProjectSubMatrix(SparseMatrixT const & A_in,std::vector<unsigned int> const & J,std::vector<unsigned int> & I,DenseMatrixT & A_out)156 void initProjectSubMatrix(SparseMatrixT const & A_in,
157 std::vector<unsigned int> const & J,
158 std::vector<unsigned int> & I,
159 DenseMatrixT & A_out)
160 {
161 A_out.resize(I.size(), J.size(), false);
162 for (vcl_size_t j = 0; j < J.size(); ++j)
163 for (vcl_size_t i = 0; i < I.size(); ++i)
164 A_out(i,j) = A_in(I[i],J[j]);
165 }
166
167
168 /************************************************** CPU BLOCK SET UP ***************************************/
169
170 /** @brief Setting up blocks and QR factorizing them on CPU
171 *
172 * @param A initial sparse matrix
173 * @param A_v_c column major vectorized initial sparse matrix
174 * @param M_v initialized preconditioner
175 * @param g_I container of row indices
176 * @param g_J container of column indices
177 * @param g_A_I_J container of dense matrices -> R matrices after QR factorization
178 * @param g_b_v container of vectors beta, necessary for Q recovery
179 */
180 template<typename SparseMatrixT, typename DenseMatrixT, typename SparseVectorT, typename VectorT>
block_set_up(SparseMatrixT const & A,std::vector<SparseVectorT> const & A_v_c,std::vector<SparseVectorT> const & M_v,std::vector<std::vector<unsigned int>> & g_I,std::vector<std::vector<unsigned int>> & g_J,std::vector<DenseMatrixT> & g_A_I_J,std::vector<VectorT> & g_b_v)181 void block_set_up(SparseMatrixT const & A,
182 std::vector<SparseVectorT> const & A_v_c,
183 std::vector<SparseVectorT> const & M_v,
184 std::vector<std::vector<unsigned int> >& g_I,
185 std::vector<std::vector<unsigned int> >& g_J,
186 std::vector<DenseMatrixT>& g_A_I_J,
187 std::vector<VectorT>& g_b_v)
188 {
189 #ifdef VIENNACL_WITH_OPENMP
190 #pragma omp parallel for
191 #endif
192 for (long i2 = 0; i2 < static_cast<long>(M_v.size()); ++i2)
193 {
194 vcl_size_t i = static_cast<vcl_size_t>(i2);
195 build_index_set(A_v_c, M_v[i], g_J[i], g_I[i]);
196 initProjectSubMatrix(A, g_J[i], g_I[i], g_A_I_J[i]);
197 //print_matrix(g_A_I_J[i]);
198 single_qr(g_A_I_J[i], g_b_v[i]);
199 //print_matrix(g_A_I_J[i]);
200 }
201 }
202
203 /** @brief Setting up index set of columns and rows for all columns
204 *
205 * @param A_v_c column major vectorized initial sparse matrix
206 * @param M_v initialized preconditioner
207 * @param g_J container of column indices
208 * @param g_I container of row indices
209 */
210 template<typename SparseVectorT>
index_set_up(std::vector<SparseVectorT> const & A_v_c,std::vector<SparseVectorT> const & M_v,std::vector<std::vector<unsigned int>> & g_J,std::vector<std::vector<unsigned int>> & g_I)211 void index_set_up(std::vector<SparseVectorT> const & A_v_c,
212 std::vector<SparseVectorT> const & M_v,
213 std::vector<std::vector<unsigned int> > & g_J,
214 std::vector<std::vector<unsigned int> > & g_I)
215 {
216 #ifdef VIENNACL_WITH_OPENMP
217 #pragma omp parallel for
218 #endif
219 for (long i2 = 0; i2 < static_cast<long>(M_v.size()); ++i2)
220 {
221 vcl_size_t i = static_cast<vcl_size_t>(i2);
222 build_index_set(A_v_c, M_v[i], g_J[i], g_I[i]);
223 }
224 }
225
226 /************************************************** GPU BLOCK SET UP ***************************************/
227
228 /** @brief Setting up blocks and QR factorizing them on GPU
229 *
230 * @param A initial sparse matrix
231 * @param A_v_c column major vectorized initial sparse matrix
232 * @param M_v initialized preconditioner
233 * @param g_is_update container that indicates which blocks are active
234 * @param g_I container of row indices
235 * @param g_J container of column indices
236 * @param g_A_I_J container of dense matrices -> R matrices after QR factorization
237 * @param g_bv container of vectors beta, necessary for Q recovery
238 */
239 template<typename NumericT, unsigned int AlignmentV, typename SparseVectorT>
block_set_up(viennacl::compressed_matrix<NumericT,AlignmentV> const & A,std::vector<SparseVectorT> const & A_v_c,std::vector<SparseVectorT> const & M_v,std::vector<cl_uint> g_is_update,std::vector<std::vector<unsigned int>> & g_I,std::vector<std::vector<unsigned int>> & g_J,block_matrix & g_A_I_J,block_vector & g_bv)240 void block_set_up(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
241 std::vector<SparseVectorT> const & A_v_c,
242 std::vector<SparseVectorT> const & M_v,
243 std::vector<cl_uint> g_is_update,
244 std::vector<std::vector<unsigned int> > & g_I,
245 std::vector<std::vector<unsigned int> > & g_J,
246 block_matrix & g_A_I_J,
247 block_vector & g_bv)
248 {
249 viennacl::context ctx = viennacl::traits::context(A);
250 bool is_empty_block;
251
252 //build index set
253 index_set_up(A_v_c, M_v, g_J, g_I);
254 block_assembly(A, g_J, g_I, g_A_I_J, g_is_update, is_empty_block);
255 block_qr<NumericT>(g_I, g_J, g_A_I_J, g_bv, g_is_update, ctx);
256 }
257
258
259 /***************************************************************************************************/
260 /******************************** SOLVING LS PROBLEMS ON GPU ***************************************/
261 /***************************************************************************************************/
262
263 /** @brief Elicitation of sparse vector m for particular column from m_in - contigious vector for all columns
264 *
265 * @param m_in contigious sparse vector for all columns
266 * @param start_m_ind start index of particular vector
267 * @param J column index set
268 * @param m sparse vector for particular column
269 */
270 template<typename NumericT, typename SparseVectorT>
custom_fan_out(std::vector<NumericT> const & m_in,unsigned int start_m_ind,std::vector<unsigned int> const & J,SparseVectorT & m)271 void custom_fan_out(std::vector<NumericT> const & m_in,
272 unsigned int start_m_ind,
273 std::vector<unsigned int> const & J,
274 SparseVectorT & m)
275 {
276 unsigned int cnt = 0;
277 for (vcl_size_t i = 0; i < J.size(); ++i)
278 m[J[i]] = m_in[start_m_ind + cnt++];
279 }
280
281
282
283 //GPU based least square problem
284 /** @brief Solution of Least square problem on GPU
285 *
286 * @param A_v_c column-major vectorized initial sparse matrix
287 * @param M_v column-major vectorized sparse preconditioner matrix
288 * @param g_I container of row set indices
289 * @param g_J container of column set indices
290 * @param g_A_I_J_vcl contigious matrix that consists of blocks A(I_k, J_k)
291 * @param g_bv_vcl contigious vector that consists of betas, necessary for Q recovery
292 * @param g_res container of residuals
293 * @param g_is_update container with indicators which blocks are active
294 * @param tag spai tag
295 * @param ctx Optional context in which the auxiliary data is created (one out of multiple OpenCL contexts, CUDA, host)
296 */
297 template<typename SparseVectorT, typename NumericT>
least_square_solve(std::vector<SparseVectorT> & A_v_c,std::vector<SparseVectorT> & M_v,std::vector<std::vector<unsigned int>> & g_I,std::vector<std::vector<unsigned int>> & g_J,block_matrix & g_A_I_J_vcl,block_vector & g_bv_vcl,std::vector<SparseVectorT> & g_res,std::vector<cl_uint> & g_is_update,const spai_tag & tag,viennacl::context ctx)298 void least_square_solve(std::vector<SparseVectorT> & A_v_c,
299 std::vector<SparseVectorT> & M_v,
300 std::vector<std::vector<unsigned int> >& g_I,
301 std::vector<std::vector<unsigned int> > & g_J,
302 block_matrix & g_A_I_J_vcl,
303 block_vector & g_bv_vcl,
304 std::vector<SparseVectorT> & g_res,
305 std::vector<cl_uint> & g_is_update,
306 const spai_tag & tag,
307 viennacl::context ctx)
308 {
309 viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());
310 unsigned int y_sz, m_sz;
311 std::vector<cl_uint> y_inds(M_v.size() + 1, static_cast<cl_uint>(0));
312 std::vector<cl_uint> m_inds(M_v.size() + 1, static_cast<cl_uint>(0));
313
314 get_size(g_I, y_sz);
315 init_start_inds(g_I, y_inds);
316 init_start_inds(g_J, m_inds);
317
318 //create y_v
319 std::vector<NumericT> y_v(y_sz, NumericT(0));
320 for (vcl_size_t i = 0; i < M_v.size(); ++i)
321 {
322 for (vcl_size_t j = 0; j < g_I[i].size(); ++j)
323 {
324 if (g_I[i][j] == i)
325 y_v[y_inds[i] + j] = NumericT(1.0);
326 }
327 }
328 //compute m_v
329 get_size(g_J, m_sz);
330 std::vector<NumericT> m_v(m_sz, static_cast<cl_uint>(0));
331
332 block_vector y_v_vcl;
333 block_vector m_v_vcl;
334 //prepearing memory for least square problem on GPU
335 y_v_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
336 static_cast<unsigned int>(sizeof(NumericT)*y_v.size()),
337 &(y_v[0]));
338 m_v_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
339 static_cast<unsigned int>(sizeof(NumericT)*m_v.size()),
340 &(m_v[0]));
341 y_v_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
342 static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
343 &(y_inds[0]));
344 viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
345 static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
346 &(g_is_update[0]));
347 viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx);
348 viennacl::ocl::kernel & ls_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_least_squares");
349 ls_kernel.local_work_size(0, 1);
350 ls_kernel.global_work_size(0, 256);
351 viennacl::ocl::enqueue(ls_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_bv_vcl.handle(), g_bv_vcl.handle1(), m_v_vcl.handle(),
352 y_v_vcl.handle(), y_v_vcl.handle1(),
353 g_A_I_J_vcl.handle1(), g_is_update_vcl,
354 //viennacl::ocl::local_mem(static_cast<unsigned int>(sizeof(ScalarType)*(local_r_n*local_c_n))),
355 static_cast<unsigned int>(M_v.size())));
356 //copy vector m_v back from GPU to CPU
357 cl_int vcl_err = clEnqueueReadBuffer(opencl_ctx.get_queue().handle().get(),
358 m_v_vcl.handle().get(), CL_TRUE, 0,
359 sizeof(NumericT)*(m_v.size()),
360 &(m_v[0]), 0, NULL, NULL);
361 VIENNACL_ERR_CHECK(vcl_err);
362
363 //fan out vector in parallel
364 //#pragma omp parallel for
365 for (long i = 0; i < static_cast<long>(M_v.size()); ++i)
366 {
367 if (g_is_update[static_cast<vcl_size_t>(i)])
368 {
369 //faned out onto sparse vector
370 custom_fan_out(m_v, m_inds[static_cast<vcl_size_t>(i)], g_J[static_cast<vcl_size_t>(i)], M_v[static_cast<vcl_size_t>(i)]);
371 g_res[static_cast<vcl_size_t>(i)].clear();
372 compute_spai_residual<SparseVectorT, NumericT>(A_v_c, M_v[static_cast<vcl_size_t>(i)], static_cast<unsigned int>(i), g_res[static_cast<vcl_size_t>(i)]);
373 NumericT res_norm = 0;
374 //compute norm of res - just to make sure that this implementatino works correct
375 sparse_norm_2(g_res[static_cast<vcl_size_t>(i)], res_norm);
376 //std::cout<<"Residual norm of column #: "<<i<<std::endl;
377 //std::cout<<res_norm<<std::endl;
378 //std::cout<<"************************"<<std::endl;
379 g_is_update[static_cast<vcl_size_t>(i)] = (res_norm > tag.getResidualNormThreshold())&& (!tag.getIsStatic())?(1):(0);
380 }
381 }
382 }
383
384 //CPU based least square problems
385 /** @brief Solution of Least square problem on CPU
386 *
387 * @param A_v_c column-major vectorized initial sparse matrix
388 * @param g_R blocks for least square solution
389 * @param g_b_v vectors beta, necessary for Q recovery
390 * @param g_I container of row index set for all columns of matrix M
391 * @param g_J container of column index set for all columns of matrix M
392 * @param g_res container of residuals
393 * @param g_is_update container with indicators which blocks are active
394 * @param M_v column-major vectorized sparse matrix, final preconditioner
395 * @param tag spai tag
396 */
397 template<typename SparseVectorT, typename DenseMatrixT, typename VectorT>
least_square_solve(std::vector<SparseVectorT> const & A_v_c,std::vector<DenseMatrixT> & g_R,std::vector<VectorT> & g_b_v,std::vector<std::vector<unsigned int>> & g_I,std::vector<std::vector<unsigned int>> & g_J,std::vector<SparseVectorT> & g_res,std::vector<bool> & g_is_update,std::vector<SparseVectorT> & M_v,spai_tag const & tag)398 void least_square_solve(std::vector<SparseVectorT> const & A_v_c,
399 std::vector<DenseMatrixT> & g_R,
400 std::vector<VectorT> & g_b_v,
401 std::vector<std::vector<unsigned int> > & g_I,
402 std::vector<std::vector<unsigned int> > & g_J,
403 std::vector<SparseVectorT> & g_res,
404 std::vector<bool> & g_is_update,
405 std::vector<SparseVectorT> & M_v,
406 spai_tag const & tag)
407 {
408 typedef typename DenseMatrixT::value_type NumericType;
409
410 #ifdef VIENNACL_WITH_OPENMP
411 #pragma omp parallel for
412 #endif
413 for (long i2 = 0; i2 < static_cast<long>(M_v.size()); ++i2)
414 {
415 vcl_size_t i = static_cast<vcl_size_t>(i2);
416 if (g_is_update[i])
417 {
418 VectorT y = boost::numeric::ublas::zero_vector<NumericType>(g_I[i].size());
419
420 projectI<VectorT, NumericType>(g_I[i], y, static_cast<unsigned int>(tag.getBegInd() + long(i)));
421 apply_q_trans_vec(g_R[i], g_b_v[i], y);
422
423 VectorT m_new = boost::numeric::ublas::zero_vector<NumericType>(g_R[i].size2());
424 backwardSolve(g_R[i], y, m_new);
425 fanOutVector(m_new, g_J[i], M_v[i]);
426 g_res[i].clear();
427
428 compute_spai_residual<SparseVectorT, NumericType>(A_v_c, M_v[i], static_cast<unsigned int>(tag.getBegInd() + long(i)), g_res[i]);
429
430 NumericType res_norm = 0;
431 sparse_norm_2(g_res[i], res_norm);
432 // std::cout<<"Residual norm of column #: "<<i<<std::endl;
433 // std::cout<<res_norm<<std::endl;
434 // std::cout<<"************************"<<std::endl;
435 g_is_update[i] = (res_norm > tag.getResidualNormThreshold())&& (!tag.getIsStatic());
436 }
437 }
438 }
439
440 //************************************ UPDATE CHECK ***************************************************//
441
442 template<typename VectorType>
is_all_update(VectorType & parallel_is_update)443 bool is_all_update(VectorType& parallel_is_update)
444 {
445 for (unsigned int i = 0; i < parallel_is_update.size(); ++i)
446 {
447 if (parallel_is_update[i])
448 return true;
449 }
450 return false;
451 }
452
453 //********************************** MATRIX VECTORIZATION ***********************************************//
454
455 //Matrix vectorization, column based approach
456 /** @brief Solution of Least square problem on CPU
457 *
458 * @param M_in input sparse, boost::numeric::ublas::compressed_matrix
459 * @param M_v array of sparse vectors
460 */
461 template<typename SparseMatrixT, typename SparseVectorT>
vectorize_column_matrix(SparseMatrixT const & M_in,std::vector<SparseVectorT> & M_v)462 void vectorize_column_matrix(SparseMatrixT const & M_in,
463 std::vector<SparseVectorT> & M_v)
464 {
465 for (typename SparseMatrixT::const_iterator1 row_it = M_in.begin1(); row_it!= M_in.end1(); ++row_it)
466 for (typename SparseMatrixT::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
467 M_v[static_cast<unsigned int>(col_it.index2())][static_cast<unsigned int>(col_it.index1())] = *col_it;
468 }
469
470 //Matrix vectorization row based approach
471 template<typename SparseMatrixT, typename SparseVectorT>
vectorize_row_matrix(SparseMatrixT const & M_in,std::vector<SparseVectorT> & M_v)472 void vectorize_row_matrix(SparseMatrixT const & M_in,
473 std::vector<SparseVectorT> & M_v)
474 {
475 for (typename SparseMatrixT::const_iterator1 row_it = M_in.begin1(); row_it!= M_in.end1(); ++row_it)
476 for (typename SparseMatrixT::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
477 M_v[static_cast<unsigned int>(col_it.index1())][static_cast<unsigned int>(col_it.index2())] = *col_it;
478 }
479
480 //************************************* BLOCK ASSEMBLY CODE *********************************************//
481
482
483 template<typename SizeT>
write_set_to_array(std::vector<std::vector<SizeT>> const & ind_set,std::vector<cl_uint> & a)484 void write_set_to_array(std::vector<std::vector<SizeT> > const & ind_set,
485 std::vector<cl_uint> & a)
486 {
487 vcl_size_t cnt = 0;
488
489 for (vcl_size_t i = 0; i < ind_set.size(); ++i)
490 for (vcl_size_t j = 0; j < ind_set[i].size(); ++j)
491 a[cnt++] = static_cast<cl_uint>(ind_set[i][j]);
492 }
493
494
495
496 //assembling blocks on GPU
497 /** @brief Assembly of blocks on GPU by a gived set of row indices: g_I and column indices: g_J
498 *
499 * @param A intial sparse matrix
500 * @param g_J container of column index set
501 * @param g_I container of row index set
502 * @param g_A_I_J_vcl contigious blocks A(I, J) using GPU memory
503 * @param g_is_update container with indicators which blocks are active
504 * @param is_empty_block parameter that indicates if no block were assembled
505 */
506 template<typename NumericT, unsigned int AlignmentV>
block_assembly(viennacl::compressed_matrix<NumericT,AlignmentV> const & A,std::vector<std::vector<unsigned int>> const & g_J,std::vector<std::vector<unsigned int>> const & g_I,block_matrix & g_A_I_J_vcl,std::vector<cl_uint> & g_is_update,bool & is_empty_block)507 void block_assembly(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
508 std::vector<std::vector<unsigned int> > const & g_J,
509 std::vector<std::vector<unsigned int> > const & g_I,
510 block_matrix & g_A_I_J_vcl,
511 std::vector<cl_uint> & g_is_update,
512 bool & is_empty_block)
513 {
514 //computing start indices for index sets and start indices for block matrices
515 unsigned int sz_I, sz_J, sz_blocks;
516 std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
517 std::vector<cl_uint> i_ind(g_I.size() + 1, static_cast<cl_uint>(0));
518 std::vector<cl_uint> j_ind(g_I.size() + 1, static_cast<cl_uint>(0));
519 std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
520 //
521 init_start_inds(g_J, j_ind);
522 init_start_inds(g_I, i_ind);
523 //
524 get_size(g_J, sz_J);
525 get_size(g_I, sz_I);
526 std::vector<cl_uint> I_set(sz_I, static_cast<cl_uint>(0));
527 //
528 std::vector<cl_uint> J_set(sz_J, static_cast<cl_uint>(0));
529
530 // computing size for blocks
531 // writing set to arrays
532 write_set_to_array(g_I, I_set);
533 write_set_to_array(g_J, J_set);
534
535 // if block for assembly does exist
536 if (I_set.size() > 0 && J_set.size() > 0)
537 {
538 viennacl::context ctx = viennacl::traits::context(A);
539 viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());
540 compute_blocks_size(g_I, g_J, sz_blocks, blocks_ind, matrix_dims);
541 std::vector<NumericT> con_A_I_J(sz_blocks, NumericT(0));
542
543 block_vector set_I_vcl, set_J_vcl;
544 //init memory on GPU
545 //contigious g_A_I_J
546 g_A_I_J_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
547 static_cast<unsigned int>(sizeof(NumericT)*(sz_blocks)),
548 &(con_A_I_J[0]));
549 g_A_I_J_vcl.handle().context(opencl_ctx);
550
551 //matrix_dimensions
552 g_A_I_J_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
553 static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<cl_uint>(g_I.size())),
554 &(matrix_dims[0]));
555 g_A_I_J_vcl.handle1().context(opencl_ctx);
556
557 //start_block inds
558 g_A_I_J_vcl.handle2() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
559 static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
560 &(blocks_ind[0]));
561 g_A_I_J_vcl.handle2().context(opencl_ctx);
562
563 //set_I
564 set_I_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
565 static_cast<unsigned int>(sizeof(cl_uint)*sz_I),
566 &(I_set[0]));
567 set_I_vcl.handle().context(opencl_ctx);
568
569 //set_J
570 set_J_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
571 static_cast<unsigned int>(sizeof(cl_uint)*sz_J),
572 &(J_set[0]));
573 set_J_vcl.handle().context(opencl_ctx);
574
575 //i_ind
576 set_I_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
577 static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
578 &(i_ind[0]));
579 set_I_vcl.handle().context(opencl_ctx);
580
581 //j_ind
582 set_J_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
583 static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
584 &(j_ind[0]));
585 set_J_vcl.handle().context(opencl_ctx);
586
587 viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
588 static_cast<unsigned int>(sizeof(cl_uint)*g_is_update.size()),
589 &(g_is_update[0]));
590
591 viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx);
592 viennacl::ocl::kernel& assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "assemble_blocks");
593 assembly_kernel.local_work_size(0, 1);
594 assembly_kernel.global_work_size(0, 256);
595 viennacl::ocl::enqueue(assembly_kernel(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(),
596 set_I_vcl.handle(), set_J_vcl.handle(), set_I_vcl.handle1(),
597 set_J_vcl.handle1(),
598 g_A_I_J_vcl.handle2(), g_A_I_J_vcl.handle1(), g_A_I_J_vcl.handle(),
599 g_is_update_vcl,
600 static_cast<unsigned int>(g_I.size())));
601 is_empty_block = false;
602 }
603 else
604 is_empty_block = true;
605 }
606
607 /************************************************************************************************************************/
608
609 /** @brief Insertion of vectorized matrix column into original sparse matrix
610 *
611 * @param M_v column-major vectorized matrix
612 * @param M original sparse matrix
613 * @param is_right indicates if matrix should be transposed in the output
614 */
615 template<typename SparseMatrixT, typename SparseVectorT>
insert_sparse_columns(std::vector<SparseVectorT> const & M_v,SparseMatrixT & M,bool is_right)616 void insert_sparse_columns(std::vector<SparseVectorT> const & M_v,
617 SparseMatrixT& M,
618 bool is_right)
619 {
620 if (is_right)
621 {
622 for (unsigned int i = 0; i < M_v.size(); ++i)
623 for (typename SparseVectorT::const_iterator vec_it = M_v[i].begin(); vec_it!=M_v[i].end(); ++vec_it)
624 M(vec_it->first, i) = vec_it->second;
625 }
626 else //transposed fill of M
627 {
628 for (unsigned int i = 0; i < M_v.size(); ++i)
629 for (typename SparseVectorT::const_iterator vec_it = M_v[i].begin(); vec_it!=M_v[i].end(); ++vec_it)
630 M(i, vec_it->first) = vec_it->second;
631 }
632 }
633
634 /** @brief Transposition of sparse matrix
635 *
636 * @param A_in intial sparse matrix
637 * @param A output transposed matrix
638 */
639 template<typename MatrixT>
sparse_transpose(MatrixT const & A_in,MatrixT & A)640 void sparse_transpose(MatrixT const & A_in, MatrixT & A)
641 {
642 typedef typename MatrixT::value_type NumericType;
643
644 std::vector<std::map<vcl_size_t, NumericType> > temp_A(A_in.size2());
645 A.resize(A_in.size2(), A_in.size1(), false);
646
647 for (typename MatrixT::const_iterator1 row_it = A_in.begin1();
648 row_it != A_in.end1();
649 ++row_it)
650 {
651 for (typename MatrixT::const_iterator2 col_it = row_it.begin();
652 col_it != row_it.end();
653 ++col_it)
654 {
655 temp_A[col_it.index2()][col_it.index1()] = *col_it;
656 }
657 }
658
659 for (vcl_size_t i=0; i<temp_A.size(); ++i)
660 {
661 for (typename std::map<vcl_size_t, NumericType>::const_iterator it = temp_A[i].begin();
662 it != temp_A[i].end();
663 ++it)
664 A(i, it->first) = it->second;
665 }
666 }
667
668
669
670
671 // template<typename SparseVectorType>
672 // void custom_copy(std::vector<SparseVectorType> & M_v, std::vector<SparseVectorType> & l_M_v, const unsigned int beg_ind){
673 // for (int i = 0; i < l_M_v.size(); ++i){
674 // l_M_v[i] = M_v[i + beg_ind];
675 // }
676 // }
677
678 //CPU version
679 /** @brief Construction of SPAI preconditioner on CPU
680 *
681 * @param A initial sparse matrix
682 * @param M output preconditioner
683 * @param tag spai tag
684 */
685 template<typename MatrixT>
computeSPAI(MatrixT const & A,MatrixT & M,spai_tag & tag)686 void computeSPAI(MatrixT const & A,
687 MatrixT & M,
688 spai_tag & tag)
689 {
690 typedef typename MatrixT::value_type NumericT;
691 typedef typename boost::numeric::ublas::vector<NumericT> VectorType;
692 typedef typename viennacl::linalg::detail::spai::sparse_vector<NumericT> SparseVectorType;
693 typedef typename boost::numeric::ublas::matrix<NumericT> DenseMatrixType;
694
695 //sparse matrix transpose...
696 unsigned int cur_iter = 0;
697 tag.setBegInd(0); tag.setEndInd(VIENNACL_SPAI_K_b);
698 bool go_on = true;
699 std::vector<SparseVectorType> A_v_c(M.size2());
700 std::vector<SparseVectorType> M_v(M.size2());
701 vectorize_column_matrix(A, A_v_c);
702 vectorize_column_matrix(M, M_v);
703
704
705 while (go_on)
706 {
707 go_on = (tag.getEndInd() < static_cast<long>(M.size2()));
708 cur_iter = 0;
709 unsigned int l_sz = static_cast<unsigned int>(tag.getEndInd() - tag.getBegInd());
710 //std::vector<bool> g_is_update(M.size2(), true);
711 std::vector<bool> g_is_update(l_sz, true);
712
713 //init is update
714 //init_parallel_is_update(g_is_update);
715 //std::vector<SparseVectorType> A_v_c(K);
716 //std::vector<SparseVectorType> M_v(K);
717 //vectorization of marices
718 //print_matrix(M_v);
719
720 std::vector<SparseVectorType> l_M_v(l_sz);
721 //custom_copy(M_v, l_M_v, beg_ind);
722 std::copy(M_v.begin() + tag.getBegInd(), M_v.begin() + tag.getEndInd(), l_M_v.begin());
723
724 //print_matrix(l_M_v);
725 //std::vector<SparseVectorType> l_A_v_c(K);
726 //custom_copy(A_v_c, l_A_v_c, beg_ind);
727 //std::copy(A_v_c.begin() + beg_ind, A_v_c.begin() + end_ind, l_A_v_c.begin());
728 //print_matrix(l_A_v_c);
729 //vectorize_row_matrix(A, A_v_r);
730 //working blocks
731
732 std::vector<DenseMatrixType> g_A_I_J(l_sz);
733 std::vector<VectorType> g_b_v(l_sz);
734 std::vector<SparseVectorType> g_res(l_sz);
735 std::vector<std::vector<unsigned int> > g_I(l_sz);
736 std::vector<std::vector<unsigned int> > g_J(l_sz);
737
738 while ((cur_iter < tag.getIterationLimit())&&is_all_update(g_is_update))
739 {
740 // SET UP THE BLOCKS..
741 // PHASE ONE
742 if (cur_iter == 0)
743 block_set_up(A, A_v_c, l_M_v, g_I, g_J, g_A_I_J, g_b_v);
744 else
745 block_update(A, A_v_c, g_res, g_is_update, g_I, g_J, g_b_v, g_A_I_J, tag);
746
747 //PHASE TWO, LEAST SQUARE SOLUTION
748 least_square_solve(A_v_c, g_A_I_J, g_b_v, g_I, g_J, g_res, g_is_update, l_M_v, tag);
749
750 if (tag.getIsStatic()) break;
751 cur_iter++;
752 }
753
754 std::copy(l_M_v.begin(), l_M_v.end(), M_v.begin() + tag.getBegInd());
755 tag.setBegInd(tag.getEndInd());//beg_ind = end_ind;
756 tag.setEndInd(std::min(static_cast<long>(tag.getBegInd() + VIENNACL_SPAI_K_b), static_cast<long>(M.size2())));
757 //std::copy(l_M_v.begin(), l_M_v.end(), M_v.begin() + tag.getBegInd());
758 }
759
760 M.resize(M.size1(), M.size2(), false);
761 insert_sparse_columns(M_v, M, tag.getIsRight());
762 }
763
764
765 //GPU - based version
766 /** @brief Construction of SPAI preconditioner on GPU
767 *
768 * @param A initial sparse matrix
769 * @param cpu_A copy of initial matrix on CPU
770 * @param cpu_M output preconditioner on CPU
771 * @param M output preconditioner
772 * @param tag SPAI tag class with parameters
773 */
774 template<typename NumericT, unsigned int AlignmentV>
computeSPAI(viennacl::compressed_matrix<NumericT,AlignmentV> const & A,boost::numeric::ublas::compressed_matrix<NumericT> const & cpu_A,boost::numeric::ublas::compressed_matrix<NumericT> & cpu_M,viennacl::compressed_matrix<NumericT,AlignmentV> & M,spai_tag const & tag)775 void computeSPAI(viennacl::compressed_matrix<NumericT, AlignmentV> const & A, //input
776 boost::numeric::ublas::compressed_matrix<NumericT> const & cpu_A,
777 boost::numeric::ublas::compressed_matrix<NumericT> & cpu_M, //output
778 viennacl::compressed_matrix<NumericT, AlignmentV> & M,
779 spai_tag const & tag)
780 {
781 typedef typename viennacl::linalg::detail::spai::sparse_vector<NumericT> SparseVectorType;
782
783 //typedef typename viennacl::compressed_matrix<ScalarType> GPUSparseMatrixType;
784 //sparse matrix transpose...
785 unsigned int cur_iter = 0;
786 std::vector<cl_uint> g_is_update(cpu_M.size2(), static_cast<cl_uint>(1));
787 //init is update
788 //init_parallel_is_update(g_is_update);
789 std::vector<SparseVectorType> A_v_c(cpu_M.size2());
790 std::vector<SparseVectorType> M_v(cpu_M.size2());
791 vectorize_column_matrix(cpu_A, A_v_c);
792 vectorize_column_matrix(cpu_M, M_v);
793 std::vector<SparseVectorType> g_res(cpu_M.size2());
794 std::vector<std::vector<unsigned int> > g_I(cpu_M.size2());
795 std::vector<std::vector<unsigned int> > g_J(cpu_M.size2());
796
797 //OpenCL variables
798 block_matrix g_A_I_J_vcl;
799 block_vector g_bv_vcl;
800 while ((cur_iter < tag.getIterationLimit())&&is_all_update(g_is_update))
801 {
802 // SET UP THE BLOCKS..
803 // PHASE ONE..
804 //timer.start();
805 //index set up on CPU
806 if (cur_iter == 0)
807 block_set_up(A, A_v_c, M_v, g_is_update, g_I, g_J, g_A_I_J_vcl, g_bv_vcl);
808 else
809 block_update(A, A_v_c, g_is_update, g_res, g_J, g_I, g_A_I_J_vcl, g_bv_vcl, tag);
810 //std::cout<<"Phase 2 timing: "<<timer.get()<<std::endl;
811 //PERFORM LEAST SQUARE problems solution
812 //PHASE TWO
813 //timer.start();
814 least_square_solve<SparseVectorType, NumericT>(A_v_c, M_v, g_I, g_J, g_A_I_J_vcl, g_bv_vcl, g_res, g_is_update, tag, viennacl::traits::context(A));
815 //std::cout<<"Phase 3 timing: "<<timer.get()<<std::endl;
816 if (tag.getIsStatic())
817 break;
818 cur_iter++;
819 }
820
821 cpu_M.resize(cpu_M.size1(), cpu_M.size2(), false);
822 insert_sparse_columns(M_v, cpu_M, tag.getIsRight());
823 //copy back to GPU
824 M.resize(static_cast<unsigned int>(cpu_M.size1()), static_cast<unsigned int>(cpu_M.size2()));
825 viennacl::copy(cpu_M, M);
826 }
827
828 }
829 }
830 }
831 }
832 #endif
833