1 #ifndef VIENNACL_LINALG_DETAIL_SPAI_SPAI_HPP
2 #define VIENNACL_LINALG_DETAIL_SPAI_SPAI_HPP
3 
4 /* =========================================================================
5    Copyright (c) 2010-2016, Institute for Microelectronics,
6                             Institute for Analysis and Scientific Computing,
7                             TU Wien.
8    Portions of this software are copyright by UChicago Argonne, LLC.
9 
10                             -----------------
11                   ViennaCL - The Vienna Computing Library
12                             -----------------
13 
14    Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
15 
16    (A list of authors and contributors can be found in the manual)
17 
18    License:         MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
21 /** @file viennacl/linalg/detail/spai/spai.hpp
22     @brief Main implementation of SPAI (not FSPAI). Experimental.
23 */
24 
25 #include <utility>
26 #include <iostream>
27 #include <fstream>
28 #include <string>
29 #include <algorithm>
30 #include <vector>
31 #include <math.h>
32 #include <map>
33 
34 //local includes
35 #include "viennacl/linalg/detail/spai/spai_tag.hpp"
36 #include "viennacl/linalg/qr.hpp"
37 #include "viennacl/linalg/detail/spai/spai-dynamic.hpp"
38 #include "viennacl/linalg/detail/spai/spai-static.hpp"
39 #include "viennacl/linalg/detail/spai/sparse_vector.hpp"
40 #include "viennacl/linalg/detail/spai/block_matrix.hpp"
41 #include "viennacl/linalg/detail/spai/block_vector.hpp"
42 
43 //boost includes
44 #include "boost/numeric/ublas/vector.hpp"
45 #include "boost/numeric/ublas/matrix.hpp"
46 #include "boost/numeric/ublas/matrix_proxy.hpp"
47 #include "boost/numeric/ublas/vector_proxy.hpp"
48 #include "boost/numeric/ublas/storage.hpp"
49 #include "boost/numeric/ublas/io.hpp"
50 #include "boost/numeric/ublas/lu.hpp"
51 #include "boost/numeric/ublas/triangular.hpp"
52 #include "boost/numeric/ublas/matrix_expression.hpp"
53 
54 // ViennaCL includes
55 #include "viennacl/linalg/prod.hpp"
56 #include "viennacl/matrix.hpp"
57 #include "viennacl/compressed_matrix.hpp"
58 #include "viennacl/linalg/sparse_matrix_operations.hpp"
59 #include "viennacl/linalg/matrix_operations.hpp"
60 #include "viennacl/scalar.hpp"
61 #include "viennacl/linalg/inner_prod.hpp"
62 #include "viennacl/linalg/ilu.hpp"
63 #include "viennacl/ocl/backend.hpp"
64 #include "viennacl/linalg/opencl/kernels/spai.hpp"
65 
66 
67 
68 #define VIENNACL_SPAI_K_b 20
69 
70 namespace viennacl
71 {
72 namespace linalg
73 {
74 namespace detail
75 {
76 namespace spai
77 {
78 
79 //debug function for print
80 template<typename SparseVectorT>
print_sparse_vector(SparseVectorT const & v)81 void print_sparse_vector(SparseVectorT const & v)
82 {
83   for (typename SparseVectorT::const_iterator vec_it = v.begin(); vec_it!= v.end(); ++vec_it)
84     std::cout << "[ " << vec_it->first << " ]:" << vec_it->second << std::endl;
85 }
86 
87 template<typename DenseMatrixT>
print_matrix(DenseMatrixT & m)88 void print_matrix(DenseMatrixT & m)
89 {
90   for (int i = 0; i < m.size2(); ++i)
91   {
92     for (int j = 0; j < m.size1(); ++j)
93       std::cout<<m(j, i)<<" ";
94     std::cout<<std::endl;
95   }
96 }
97 
98 /** @brief Add two sparse vectors res_v = b*v
99  *
100  * @param v      initial sparse vector
101  * @param b      scalar
102  * @param res_v  output vector
103  */
104 template<typename SparseVectorT, typename NumericT>
add_sparse_vectors(SparseVectorT const & v,NumericT b,SparseVectorT & res_v)105 void add_sparse_vectors(SparseVectorT const & v, NumericT b,  SparseVectorT & res_v)
106 {
107   for (typename SparseVectorT::const_iterator v_it = v.begin(); v_it != v.end(); ++v_it)
108     res_v[v_it->first] += b*v_it->second;
109 }
110 
111 //sparse-matrix - vector product
112 /** @brief Computation of residual res = A*v - e
113  *
114  * @param A_v_c   column major vectorized input sparse matrix
115  * @param v       sparse vector, in this case new column of preconditioner matrix
116  * @param ind     index for current column
117  * @param res     residual
118  */
119 template<typename SparseVectorT, typename NumericT>
compute_spai_residual(std::vector<SparseVectorT> const & A_v_c,SparseVectorT const & v,unsigned int ind,SparseVectorT & res)120 void compute_spai_residual(std::vector<SparseVectorT> const & A_v_c,
121                            SparseVectorT const & v,
122                            unsigned int ind,
123                            SparseVectorT & res)
124 {
125   for (typename SparseVectorT::const_iterator v_it = v.begin(); v_it != v.end(); ++v_it)
126     add_sparse_vectors(A_v_c[v_it->first], v_it->second, res);
127 
128   res[ind] -= NumericT(1);
129 }
130 
131 /** @brief Setting up index set of columns and rows for certain column
132  *
133  * @param A_v_c   column major vectorized initial sparse matrix
134  * @param v       current column of preconditioner matrix
135  * @param J       set of column indices
136  * @param I       set of row indices
137  */
138 template<typename SparseVectorT>
build_index_set(std::vector<SparseVectorT> const & A_v_c,SparseVectorT const & v,std::vector<unsigned int> & J,std::vector<unsigned int> & I)139 void build_index_set(std::vector<SparseVectorT> const & A_v_c,
140                      SparseVectorT const & v,
141                      std::vector<unsigned int> & J,
142                      std::vector<unsigned int> & I)
143 {
144   buildColumnIndexSet(v, J);
145   projectRows(A_v_c, J, I);
146 }
147 
148 /** @brief Initializes a dense matrix from a sparse one
149  *
150  * @param A_in    Oiginal sparse matrix
151  * @param J       Set of column indices
152  * @param I       Set of row indices
153  * @param A_out   dense matrix output
154  */
155 template<typename SparseMatrixT, typename DenseMatrixT>
initProjectSubMatrix(SparseMatrixT const & A_in,std::vector<unsigned int> const & J,std::vector<unsigned int> & I,DenseMatrixT & A_out)156 void initProjectSubMatrix(SparseMatrixT const & A_in,
157                           std::vector<unsigned int> const & J,
158                           std::vector<unsigned int> & I,
159                           DenseMatrixT & A_out)
160 {
161   A_out.resize(I.size(), J.size(), false);
162   for (vcl_size_t j = 0; j < J.size(); ++j)
163     for (vcl_size_t i = 0; i < I.size(); ++i)
164       A_out(i,j) = A_in(I[i],J[j]);
165 }
166 
167 
168 /************************************************** CPU BLOCK SET UP ***************************************/
169 
170 /** @brief Setting up blocks and QR factorizing them on CPU
171  *
172  * @param A        initial sparse matrix
173  * @param A_v_c    column major vectorized initial sparse matrix
174  * @param M_v      initialized preconditioner
175  * @param g_I      container of row indices
176  * @param g_J      container of column indices
177  * @param g_A_I_J  container of dense matrices -> R matrices after QR factorization
178  * @param g_b_v    container of vectors beta, necessary for Q recovery
179  */
180 template<typename SparseMatrixT, typename DenseMatrixT, typename SparseVectorT, typename VectorT>
block_set_up(SparseMatrixT const & A,std::vector<SparseVectorT> const & A_v_c,std::vector<SparseVectorT> const & M_v,std::vector<std::vector<unsigned int>> & g_I,std::vector<std::vector<unsigned int>> & g_J,std::vector<DenseMatrixT> & g_A_I_J,std::vector<VectorT> & g_b_v)181 void block_set_up(SparseMatrixT const & A,
182                   std::vector<SparseVectorT> const & A_v_c,
183                   std::vector<SparseVectorT> const & M_v,
184                   std::vector<std::vector<unsigned int> >& g_I,
185                   std::vector<std::vector<unsigned int> >& g_J,
186                   std::vector<DenseMatrixT>& g_A_I_J,
187                   std::vector<VectorT>& g_b_v)
188 {
189 #ifdef VIENNACL_WITH_OPENMP
190   #pragma omp parallel for
191 #endif
192   for (long i2 = 0; i2 < static_cast<long>(M_v.size()); ++i2)
193   {
194     vcl_size_t i = static_cast<vcl_size_t>(i2);
195     build_index_set(A_v_c, M_v[i], g_J[i], g_I[i]);
196     initProjectSubMatrix(A, g_J[i], g_I[i], g_A_I_J[i]);
197     //print_matrix(g_A_I_J[i]);
198     single_qr(g_A_I_J[i], g_b_v[i]);
199     //print_matrix(g_A_I_J[i]);
200   }
201 }
202 
203 /** @brief Setting up index set of columns and rows for all columns
204  *
205  * @param A_v_c   column major vectorized initial sparse matrix
206  * @param M_v     initialized preconditioner
207  * @param g_J     container of column indices
208  * @param g_I     container of row indices
209  */
210 template<typename SparseVectorT>
index_set_up(std::vector<SparseVectorT> const & A_v_c,std::vector<SparseVectorT> const & M_v,std::vector<std::vector<unsigned int>> & g_J,std::vector<std::vector<unsigned int>> & g_I)211 void index_set_up(std::vector<SparseVectorT> const & A_v_c,
212                   std::vector<SparseVectorT> const & M_v,
213                   std::vector<std::vector<unsigned int> > & g_J,
214                   std::vector<std::vector<unsigned int> > & g_I)
215 {
216 #ifdef VIENNACL_WITH_OPENMP
217   #pragma omp parallel for
218 #endif
219   for (long i2 = 0; i2 < static_cast<long>(M_v.size()); ++i2)
220   {
221     vcl_size_t i = static_cast<vcl_size_t>(i2);
222     build_index_set(A_v_c, M_v[i], g_J[i], g_I[i]);
223   }
224 }
225 
226 /************************************************** GPU BLOCK SET UP ***************************************/
227 
228 /** @brief Setting up blocks and QR factorizing them on GPU
229  *
230  * @param A            initial sparse matrix
231  * @param A_v_c        column major vectorized initial sparse matrix
232  * @param M_v          initialized preconditioner
233  * @param g_is_update  container that indicates which blocks are active
234  * @param g_I          container of row indices
235  * @param g_J          container of column indices
236  * @param g_A_I_J      container of dense matrices -> R matrices after QR factorization
237  * @param g_bv         container of vectors beta, necessary for Q recovery
238  */
239 template<typename NumericT, unsigned int AlignmentV, typename SparseVectorT>
block_set_up(viennacl::compressed_matrix<NumericT,AlignmentV> const & A,std::vector<SparseVectorT> const & A_v_c,std::vector<SparseVectorT> const & M_v,std::vector<cl_uint> g_is_update,std::vector<std::vector<unsigned int>> & g_I,std::vector<std::vector<unsigned int>> & g_J,block_matrix & g_A_I_J,block_vector & g_bv)240 void block_set_up(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
241                   std::vector<SparseVectorT> const & A_v_c,
242                   std::vector<SparseVectorT> const & M_v,
243                   std::vector<cl_uint> g_is_update,
244                   std::vector<std::vector<unsigned int> > & g_I,
245                   std::vector<std::vector<unsigned int> > & g_J,
246                   block_matrix & g_A_I_J,
247                   block_vector & g_bv)
248 {
249   viennacl::context ctx = viennacl::traits::context(A);
250   bool is_empty_block;
251 
252   //build index set
253   index_set_up(A_v_c, M_v, g_J, g_I);
254   block_assembly(A, g_J, g_I, g_A_I_J, g_is_update, is_empty_block);
255   block_qr<NumericT>(g_I, g_J, g_A_I_J, g_bv, g_is_update, ctx);
256 }
257 
258 
259 /***************************************************************************************************/
260 /******************************** SOLVING LS PROBLEMS ON GPU ***************************************/
261 /***************************************************************************************************/
262 
263 /** @brief Elicitation of sparse vector m for particular column from m_in - contigious vector for all columns
264  *
265  * @param m_in          contigious sparse vector for all columns
266  * @param start_m_ind   start index of particular vector
267  * @param J             column index set
268  * @param m             sparse vector for particular column
269  */
270 template<typename NumericT, typename SparseVectorT>
custom_fan_out(std::vector<NumericT> const & m_in,unsigned int start_m_ind,std::vector<unsigned int> const & J,SparseVectorT & m)271 void custom_fan_out(std::vector<NumericT> const & m_in,
272                     unsigned int start_m_ind,
273                     std::vector<unsigned int> const & J,
274                     SparseVectorT & m)
275 {
276   unsigned int  cnt = 0;
277   for (vcl_size_t i = 0; i < J.size(); ++i)
278     m[J[i]] = m_in[start_m_ind + cnt++];
279 }
280 
281 
282 
283 //GPU based least square problem
284 /** @brief Solution of Least square problem on GPU
285  *
286  * @param A_v_c        column-major vectorized initial sparse matrix
287  * @param M_v          column-major vectorized sparse preconditioner matrix
288  * @param g_I          container of row set indices
289  * @param g_J          container of column set indices
290  * @param g_A_I_J_vcl  contigious matrix that consists of blocks A(I_k, J_k)
291  * @param g_bv_vcl     contigious vector that consists of betas, necessary for Q recovery
292  * @param g_res        container of residuals
293  * @param g_is_update  container with indicators which blocks are active
294  * @param tag          spai tag
295  * @param ctx          Optional context in which the auxiliary data is created (one out of multiple OpenCL contexts, CUDA, host)
296  */
297 template<typename SparseVectorT, typename NumericT>
least_square_solve(std::vector<SparseVectorT> & A_v_c,std::vector<SparseVectorT> & M_v,std::vector<std::vector<unsigned int>> & g_I,std::vector<std::vector<unsigned int>> & g_J,block_matrix & g_A_I_J_vcl,block_vector & g_bv_vcl,std::vector<SparseVectorT> & g_res,std::vector<cl_uint> & g_is_update,const spai_tag & tag,viennacl::context ctx)298 void least_square_solve(std::vector<SparseVectorT> & A_v_c,
299                         std::vector<SparseVectorT> & M_v,
300                         std::vector<std::vector<unsigned int> >& g_I,
301                         std::vector<std::vector<unsigned int> > & g_J,
302                         block_matrix & g_A_I_J_vcl,
303                         block_vector & g_bv_vcl,
304                         std::vector<SparseVectorT> & g_res,
305                         std::vector<cl_uint> & g_is_update,
306                         const spai_tag & tag,
307                         viennacl::context ctx)
308 {
309   viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());
310   unsigned int y_sz, m_sz;
311   std::vector<cl_uint> y_inds(M_v.size() + 1, static_cast<cl_uint>(0));
312   std::vector<cl_uint> m_inds(M_v.size() + 1, static_cast<cl_uint>(0));
313 
314   get_size(g_I, y_sz);
315   init_start_inds(g_I, y_inds);
316   init_start_inds(g_J, m_inds);
317 
318   //create y_v
319   std::vector<NumericT> y_v(y_sz, NumericT(0));
320   for (vcl_size_t i = 0; i < M_v.size(); ++i)
321   {
322     for (vcl_size_t j = 0; j < g_I[i].size(); ++j)
323     {
324       if (g_I[i][j] == i)
325         y_v[y_inds[i] + j] = NumericT(1.0);
326     }
327   }
328   //compute m_v
329   get_size(g_J, m_sz);
330   std::vector<NumericT> m_v(m_sz, static_cast<cl_uint>(0));
331 
332   block_vector y_v_vcl;
333   block_vector m_v_vcl;
334   //prepearing memory for least square problem on GPU
335   y_v_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
336                                               static_cast<unsigned int>(sizeof(NumericT)*y_v.size()),
337                                               &(y_v[0]));
338   m_v_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
339                                               static_cast<unsigned int>(sizeof(NumericT)*m_v.size()),
340                                               &(m_v[0]));
341   y_v_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
342                                                static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
343                                                &(y_inds[0]));
344   viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
345                                                                            static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
346                                                                            &(g_is_update[0]));
347   viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx);
348   viennacl::ocl::kernel & ls_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_least_squares");
349   ls_kernel.local_work_size(0, 1);
350   ls_kernel.global_work_size(0, 256);
351   viennacl::ocl::enqueue(ls_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_bv_vcl.handle(), g_bv_vcl.handle1(), m_v_vcl.handle(),
352                                    y_v_vcl.handle(), y_v_vcl.handle1(),
353                                    g_A_I_J_vcl.handle1(), g_is_update_vcl,
354                                    //viennacl::ocl::local_mem(static_cast<unsigned int>(sizeof(ScalarType)*(local_r_n*local_c_n))),
355                                    static_cast<unsigned int>(M_v.size())));
356   //copy vector m_v back from GPU to CPU
357   cl_int vcl_err = clEnqueueReadBuffer(opencl_ctx.get_queue().handle().get(),
358                                        m_v_vcl.handle().get(), CL_TRUE, 0,
359                                        sizeof(NumericT)*(m_v.size()),
360                                        &(m_v[0]), 0, NULL, NULL);
361   VIENNACL_ERR_CHECK(vcl_err);
362 
363   //fan out vector in parallel
364   //#pragma omp parallel for
365   for (long i = 0; i < static_cast<long>(M_v.size()); ++i)
366   {
367     if (g_is_update[static_cast<vcl_size_t>(i)])
368     {
369       //faned out onto sparse vector
370       custom_fan_out(m_v, m_inds[static_cast<vcl_size_t>(i)], g_J[static_cast<vcl_size_t>(i)], M_v[static_cast<vcl_size_t>(i)]);
371       g_res[static_cast<vcl_size_t>(i)].clear();
372       compute_spai_residual<SparseVectorT, NumericT>(A_v_c,  M_v[static_cast<vcl_size_t>(i)], static_cast<unsigned int>(i), g_res[static_cast<vcl_size_t>(i)]);
373       NumericT res_norm = 0;
374       //compute norm of res - just to make sure that this implementatino works correct
375       sparse_norm_2(g_res[static_cast<vcl_size_t>(i)], res_norm);
376       //std::cout<<"Residual norm of column #: "<<i<<std::endl;
377       //std::cout<<res_norm<<std::endl;
378       //std::cout<<"************************"<<std::endl;
379       g_is_update[static_cast<vcl_size_t>(i)] = (res_norm > tag.getResidualNormThreshold())&& (!tag.getIsStatic())?(1):(0);
380     }
381   }
382 }
383 
384 //CPU based least square problems
385 /** @brief Solution of Least square problem on CPU
386  *
387  * @param A_v_c        column-major vectorized initial sparse matrix
388  * @param g_R          blocks for least square solution
389  * @param g_b_v        vectors beta, necessary for Q recovery
390  * @param g_I          container of row index set for all columns of matrix M
391  * @param g_J          container of column index set for all columns of matrix M
392  * @param g_res        container of residuals
393  * @param g_is_update  container with indicators which blocks are active
394  * @param M_v          column-major vectorized sparse matrix, final preconditioner
395  * @param tag          spai tag
396  */
397 template<typename SparseVectorT, typename DenseMatrixT, typename VectorT>
least_square_solve(std::vector<SparseVectorT> const & A_v_c,std::vector<DenseMatrixT> & g_R,std::vector<VectorT> & g_b_v,std::vector<std::vector<unsigned int>> & g_I,std::vector<std::vector<unsigned int>> & g_J,std::vector<SparseVectorT> & g_res,std::vector<bool> & g_is_update,std::vector<SparseVectorT> & M_v,spai_tag const & tag)398 void least_square_solve(std::vector<SparseVectorT> const & A_v_c,
399                         std::vector<DenseMatrixT> & g_R,
400                         std::vector<VectorT> & g_b_v,
401                         std::vector<std::vector<unsigned int> > & g_I,
402                         std::vector<std::vector<unsigned int> > & g_J,
403                         std::vector<SparseVectorT> & g_res,
404                         std::vector<bool> & g_is_update,
405                         std::vector<SparseVectorT> & M_v,
406                         spai_tag const & tag)
407 {
408   typedef typename DenseMatrixT::value_type       NumericType;
409 
410 #ifdef VIENNACL_WITH_OPENMP
411   #pragma omp parallel for
412 #endif
413   for (long i2 = 0; i2 < static_cast<long>(M_v.size()); ++i2)
414   {
415     vcl_size_t i = static_cast<vcl_size_t>(i2);
416     if (g_is_update[i])
417     {
418       VectorT y = boost::numeric::ublas::zero_vector<NumericType>(g_I[i].size());
419 
420       projectI<VectorT, NumericType>(g_I[i], y, static_cast<unsigned int>(tag.getBegInd() + long(i)));
421       apply_q_trans_vec(g_R[i], g_b_v[i], y);
422 
423       VectorT m_new =  boost::numeric::ublas::zero_vector<NumericType>(g_R[i].size2());
424       backwardSolve(g_R[i], y, m_new);
425       fanOutVector(m_new, g_J[i], M_v[i]);
426       g_res[i].clear();
427 
428       compute_spai_residual<SparseVectorT, NumericType>(A_v_c,  M_v[i], static_cast<unsigned int>(tag.getBegInd() + long(i)), g_res[i]);
429 
430       NumericType res_norm = 0;
431       sparse_norm_2(g_res[i], res_norm);
432 //                    std::cout<<"Residual norm of column #: "<<i<<std::endl;
433 //                    std::cout<<res_norm<<std::endl;
434 //                    std::cout<<"************************"<<std::endl;
435       g_is_update[i] = (res_norm > tag.getResidualNormThreshold())&& (!tag.getIsStatic());
436     }
437   }
438 }
439 
440 //************************************ UPDATE CHECK ***************************************************//
441 
442 template<typename VectorType>
is_all_update(VectorType & parallel_is_update)443 bool is_all_update(VectorType& parallel_is_update)
444 {
445   for (unsigned int i = 0; i < parallel_is_update.size(); ++i)
446   {
447     if (parallel_is_update[i])
448       return true;
449   }
450   return false;
451 }
452 
453 //********************************** MATRIX VECTORIZATION ***********************************************//
454 
455 //Matrix vectorization, column based approach
456 /** @brief Solution of Least square problem on CPU
457  *
458  * @param M_in   input sparse, boost::numeric::ublas::compressed_matrix
459  * @param M_v    array of sparse vectors
460  */
461 template<typename SparseMatrixT, typename SparseVectorT>
vectorize_column_matrix(SparseMatrixT const & M_in,std::vector<SparseVectorT> & M_v)462 void vectorize_column_matrix(SparseMatrixT const & M_in,
463                              std::vector<SparseVectorT> & M_v)
464 {
465   for (typename SparseMatrixT::const_iterator1 row_it = M_in.begin1(); row_it!= M_in.end1(); ++row_it)
466     for (typename SparseMatrixT::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
467         M_v[static_cast<unsigned int>(col_it.index2())][static_cast<unsigned int>(col_it.index1())] = *col_it;
468 }
469 
470 //Matrix vectorization row based approach
471 template<typename SparseMatrixT, typename SparseVectorT>
vectorize_row_matrix(SparseMatrixT const & M_in,std::vector<SparseVectorT> & M_v)472 void vectorize_row_matrix(SparseMatrixT const & M_in,
473                           std::vector<SparseVectorT> & M_v)
474 {
475   for (typename SparseMatrixT::const_iterator1 row_it = M_in.begin1(); row_it!= M_in.end1(); ++row_it)
476     for (typename SparseMatrixT::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
477       M_v[static_cast<unsigned int>(col_it.index1())][static_cast<unsigned int>(col_it.index2())] = *col_it;
478 }
479 
480 //************************************* BLOCK ASSEMBLY CODE *********************************************//
481 
482 
483 template<typename SizeT>
write_set_to_array(std::vector<std::vector<SizeT>> const & ind_set,std::vector<cl_uint> & a)484 void write_set_to_array(std::vector<std::vector<SizeT> > const & ind_set,
485                         std::vector<cl_uint> & a)
486 {
487   vcl_size_t cnt = 0;
488 
489   for (vcl_size_t i = 0; i < ind_set.size(); ++i)
490     for (vcl_size_t j = 0; j < ind_set[i].size(); ++j)
491       a[cnt++] = static_cast<cl_uint>(ind_set[i][j]);
492 }
493 
494 
495 
496 //assembling blocks on GPU
497 /** @brief Assembly of blocks on GPU by a gived set of row indices: g_I and column indices: g_J
498  *
499  * @param A               intial sparse matrix
500  * @param g_J             container of column index set
501  * @param g_I             container of row index set
502  * @param g_A_I_J_vcl     contigious blocks A(I, J) using GPU memory
503  * @param g_is_update     container with indicators which blocks are active
504  * @param is_empty_block  parameter that indicates if no block were assembled
505  */
506 template<typename NumericT, unsigned int AlignmentV>
block_assembly(viennacl::compressed_matrix<NumericT,AlignmentV> const & A,std::vector<std::vector<unsigned int>> const & g_J,std::vector<std::vector<unsigned int>> const & g_I,block_matrix & g_A_I_J_vcl,std::vector<cl_uint> & g_is_update,bool & is_empty_block)507 void block_assembly(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
508                     std::vector<std::vector<unsigned int> > const & g_J,
509                     std::vector<std::vector<unsigned int> > const & g_I,
510                     block_matrix & g_A_I_J_vcl,
511                     std::vector<cl_uint> & g_is_update,
512                     bool & is_empty_block)
513 {
514   //computing start indices for index sets and start indices for block matrices
515   unsigned int sz_I, sz_J, sz_blocks;
516   std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
517   std::vector<cl_uint> i_ind(g_I.size() + 1, static_cast<cl_uint>(0));
518   std::vector<cl_uint> j_ind(g_I.size() + 1, static_cast<cl_uint>(0));
519   std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
520   //
521   init_start_inds(g_J, j_ind);
522   init_start_inds(g_I, i_ind);
523   //
524   get_size(g_J, sz_J);
525   get_size(g_I, sz_I);
526   std::vector<cl_uint> I_set(sz_I, static_cast<cl_uint>(0));
527   //
528   std::vector<cl_uint> J_set(sz_J, static_cast<cl_uint>(0));
529 
530   // computing size for blocks
531   // writing set to arrays
532   write_set_to_array(g_I, I_set);
533   write_set_to_array(g_J, J_set);
534 
535   // if block for assembly does exist
536   if (I_set.size() > 0 && J_set.size() > 0)
537   {
538     viennacl::context ctx = viennacl::traits::context(A);
539     viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());
540     compute_blocks_size(g_I, g_J, sz_blocks, blocks_ind, matrix_dims);
541     std::vector<NumericT> con_A_I_J(sz_blocks, NumericT(0));
542 
543     block_vector set_I_vcl, set_J_vcl;
544     //init memory on GPU
545     //contigious g_A_I_J
546     g_A_I_J_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
547                                                     static_cast<unsigned int>(sizeof(NumericT)*(sz_blocks)),
548                                                     &(con_A_I_J[0]));
549     g_A_I_J_vcl.handle().context(opencl_ctx);
550 
551     //matrix_dimensions
552     g_A_I_J_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
553                                                      static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<cl_uint>(g_I.size())),
554                                                      &(matrix_dims[0]));
555     g_A_I_J_vcl.handle1().context(opencl_ctx);
556 
557     //start_block inds
558     g_A_I_J_vcl.handle2() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
559                                                      static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
560                                                      &(blocks_ind[0]));
561     g_A_I_J_vcl.handle2().context(opencl_ctx);
562 
563     //set_I
564     set_I_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
565                                                   static_cast<unsigned int>(sizeof(cl_uint)*sz_I),
566                                                   &(I_set[0]));
567     set_I_vcl.handle().context(opencl_ctx);
568 
569     //set_J
570     set_J_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
571                                                   static_cast<unsigned int>(sizeof(cl_uint)*sz_J),
572                                                   &(J_set[0]));
573     set_J_vcl.handle().context(opencl_ctx);
574 
575     //i_ind
576     set_I_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
577                                                    static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
578                                                    &(i_ind[0]));
579     set_I_vcl.handle().context(opencl_ctx);
580 
581     //j_ind
582     set_J_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
583                                                    static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
584                                                    &(j_ind[0]));
585     set_J_vcl.handle().context(opencl_ctx);
586 
587     viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
588                                                                              static_cast<unsigned int>(sizeof(cl_uint)*g_is_update.size()),
589                                                                              &(g_is_update[0]));
590 
591     viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx);
592     viennacl::ocl::kernel& assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "assemble_blocks");
593     assembly_kernel.local_work_size(0, 1);
594     assembly_kernel.global_work_size(0, 256);
595     viennacl::ocl::enqueue(assembly_kernel(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(),
596                                            set_I_vcl.handle(), set_J_vcl.handle(), set_I_vcl.handle1(),
597                                            set_J_vcl.handle1(),
598                                            g_A_I_J_vcl.handle2(), g_A_I_J_vcl.handle1(), g_A_I_J_vcl.handle(),
599                                            g_is_update_vcl,
600                                            static_cast<unsigned int>(g_I.size())));
601     is_empty_block = false;
602   }
603   else
604     is_empty_block = true;
605 }
606 
607 /************************************************************************************************************************/
608 
609 /** @brief Insertion of vectorized matrix column into original sparse matrix
610  *
611  * @param M_v       column-major vectorized matrix
612  * @param M         original sparse matrix
613  * @param is_right  indicates if matrix should be transposed in the output
614  */
615 template<typename SparseMatrixT, typename SparseVectorT>
insert_sparse_columns(std::vector<SparseVectorT> const & M_v,SparseMatrixT & M,bool is_right)616 void insert_sparse_columns(std::vector<SparseVectorT> const & M_v,
617                            SparseMatrixT& M,
618                            bool is_right)
619 {
620   if (is_right)
621   {
622     for (unsigned int i = 0; i < M_v.size(); ++i)
623       for (typename SparseVectorT::const_iterator vec_it = M_v[i].begin(); vec_it!=M_v[i].end(); ++vec_it)
624         M(vec_it->first, i) = vec_it->second;
625   }
626   else  //transposed fill of M
627   {
628     for (unsigned int i = 0; i < M_v.size(); ++i)
629       for (typename SparseVectorT::const_iterator vec_it = M_v[i].begin(); vec_it!=M_v[i].end(); ++vec_it)
630         M(i, vec_it->first) = vec_it->second;
631   }
632 }
633 
634 /** @brief Transposition of sparse matrix
635  *
636  * @param A_in      intial sparse matrix
637  * @param A output  transposed matrix
638  */
639 template<typename MatrixT>
sparse_transpose(MatrixT const & A_in,MatrixT & A)640 void sparse_transpose(MatrixT const & A_in, MatrixT & A)
641 {
642   typedef typename MatrixT::value_type         NumericType;
643 
644   std::vector<std::map<vcl_size_t, NumericType> >   temp_A(A_in.size2());
645   A.resize(A_in.size2(), A_in.size1(), false);
646 
647   for (typename MatrixT::const_iterator1 row_it = A_in.begin1();
648        row_it != A_in.end1();
649        ++row_it)
650   {
651     for (typename MatrixT::const_iterator2 col_it = row_it.begin();
652          col_it != row_it.end();
653          ++col_it)
654     {
655       temp_A[col_it.index2()][col_it.index1()] = *col_it;
656     }
657   }
658 
659   for (vcl_size_t i=0; i<temp_A.size(); ++i)
660   {
661     for (typename std::map<vcl_size_t, NumericType>::const_iterator it = temp_A[i].begin();
662          it != temp_A[i].end();
663          ++it)
664       A(i, it->first) = it->second;
665   }
666 }
667 
668 
669 
670 
671 //        template<typename SparseVectorType>
672 //        void custom_copy(std::vector<SparseVectorType> & M_v, std::vector<SparseVectorType> & l_M_v, const unsigned int beg_ind){
673 //            for (int i = 0; i < l_M_v.size(); ++i){
674 //                l_M_v[i] = M_v[i + beg_ind];
675 //            }
676 //        }
677 
678 //CPU version
679 /** @brief Construction of SPAI preconditioner on CPU
680  *
681  * @param A     initial sparse matrix
682  * @param M     output preconditioner
683  * @param tag   spai tag
684  */
685 template<typename MatrixT>
computeSPAI(MatrixT const & A,MatrixT & M,spai_tag & tag)686 void computeSPAI(MatrixT const & A,
687                  MatrixT & M,
688                  spai_tag & tag)
689 {
690   typedef typename MatrixT::value_type                                       NumericT;
691   typedef typename boost::numeric::ublas::vector<NumericT>                   VectorType;
692   typedef typename viennacl::linalg::detail::spai::sparse_vector<NumericT>   SparseVectorType;
693   typedef typename boost::numeric::ublas::matrix<NumericT>                   DenseMatrixType;
694 
695   //sparse matrix transpose...
696   unsigned int cur_iter = 0;
697   tag.setBegInd(0); tag.setEndInd(VIENNACL_SPAI_K_b);
698   bool go_on = true;
699   std::vector<SparseVectorType> A_v_c(M.size2());
700   std::vector<SparseVectorType> M_v(M.size2());
701   vectorize_column_matrix(A, A_v_c);
702   vectorize_column_matrix(M, M_v);
703 
704 
705   while (go_on)
706   {
707     go_on = (tag.getEndInd() < static_cast<long>(M.size2()));
708     cur_iter = 0;
709     unsigned int l_sz = static_cast<unsigned int>(tag.getEndInd() - tag.getBegInd());
710     //std::vector<bool> g_is_update(M.size2(), true);
711     std::vector<bool> g_is_update(l_sz, true);
712 
713     //init is update
714     //init_parallel_is_update(g_is_update);
715     //std::vector<SparseVectorType> A_v_c(K);
716     //std::vector<SparseVectorType> M_v(K);
717     //vectorization of marices
718     //print_matrix(M_v);
719 
720     std::vector<SparseVectorType> l_M_v(l_sz);
721     //custom_copy(M_v, l_M_v, beg_ind);
722     std::copy(M_v.begin() + tag.getBegInd(), M_v.begin() + tag.getEndInd(), l_M_v.begin());
723 
724     //print_matrix(l_M_v);
725     //std::vector<SparseVectorType> l_A_v_c(K);
726     //custom_copy(A_v_c, l_A_v_c, beg_ind);
727     //std::copy(A_v_c.begin() + beg_ind, A_v_c.begin() + end_ind, l_A_v_c.begin());
728     //print_matrix(l_A_v_c);
729     //vectorize_row_matrix(A, A_v_r);
730     //working blocks
731 
732     std::vector<DenseMatrixType> g_A_I_J(l_sz);
733     std::vector<VectorType> g_b_v(l_sz);
734     std::vector<SparseVectorType> g_res(l_sz);
735     std::vector<std::vector<unsigned int> > g_I(l_sz);
736     std::vector<std::vector<unsigned int> > g_J(l_sz);
737 
738     while ((cur_iter < tag.getIterationLimit())&&is_all_update(g_is_update))
739     {
740       // SET UP THE BLOCKS..
741       // PHASE ONE
742       if (cur_iter == 0)
743         block_set_up(A, A_v_c, l_M_v,  g_I, g_J, g_A_I_J, g_b_v);
744       else
745         block_update(A, A_v_c, g_res, g_is_update, g_I, g_J, g_b_v, g_A_I_J, tag);
746 
747       //PHASE TWO, LEAST SQUARE SOLUTION
748       least_square_solve(A_v_c, g_A_I_J, g_b_v, g_I, g_J, g_res, g_is_update, l_M_v, tag);
749 
750       if (tag.getIsStatic()) break;
751       cur_iter++;
752     }
753 
754     std::copy(l_M_v.begin(), l_M_v.end(), M_v.begin() + tag.getBegInd());
755     tag.setBegInd(tag.getEndInd());//beg_ind = end_ind;
756     tag.setEndInd(std::min(static_cast<long>(tag.getBegInd() + VIENNACL_SPAI_K_b), static_cast<long>(M.size2())));
757     //std::copy(l_M_v.begin(), l_M_v.end(), M_v.begin() + tag.getBegInd());
758   }
759 
760   M.resize(M.size1(), M.size2(), false);
761   insert_sparse_columns(M_v, M, tag.getIsRight());
762 }
763 
764 
765 //GPU - based version
766 /** @brief Construction of SPAI preconditioner on GPU
767  *
768  * @param A      initial sparse matrix
769  * @param cpu_A  copy of initial matrix on CPU
770  * @param cpu_M  output preconditioner on CPU
771  * @param M      output preconditioner
772  * @param tag    SPAI tag class with parameters
773  */
774 template<typename NumericT, unsigned int AlignmentV>
computeSPAI(viennacl::compressed_matrix<NumericT,AlignmentV> const & A,boost::numeric::ublas::compressed_matrix<NumericT> const & cpu_A,boost::numeric::ublas::compressed_matrix<NumericT> & cpu_M,viennacl::compressed_matrix<NumericT,AlignmentV> & M,spai_tag const & tag)775 void computeSPAI(viennacl::compressed_matrix<NumericT, AlignmentV> const & A, //input
776                  boost::numeric::ublas::compressed_matrix<NumericT> const & cpu_A,
777                  boost::numeric::ublas::compressed_matrix<NumericT> & cpu_M, //output
778                  viennacl::compressed_matrix<NumericT, AlignmentV> & M,
779                  spai_tag const & tag)
780 {
781   typedef typename viennacl::linalg::detail::spai::sparse_vector<NumericT>        SparseVectorType;
782 
783   //typedef typename viennacl::compressed_matrix<ScalarType> GPUSparseMatrixType;
784   //sparse matrix transpose...
785   unsigned int cur_iter = 0;
786   std::vector<cl_uint> g_is_update(cpu_M.size2(), static_cast<cl_uint>(1));
787   //init is update
788   //init_parallel_is_update(g_is_update);
789   std::vector<SparseVectorType> A_v_c(cpu_M.size2());
790   std::vector<SparseVectorType> M_v(cpu_M.size2());
791   vectorize_column_matrix(cpu_A, A_v_c);
792   vectorize_column_matrix(cpu_M, M_v);
793   std::vector<SparseVectorType> g_res(cpu_M.size2());
794   std::vector<std::vector<unsigned int> > g_I(cpu_M.size2());
795   std::vector<std::vector<unsigned int> > g_J(cpu_M.size2());
796 
797   //OpenCL variables
798   block_matrix g_A_I_J_vcl;
799   block_vector g_bv_vcl;
800   while ((cur_iter < tag.getIterationLimit())&&is_all_update(g_is_update))
801   {
802     // SET UP THE BLOCKS..
803     // PHASE ONE..
804     //timer.start();
805     //index set up on CPU
806     if (cur_iter == 0)
807       block_set_up(A, A_v_c, M_v, g_is_update, g_I, g_J, g_A_I_J_vcl, g_bv_vcl);
808     else
809       block_update(A, A_v_c, g_is_update, g_res, g_J, g_I, g_A_I_J_vcl, g_bv_vcl, tag);
810     //std::cout<<"Phase 2 timing: "<<timer.get()<<std::endl;
811     //PERFORM LEAST SQUARE problems solution
812     //PHASE TWO
813     //timer.start();
814     least_square_solve<SparseVectorType, NumericT>(A_v_c, M_v, g_I, g_J, g_A_I_J_vcl, g_bv_vcl, g_res, g_is_update, tag, viennacl::traits::context(A));
815     //std::cout<<"Phase 3 timing: "<<timer.get()<<std::endl;
816     if (tag.getIsStatic())
817       break;
818     cur_iter++;
819   }
820 
821   cpu_M.resize(cpu_M.size1(), cpu_M.size2(), false);
822   insert_sparse_columns(M_v, cpu_M, tag.getIsRight());
823   //copy back to GPU
824   M.resize(static_cast<unsigned int>(cpu_M.size1()), static_cast<unsigned int>(cpu_M.size2()));
825   viennacl::copy(cpu_M, M);
826 }
827 
828 }
829 }
830 }
831 }
832 #endif
833