#ifndef VIENNACL_LINALG_DETAIL_SPAI_SPAI_DYNAMIC_HPP
#define VIENNACL_LINALG_DETAIL_SPAI_SPAI_DYNAMIC_HPP

/* =========================================================================
   Copyright (c) 2010-2016, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

/** @file viennacl/linalg/detail/spai/spai-dynamic.hpp
    @brief Implementation of a dynamic SPAI. Provides the routines for automatic pattern updates. Experimental.

    SPAI code contributed by Nikolay Lukash
*/

#include <utility>
#include <iostream>
#include <fstream>
#include <string>
#include <algorithm>
#include <vector>
#include <cmath>   // for std::sqrt and std::fabs
#include <map>
#include "boost/numeric/ublas/vector.hpp"
#include "boost/numeric/ublas/matrix.hpp"
#include "boost/numeric/ublas/matrix_proxy.hpp"
#include "boost/numeric/ublas/vector_proxy.hpp"
#include "boost/numeric/ublas/storage.hpp"
#include "boost/numeric/ublas/io.hpp"
#include "boost/numeric/ublas/lu.hpp"
#include "boost/numeric/ublas/triangular.hpp"
#include "boost/numeric/ublas/matrix_expression.hpp"
// ViennaCL includes
#include "viennacl/linalg/prod.hpp"
#include "viennacl/matrix.hpp"
#include "viennacl/compressed_matrix.hpp"
#include "viennacl/linalg/sparse_matrix_operations.hpp"
#include "viennacl/linalg/matrix_operations.hpp"
#include "viennacl/scalar.hpp"
#include "viennacl/linalg/cg.hpp"
#include "viennacl/linalg/inner_prod.hpp"
#include "viennacl/linalg/ilu.hpp"
#include "viennacl/ocl/backend.hpp"

#include "viennacl/linalg/detail/spai/block_matrix.hpp"
#include "viennacl/linalg/detail/spai/block_vector.hpp"
#include "viennacl/linalg/detail/spai/qr.hpp"
#include "viennacl/linalg/detail/spai/spai-static.hpp"
#include "viennacl/linalg/detail/spai/spai.hpp"
#include "viennacl/linalg/detail/spai/spai_tag.hpp"
#include "viennacl/linalg/opencl/kernels/spai.hpp"

namespace viennacl
{
namespace linalg
{
namespace detail
{
namespace spai
{

/** @brief Helper functor for comparing std::pair<> based on the second member. */
struct CompareSecond
{
  template<typename T1, typename T2>
  bool operator()(std::pair<T1, T2> const & left, std::pair<T1, T2> const & right)
  {
    return static_cast<double>(left.second) > static_cast<double>(right.second);
  }
};
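
// A minimal usage sketch (hypothetical values): sorts candidate columns by
// descending score, as done in buildAugmentedIndexSet() below.
//
//   std::vector<std::pair<unsigned int, double> > p;
//   p.push_back(std::make_pair(3u, 0.1));
//   p.push_back(std::make_pair(7u, 0.9));
//   std::sort(p.begin(), p.end(), CompareSecond()); // p[0].first == 7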


/** @brief Composition of the new matrix R, which is used when solving the least-squares problem
 *
 * @param A      matrix Q'*A(I, \\tilde J), where \\tilde J is the set of new column indices
 * @param R_n    matrix A_I_u_J_u after QR factorization
 * @param R      previously composed matrix R
 */
template<typename MatrixT>
void composeNewR(MatrixT const & A,
                 MatrixT const & R_n,
                 MatrixT & R)
{
  typedef typename MatrixT::value_type        NumericType;

  vcl_size_t row_n = R_n.size1() - (A.size1() - R.size2());
  MatrixT C = boost::numeric::ublas::zero_matrix<NumericType>(R.size1() + row_n, R.size2() + A.size2());

  // write the original R to the new composite R
  boost::numeric::ublas::project(C, boost::numeric::ublas::range(0,R.size1()), boost::numeric::ublas::range(0, R.size2())) += R;
  // write the upper part of Q'*A_I_\hatJ: all columns, and as many rows as R has columns
  boost::numeric::ublas::project(C, boost::numeric::ublas::range(0, R.size2()), boost::numeric::ublas::range(R.size2(),
                                                                                                            R.size2() + A.size2())) +=
  boost::numeric::ublas::project(A, boost::numeric::ublas::range(0, R.size2()), boost::numeric::ublas::range(0, A.size2()));

  // add the QR-decomposed block to the composite R
  if (R_n.size1() > 0 && R_n.size2() > 0)
      boost::numeric::ublas::project(C,
                                     boost::numeric::ublas::range(R.size2(), R.size1() + row_n),
                                     boost::numeric::ublas::range(R.size2(), R.size2() + A.size2())) += R_n;
  R = C;
}
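
// The composite matrix built above has the block structure (a sketch, following
// the ranges used in the projections; R stems from the previous factorization,
// R_n from the new one):
//
//   C = [ R   upper part of Q^T A(I, \tilde J) ]
//       [ 0               R_n                  ]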

/** @brief Composition of a new vector of coefficients beta from QR factorizations (necessary for Q recovery)
 *
 * @param v_n     new vector from the last QR factorization
 * @param v       composition of the previous vectors from QR factorizations
 */
template<typename VectorT>
void composeNewVector(VectorT const & v_n,
                      VectorT       & v)
{
  typedef typename VectorT::value_type          NumericType;

  VectorT w  = boost::numeric::ublas::zero_vector<NumericType>(v.size() + v_n.size());
  boost::numeric::ublas::project(w, boost::numeric::ublas::range(0, v.size())) += v;
  boost::numeric::ublas::project(w, boost::numeric::ublas::range(v.size(), v.size() + v_n.size())) += v_n;
  v = w;
}
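
// Sketch (assuming boost::numeric::ublas vectors, as used by the callers in
// this file): the call concatenates v and v_n into v.
//
//   boost::numeric::ublas::vector<double> v(2), v_n(1);
//   v(0) = 1.0; v(1) = 2.0; v_n(0) = 3.0;
//   composeNewVector(v_n, v); // v is now (1, 2, 3)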

/** @brief Computation of the Euclidean norm of a sparse vector
 *
 * @param v      sparse input vector
 * @param norm   scalar holding the Euclidean norm; must be initialized to zero by the caller
 */
template<typename SparseVectorT, typename NumericT>
void sparse_norm_2(SparseVectorT const & v,
                   NumericT & norm)
{
  for (typename SparseVectorT::const_iterator vec_it  = v.begin(); vec_it != v.end(); ++vec_it)
    norm += (vec_it->second)*(vec_it->second);

  norm = std::sqrt(norm);
}
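
// Sketch, assuming the sparse vector is a std::map<unsigned int, double> of
// index/value pairs (any type with the same iterator interface works). Note
// that the caller is responsible for zero-initializing 'norm', cf. the usage
// in buildAugmentedIndexSet() below.
//
//   std::map<unsigned int, double> v;
//   v[0] = 3.0; v[4] = 4.0;
//   double norm = 0;
//   sparse_norm_2(v, norm); // norm == 5.0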

/** @brief Dot product of two sparse vectors
 *
 * @param v1     first sparse input vector
 * @param v2     second sparse input vector
 * @param res_v  scalar holding the dot product result; must be initialized to zero by the caller
 */
template<typename SparseVectorT, typename NumericT>
void sparse_inner_prod(SparseVectorT const & v1,
                       SparseVectorT const & v2,
                       NumericT & res_v)
{
  typename SparseVectorT::const_iterator v_it1 = v1.begin();
  typename SparseVectorT::const_iterator v_it2 = v2.begin();

  while ((v_it1 != v1.end())&&(v_it2 != v2.end()))
  {
    if (v_it1->first == v_it2->first)
    {
      res_v += (v_it1->second)*(v_it2->second);
      ++v_it1;
      ++v_it2;
    }
    else if (v_it1->first < v_it2->first)
      ++v_it1;
    else
      ++v_it2;
  }
}
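
// Sketch (same std::map-based assumption): only indices present in both
// vectors contribute, and 'res_v' must again be zero-initialized by the caller.
//
//   std::map<unsigned int, double> v1, v2;
//   v1[0] = 2.0; v1[3] = 1.0;
//   v2[3] = 5.0; v2[7] = 1.0;
//   double res = 0;
//   sparse_inner_prod(v1, v2, res); // res == 5.0 (only index 3 overlaps)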

/** @brief Building a new set of column indices J_u, cf. Kallischko dissertation p.31
 *
 * @param A_v_c  vectorized column-wise initial matrix
 * @param res    residual vector
 * @param J      set of column indices
 * @param J_u    set of new column indices
 * @param tag    SPAI tag with parameters
 */
template<typename SparseVectorT, typename NumericT>
bool buildAugmentedIndexSet(std::vector<SparseVectorT> const & A_v_c,
                            SparseVectorT const & res,
                            std::vector<unsigned int> & J,
                            std::vector<unsigned int> & J_u,
                            spai_tag const & tag)
{
  std::vector<std::pair<unsigned int, NumericT> > p;
  vcl_size_t cur_size = 0;
  NumericT inprod, norm2;

  for (typename SparseVectorT::const_iterator res_it = res.begin(); res_it != res.end(); ++res_it)
  {
    if (!isInIndexSet(J, res_it->first) && (std::fabs(res_it->second) > tag.getResidualThreshold()))
    {
      inprod = norm2 = 0;
      sparse_inner_prod(res, A_v_c[res_it->first], inprod);
      sparse_norm_2(A_v_c[res_it->first], norm2);
      p.push_back(std::pair<unsigned int, NumericT>(res_it->first, (inprod*inprod)/(norm2*norm2)));
    }
  }

  std::sort(p.begin(), p.end(), CompareSecond());
  while ((cur_size < J.size()) && (p.size() > 0))
  {
    J_u.push_back(p[0].first);
    p.erase(p.begin());
    cur_size++;
  }
  p.clear();
  return (cur_size > 0);
}
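
// The score (inprod*inprod)/(norm2*norm2) computed above is the squared
// residual reduction rho_j^2 = (r^T A e_j)^2 / ||A e_j||_2^2 obtained by
// admitting column j to the pattern, cf. the Kallischko dissertation p.31;
// at most |J| of the best-scoring candidates are appended per update sweep.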

/** @brief Building new row indices to append to the current set I_n, cf. Kallischko dissertation p.32
 *
 * @param A_v_c    vectorized column-wise initial matrix
 * @param I        set of previously determined row indices
 * @param J_n      set of new column indices
 * @param I_n      set of new row indices
 */
template<typename SparseVectorT>
void buildNewRowSet(std::vector<SparseVectorT> const & A_v_c,
                    std::vector<unsigned int>  const & I,
                    std::vector<unsigned int>  const & J_n,
                    std::vector<unsigned int>        & I_n)
{
  for (vcl_size_t i = 0; i < J_n.size(); ++i)
  {
    for (typename SparseVectorT::const_iterator col_it = A_v_c[J_n[i]].begin(); col_it!=A_v_c[J_n[i]].end(); ++col_it)
    {
      if (!isInIndexSet(I, col_it->first) && !isInIndexSet(I_n, col_it->first))
        I_n.push_back(col_it->first);
    }
  }
}
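
// Example (hypothetical pattern): if column j = J_n[0] has nonzeros in rows
// {1, 4, 6} and I = {1, 2}, then rows 4 and 6 are appended to I_n.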

/** @brief Composition of a new block for QR factorization, cf. Kallischko dissertation p.82, figure 4.7
 *
 * @param A_I_J       previously composed block
 * @param A_I_J_u     matrix Q'*A(I, \\tilde J), where \\tilde J is the set of new column indices
 * @param A_I_u_J_u   composition of the lower part of A(I, \\tilde J) and A(\\tilde I, \\tilde J) - the new block for QR decomposition
 */
template<typename MatrixT>
void QRBlockComposition(MatrixT const & A_I_J,
                        MatrixT const & A_I_J_u,
                        MatrixT       & A_I_u_J_u)
{
  typedef typename MatrixT::value_type     NumericType;

  vcl_size_t row_n1 = A_I_J_u.size1() - A_I_J.size2();
  vcl_size_t row_n2 = A_I_u_J_u.size1();
  vcl_size_t row_n = row_n1 + row_n2;
  vcl_size_t col_n = A_I_J_u.size2();

  MatrixT C = boost::numeric::ublas::zero_matrix<NumericType>(row_n, col_n);
  boost::numeric::ublas::project(C,
                                 boost::numeric::ublas::range(0, row_n1),
                                 boost::numeric::ublas::range(0, col_n))
  += boost::numeric::ublas::project(A_I_J_u,
                                    boost::numeric::ublas::range(A_I_J.size2(), A_I_J_u.size1()),
                                    boost::numeric::ublas::range(0, col_n));

  boost::numeric::ublas::project(C,
                                 boost::numeric::ublas::range(row_n1, row_n1 + row_n2),
                                 boost::numeric::ublas::range(0, col_n)) += A_I_u_J_u;
  A_I_u_J_u = C;
}
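
// The assembled block stacks the not-yet-triangularized rows of
// Q^T A(I, \tilde J) on top of the fresh rows A(\tilde I, \tilde J):
//
//   C = [ lower part of Q^T A(I, \tilde J) ]
//       [       A(\tilde I, \tilde J)      ]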

/** @brief CPU-based dynamic update for the SPAI preconditioner
 *
 * @param A            initial sparse matrix
 * @param A_v_c        vectorized column-wise initial matrix
 * @param g_res        container of residuals for all columns
 * @param g_is_update  container of indicators showing which blocks should be modified
 * @param g_I          container of row index sets for all columns
 * @param g_J          container of column index sets for all columns
 * @param g_b_v        container of beta vectors for Q recovery (cf. Golub, Van Loan: "Matrix Computations", 3rd edition, p.211)
 * @param g_A_I_J      container of block matrices from the previous update
 * @param tag          SPAI configuration tag
 */
template<typename SparseMatrixT,
         typename SparseVectorT,
         typename DenseMatrixT,
         typename VectorT>
void block_update(SparseMatrixT const & A,
                  std::vector<SparseVectorT> const & A_v_c,
                  std::vector<SparseVectorT>       & g_res,
                  std::vector<bool> & g_is_update,
                  std::vector<std::vector<unsigned int> >& g_I,
                  std::vector<std::vector<unsigned int> >& g_J,
                  std::vector<VectorT>      & g_b_v,
                  std::vector<DenseMatrixT> & g_A_I_J,
                  spai_tag const & tag)
{
  typedef typename DenseMatrixT::value_type     NumericType;

  std::vector<std::vector<unsigned int> > g_J_u(g_J.size());   // set of new column indices
  std::vector<std::vector<unsigned int> > g_I_u(g_J.size());   // set of new row indices
  std::vector<DenseMatrixT> g_A_I_J_u(g_J.size());             // matrix A(I, \tilde J), cf. Kallischko p.31-32
  std::vector<DenseMatrixT> g_A_I_u_J_u(g_J.size());           // matrix A(\tilde I, \tilde J), cf. Kallischko
  std::vector<VectorT>      g_b_v_u(g_J.size());               // new vector of beta coefficients from the QR factorization

#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for
#endif
  for (long i = 0; i < static_cast<long>(g_J.size()); ++i)
  {
    if (g_is_update[static_cast<vcl_size_t>(i)])
    {
      if (buildAugmentedIndexSet<SparseVectorT, NumericType>(A_v_c, g_res[static_cast<vcl_size_t>(i)], g_J[static_cast<vcl_size_t>(i)], g_J_u[static_cast<vcl_size_t>(i)], tag))
      {
        // initialize matrix A_I_\hatJ
        initProjectSubMatrix(A, g_J_u[static_cast<vcl_size_t>(i)], g_I[static_cast<vcl_size_t>(i)], g_A_I_J_u[static_cast<vcl_size_t>(i)]);
        // multiplication Q'*A_I_\hatJ
        apply_q_trans_mat(g_A_I_J[static_cast<vcl_size_t>(i)], g_b_v[static_cast<vcl_size_t>(i)], g_A_I_J_u[static_cast<vcl_size_t>(i)]);
        // build the new row index set \hatI
        buildNewRowSet(A_v_c, g_I[static_cast<vcl_size_t>(i)], g_J_u[static_cast<vcl_size_t>(i)], g_I_u[static_cast<vcl_size_t>(i)]);
        initProjectSubMatrix(A, g_J_u[static_cast<vcl_size_t>(i)], g_I_u[static_cast<vcl_size_t>(i)], g_A_I_u_J_u[static_cast<vcl_size_t>(i)]);
        // composition of the block for the new QR factorization
        QRBlockComposition(g_A_I_J[static_cast<vcl_size_t>(i)], g_A_I_J_u[static_cast<vcl_size_t>(i)], g_A_I_u_J_u[static_cast<vcl_size_t>(i)]);
        // QR factorization
        single_qr(g_A_I_u_J_u[static_cast<vcl_size_t>(i)], g_b_v_u[static_cast<vcl_size_t>(i)]);
        // composition of the new R and the new vector b_v
        composeNewR(g_A_I_J_u[static_cast<vcl_size_t>(i)], g_A_I_u_J_u[static_cast<vcl_size_t>(i)], g_A_I_J[static_cast<vcl_size_t>(i)]);
        composeNewVector(g_b_v_u[static_cast<vcl_size_t>(i)], g_b_v[static_cast<vcl_size_t>(i)]);
        // composition of the new sets I and J
        g_J[static_cast<vcl_size_t>(i)].insert(g_J[static_cast<vcl_size_t>(i)].end(), g_J_u[static_cast<vcl_size_t>(i)].begin(), g_J_u[static_cast<vcl_size_t>(i)].end());
        g_I[static_cast<vcl_size_t>(i)].insert(g_I[static_cast<vcl_size_t>(i)].end(), g_I_u[static_cast<vcl_size_t>(i)].begin(), g_I_u[static_cast<vcl_size_t>(i)].end());
      }
      else
      {
        g_is_update[static_cast<vcl_size_t>(i)] = false;
      }
    }
  }
}
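
// Summary of one CPU update sweep per block i (the steps executed above):
//   1. buildAugmentedIndexSet:                   new column indices \tilde J
//   2. initProjectSubMatrix + apply_q_trans_mat: Q^T A(I, \tilde J)
//   3. buildNewRowSet:                           new row indices \tilde I
//   4. QRBlockComposition + single_qr:           QR factorization of the new block
//   5. composeNewR / composeNewVector:           updated R and beta coefficients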


/**************************************************** GPU SPAI Update ****************************************************************/


/** @brief Performs the multiplication Q'*A(I, \\tilde J) on the GPU
 *
 * @param g_J_u          container of sets of new column indices
 * @param g_I            container of row indices
 * @param g_A_I_J_vcl    block matrix composed from previous blocks (blocks of R)
 * @param g_bv_vcl       block of beta vectors
 * @param g_A_I_J_u_vcl  block of matrices A(I, \\tilde J)
 * @param g_is_update    indicators showing whether a certain block should be processed
 * @param ctx            Optional context in which the auxiliary data is created (one out of multiple OpenCL contexts, CUDA, host)
 */
template<typename NumericT>
void block_q_multiplication(std::vector<std::vector<unsigned int> > const & g_J_u,
                            std::vector<std::vector<unsigned int> > const & g_I,
                            block_matrix & g_A_I_J_vcl,
                            block_vector & g_bv_vcl,
                            block_matrix & g_A_I_J_u_vcl,
                            std::vector<cl_uint> & g_is_update,
                            viennacl::context ctx)
{
  viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());
  unsigned int local_r_n = 0;
  unsigned int local_c_n = 0;
  unsigned int sz_blocks = 0;

  get_max_block_size(g_I,   local_r_n);
  get_max_block_size(g_J_u, local_c_n);

  // per-block dimensions and start indices
  std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
  std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
  compute_blocks_size(g_I, g_J_u, sz_blocks, blocks_ind, matrix_dims);

  viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                                           static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
                                                                           &(g_is_update[0]));
  viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx);
  viennacl::ocl::kernel& block_q_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_q_mult");

  block_q_kernel.local_work_size(0,      local_c_n);
  block_q_kernel.global_work_size(0, 128*local_c_n);
  viennacl::ocl::enqueue(block_q_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(),
                                        g_bv_vcl.handle(),
                                        g_bv_vcl.handle1(), g_A_I_J_vcl.handle1(), g_A_I_J_u_vcl.handle1(), g_is_update_vcl,
                                        viennacl::ocl::local_mem(static_cast<unsigned int>(sizeof(NumericT)*(local_r_n*local_c_n))),
                                        static_cast<cl_uint>(g_I.size())));
}

/** @brief Assembly of the container of row index sets I_q: row indices for the new "QR blocks"
 *
 * @param g_I    container of row indices
 * @param g_J    container of column indices
 * @param g_I_u  container of new row indices
 * @param g_I_q  container of row indices for the new QR blocks
 */
template<typename SizeT>
void assemble_qr_row_inds(std::vector<std::vector<SizeT> > const & g_I,
                          std::vector<std::vector<SizeT> > const & g_J,
                          std::vector<std::vector<SizeT> > const & g_I_u,
                          std::vector<std::vector<SizeT> >       & g_I_q)
{
#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for
#endif
  for (long i = 0; i < static_cast<long>(g_I.size()); ++i)
  {
    for (vcl_size_t j = g_J[static_cast<vcl_size_t>(i)].size(); j < g_I[static_cast<vcl_size_t>(i)].size(); ++j)
      g_I_q[static_cast<vcl_size_t>(i)].push_back(g_I[static_cast<vcl_size_t>(i)][j]);

    for (vcl_size_t j = 0; j < g_I_u[static_cast<vcl_size_t>(i)].size(); ++j)
      g_I_q[static_cast<vcl_size_t>(i)].push_back(g_I_u[static_cast<vcl_size_t>(i)][j]);
  }
}
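
// Example (hypothetical index sets): with I = {0, 1, 2}, J = {0, 1} and
// I_u = {5}, the assembled QR row set is I_q = {2, 5} - the rows of I past
// the first |J| (already triangularized) ones, followed by the new rows.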

/** @brief Performs the assembly of a new QR block
 *
 * @param g_J                container of column indices
 * @param g_I                container of row indices
 * @param g_J_u              container of new column indices
 * @param g_I_u              container of new row indices
 * @param g_I_q              container of row indices for the new QR blocks
 * @param g_A_I_J_u_vcl      blocks of Q'*A(I, \\tilde J)
 * @param matrix_dimensions  array with the matrix dimensions for all blocks
 * @param g_A_I_u_J_u_vcl    blocks A(\\tilde I, \\tilde J)
 * @param g_is_update        container with update indicators
 * @param is_empty_block     indicates whether all previous blocks A(\\tilde I, \\tilde J) are empty; if they are, a kernel with fewer arguments is used
 * @param ctx                Optional context in which the matrix is created (one out of multiple OpenCL contexts, CUDA, host)
*/
template<typename NumericT>
void assemble_qr_block(std::vector<std::vector<unsigned int> > const & g_J,
                       std::vector<std::vector<unsigned int> > const& g_I,
                       std::vector<std::vector<unsigned int> > const& g_J_u,
                       std::vector<std::vector<unsigned int> > const& g_I_u,
                       std::vector<std::vector<unsigned int> >& g_I_q,
                       block_matrix & g_A_I_J_u_vcl,
                       viennacl::ocl::handle<cl_mem> & matrix_dimensions,
                       block_matrix & g_A_I_u_J_u_vcl,
                       std::vector<cl_uint> & g_is_update,
                       bool is_empty_block,
                       viennacl::context ctx)
{
  viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());

  assemble_qr_row_inds(g_I, g_J, g_I_u, g_I_q);
  unsigned int sz_blocks;
  std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
  std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));

  compute_blocks_size(g_I_q, g_J_u, sz_blocks, blocks_ind, matrix_dims);

  std::vector<NumericT> con_A_I_J_q(sz_blocks, static_cast<NumericT>(0));

  block_matrix g_A_I_J_q_vcl;
  // allocate memory for the QR block
  g_A_I_J_q_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                    static_cast<unsigned int>(sizeof(NumericT)*sz_blocks),
                                                    &(con_A_I_J_q[0]));
  g_A_I_J_q_vcl.handle().context(opencl_ctx);

  g_A_I_J_q_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                     static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<unsigned int>(g_I.size())),
                                                     &(matrix_dims[0]));
  g_A_I_J_q_vcl.handle1().context(opencl_ctx);

  g_A_I_J_q_vcl.handle2() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                      static_cast<unsigned int>(sizeof(cl_uint)*static_cast<unsigned int>(g_I.size() + 1)),
                                                      &(blocks_ind[0]));
  g_A_I_J_q_vcl.handle2().context(opencl_ctx);

  viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                                           static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
                                                                           &(g_is_update[0]));

  viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx);
  if (!is_empty_block)
  {
    viennacl::ocl::kernel& qr_assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_qr_assembly");
    qr_assembly_kernel.local_work_size(0, 1);
    qr_assembly_kernel.global_work_size(0, 256);
    viennacl::ocl::enqueue(qr_assembly_kernel(matrix_dimensions,
                                              g_A_I_J_u_vcl.handle(),
                                              g_A_I_J_u_vcl.handle2(),
                                              g_A_I_J_u_vcl.handle1(),
                                              g_A_I_u_J_u_vcl.handle(),
                                              g_A_I_u_J_u_vcl.handle2(),
                                              g_A_I_u_J_u_vcl.handle1(),
                                              g_A_I_J_q_vcl.handle(),
                                              g_A_I_J_q_vcl.handle2(),
                                              g_A_I_J_q_vcl.handle1(),
                                              g_is_update_vcl,
                                              static_cast<unsigned int>(g_I.size())));
  }
  else
  {
    viennacl::ocl::kernel& qr_assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_qr_assembly_1");
    qr_assembly_kernel.local_work_size(0, 1);
    qr_assembly_kernel.global_work_size(0, 256);
    viennacl::ocl::enqueue(qr_assembly_kernel(matrix_dimensions, g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(),
                                              g_A_I_J_u_vcl.handle1(),
                                              g_A_I_J_q_vcl.handle(),
                                              g_A_I_J_q_vcl.handle2(), g_A_I_J_q_vcl.handle1(),
                                              g_is_update_vcl,
                                              static_cast<unsigned int>(g_I.size())));
  }
  // redirect the handles so that subsequent operations use the assembled QR blocks
  g_A_I_u_J_u_vcl.handle() = g_A_I_J_q_vcl.handle();
  g_A_I_u_J_u_vcl.handle1() = g_A_I_J_q_vcl.handle1();
  g_A_I_u_J_u_vcl.handle2() = g_A_I_J_q_vcl.handle2();
}

/** @brief Performs the assembly of the new R matrix on the GPU
 *
 * @param g_I              container of row indices
 * @param g_J              container of column indices
 * @param g_A_I_J_vcl      container of block matrices from the previous update
 * @param g_A_I_J_u_vcl    container of block matrices Q'*A(I, \\tilde J)
 * @param g_A_I_u_J_u_vcl  container of block matrices QR-factored in the current iteration
 * @param g_bv_vcl         block of beta vectors from the previous iteration
 * @param g_bv_vcl_u       block of updated beta vectors obtained after the most recent QR factorization
 * @param g_is_update      container of indicators showing which blocks should be modified
 * @param ctx              Optional context in which the auxiliary data is created (one out of multiple OpenCL contexts, CUDA, host)
 */
template<typename NumericT>
void assemble_r(std::vector<std::vector<unsigned int> > & g_I,
                std::vector<std::vector<unsigned int> > & g_J,
                block_matrix & g_A_I_J_vcl,
                block_matrix & g_A_I_J_u_vcl,
                block_matrix & g_A_I_u_J_u_vcl,
                block_vector & g_bv_vcl,
                block_vector & g_bv_vcl_u,
                std::vector<cl_uint> & g_is_update,
                viennacl::context ctx)
{
  viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());
  std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
  std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
  std::vector<cl_uint> start_bv_r_inds(g_I.size() + 1, 0);
  unsigned int sz_blocks, bv_size;

  compute_blocks_size(g_I, g_J, sz_blocks, blocks_ind, matrix_dims);
  get_size(g_J, bv_size);
  init_start_inds(g_J, start_bv_r_inds);

  std::vector<NumericT> con_A_I_J_r(sz_blocks, static_cast<NumericT>(0));
  std::vector<NumericT> b_v_r(bv_size, static_cast<NumericT>(0));

  block_matrix g_A_I_J_r_vcl;
  block_vector g_bv_r_vcl;
  g_A_I_J_r_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                    static_cast<unsigned int>(sizeof(NumericT)*sz_blocks),
                                                    &(con_A_I_J_r[0]));
  g_A_I_J_r_vcl.handle().context(opencl_ctx);

  g_A_I_J_r_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                     static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<unsigned int>(g_I.size())),
                                                     &(matrix_dims[0]));
  g_A_I_J_r_vcl.handle1().context(opencl_ctx);

  g_A_I_J_r_vcl.handle2() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                     static_cast<unsigned int>(sizeof(cl_uint)*static_cast<unsigned int>(g_I.size() + 1)),
                                                     &(blocks_ind[0]));
  g_A_I_J_r_vcl.handle2().context(opencl_ctx);

  g_bv_r_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                 static_cast<unsigned int>(sizeof(NumericT)*bv_size),
                                                 &(b_v_r[0]));
  g_bv_r_vcl.handle().context(opencl_ctx);

  g_bv_r_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                  static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
                                                  &(start_bv_r_inds[0]));
  g_bv_r_vcl.handle1().context(opencl_ctx);   // handle1() here: the previous line created the start-index buffer

  viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                                           static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
                                                                           &(g_is_update[0]));
  viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx);
  viennacl::ocl::kernel& r_assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_r_assembly");
  r_assembly_kernel.local_work_size(0, 1);
  r_assembly_kernel.global_work_size(0, 256);

  viennacl::ocl::enqueue(r_assembly_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_A_I_J_vcl.handle1(),
                                          g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(), g_A_I_J_u_vcl.handle1(),
                                          g_A_I_u_J_u_vcl.handle(), g_A_I_u_J_u_vcl.handle2(), g_A_I_u_J_u_vcl.handle1(),
                                          g_A_I_J_r_vcl.handle(), g_A_I_J_r_vcl.handle2(), g_A_I_J_r_vcl.handle1(),
                                          g_is_update_vcl, static_cast<cl_uint>(g_I.size())));

  viennacl::ocl::kernel & bv_assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_bv_assembly");
  bv_assembly_kernel.local_work_size(0, 1);
  bv_assembly_kernel.global_work_size(0, 256);
  viennacl::ocl::enqueue(bv_assembly_kernel(g_bv_vcl.handle(), g_bv_vcl.handle1(), g_A_I_J_vcl.handle1(), g_bv_vcl_u.handle(),
                                            g_bv_vcl_u.handle1(), g_A_I_J_u_vcl.handle1(),
                                            g_bv_r_vcl.handle(), g_bv_r_vcl.handle1(), g_A_I_J_r_vcl.handle1(), g_is_update_vcl,
                                            static_cast<cl_uint>(g_I.size())));
  g_bv_vcl.handle() = g_bv_r_vcl.handle();
  g_bv_vcl.handle1() = g_bv_r_vcl.handle1();

  g_A_I_J_vcl.handle() = g_A_I_J_r_vcl.handle();
  g_A_I_J_vcl.handle2() = g_A_I_J_r_vcl.handle2();
  g_A_I_J_vcl.handle1() = g_A_I_J_r_vcl.handle1();
}
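
// After assembly, the handles of g_A_I_J_vcl and g_bv_vcl are redirected to
// the freshly assembled R blocks and beta vectors, so the next update sweep
// operates on the grown factorization directly on the device.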

/** @brief GPU-based block update
 *
 * @param A            sparse matrix
 * @param A_v_c        vectorized column-wise initial matrix
 * @param g_is_update  container of indicators showing which blocks should be modified
 * @param g_res        container of residuals for all columns
 * @param g_J          container of column index sets for all columns
 * @param g_I          container of row index sets for all columns
 * @param g_A_I_J_vcl  container of block matrices from the previous update
 * @param g_bv_vcl     block of beta vectors from the previous iteration
 * @param tag          SPAI configuration tag
 */
template<typename NumericT, unsigned int AlignmentV, typename SparseVectorT>
void block_update(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
                  std::vector<SparseVectorT> const & A_v_c,
                  std::vector<cl_uint> & g_is_update,
                  std::vector<SparseVectorT> & g_res,
                  std::vector<std::vector<unsigned int> > & g_J,
                  std::vector<std::vector<unsigned int> > & g_I,
                  block_matrix & g_A_I_J_vcl,
                  block_vector & g_bv_vcl,
                  spai_tag const & tag)
{
  viennacl::context ctx = viennacl::traits::context(A);
  // updated index set for columns
  std::vector<std::vector<unsigned int> > g_J_u(g_J.size());
  // updated index set for rows
  std::vector<std::vector<unsigned int> > g_I_u(g_J.size());
  // mixed index set of old and updated indices for rows
  std::vector<std::vector<unsigned int> > g_I_q(g_J.size());
  // GPU memory for A_I_\hatJ
  block_matrix g_A_I_J_u_vcl;
  // GPU memory for A_\hatI_\hatJ
  block_matrix g_A_I_u_J_u_vcl;
  bool is_empty_block;
  // GPU memory for the new b_v
  block_vector g_bv_u_vcl;

#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for
#endif
  for (long i = 0; i < static_cast<long>(g_J.size()); ++i)
  {
    if (g_is_update[static_cast<vcl_size_t>(i)])
    {
      if (buildAugmentedIndexSet<SparseVectorT, NumericT>(A_v_c, g_res[static_cast<vcl_size_t>(i)], g_J[static_cast<vcl_size_t>(i)], g_J_u[static_cast<vcl_size_t>(i)], tag))
          buildNewRowSet(A_v_c, g_I[static_cast<vcl_size_t>(i)], g_J_u[static_cast<vcl_size_t>(i)], g_I_u[static_cast<vcl_size_t>(i)]);
    }
  }
  // assemble the new A_I_J_u blocks on the GPU and multiply them with Q'
  block_assembly(A, g_J_u, g_I, g_A_I_J_u_vcl, g_is_update, is_empty_block);
  // matrix A_I_J_u is now ready
  block_q_multiplication<NumericT>(g_J_u, g_I, g_A_I_J_vcl, g_bv_vcl, g_A_I_J_u_vcl, g_is_update, ctx);
  // assemble A_\hatI_\hatJ
  block_assembly(A, g_J_u, g_I_u, g_A_I_u_J_u_vcl, g_is_update, is_empty_block);
  assemble_qr_block<NumericT>(g_J, g_I, g_J_u, g_I_u, g_I_q, g_A_I_J_u_vcl, g_A_I_J_vcl.handle1(),
                              g_A_I_u_J_u_vcl, g_is_update, is_empty_block, ctx);

  block_qr<NumericT>(g_I_q, g_J_u, g_A_I_u_J_u_vcl, g_bv_u_vcl, g_is_update, ctx);
  // concatenation of new and old indices
#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for
#endif
  for (long i = 0; i < static_cast<long>(g_J.size()); ++i)
  {
    g_J[static_cast<vcl_size_t>(i)].insert(g_J[static_cast<vcl_size_t>(i)].end(), g_J_u[static_cast<vcl_size_t>(i)].begin(), g_J_u[static_cast<vcl_size_t>(i)].end());
    g_I[static_cast<vcl_size_t>(i)].insert(g_I[static_cast<vcl_size_t>(i)].end(), g_I_u[static_cast<vcl_size_t>(i)].begin(), g_I_u[static_cast<vcl_size_t>(i)].end());
  }
  assemble_r<NumericT>(g_I, g_J, g_A_I_J_vcl, g_A_I_J_u_vcl, g_A_I_u_J_u_vcl,  g_bv_vcl,  g_bv_u_vcl, g_is_update, ctx);
}
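
// The GPU sweep mirrors the CPU path above: index-set augmentation runs on
// the host (OpenMP-parallel), while block assembly, the Q^T multiplication,
// the QR factorization and the final R/beta assembly run on the device.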

} // namespace spai
} // namespace detail
} // namespace linalg
} // namespace viennacl
#endif