#ifndef VIENNACL_LINALG_DETAIL_SPAI_SPAI_DYNAMIC_HPP
#define VIENNACL_LINALG_DETAIL_SPAI_SPAI_DYNAMIC_HPP

/* =========================================================================
   Copyright (c) 2010-2016, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */
/** @file viennacl/linalg/detail/spai/spai-dynamic.hpp
    @brief Implementation of a dynamic SPAI. Provides the routines for automatic pattern updates. Experimental.

    SPAI code contributed by Nikolay Lukash
*/

#include <utility>
#include <iostream>
#include <fstream>
#include <string>
#include <algorithm>
#include <vector>
#include <cmath>    // for std::sqrt() and std::fabs() used below
#include <map>
//#include "block_matrix.hpp"
//#include "block_vector.hpp"
//#include "benchmark-utils.hpp"
#include "boost/numeric/ublas/vector.hpp"
#include "boost/numeric/ublas/matrix.hpp"
#include "boost/numeric/ublas/matrix_proxy.hpp"
#include "boost/numeric/ublas/vector_proxy.hpp"
#include "boost/numeric/ublas/storage.hpp"
#include "boost/numeric/ublas/io.hpp"
#include "boost/numeric/ublas/lu.hpp"
#include "boost/numeric/ublas/triangular.hpp"
#include "boost/numeric/ublas/matrix_expression.hpp"
// ViennaCL includes
#include "viennacl/linalg/prod.hpp"
#include "viennacl/matrix.hpp"
#include "viennacl/compressed_matrix.hpp"
#include "viennacl/linalg/sparse_matrix_operations.hpp"
#include "viennacl/linalg/matrix_operations.hpp"
#include "viennacl/scalar.hpp"
#include "viennacl/linalg/cg.hpp"
#include "viennacl/linalg/inner_prod.hpp"
#include "viennacl/linalg/ilu.hpp"
#include "viennacl/ocl/backend.hpp"

#include "viennacl/linalg/detail/spai/block_matrix.hpp"
#include "viennacl/linalg/detail/spai/block_vector.hpp"
#include "viennacl/linalg/detail/spai/qr.hpp"
#include "viennacl/linalg/detail/spai/spai-static.hpp"
#include "viennacl/linalg/detail/spai/spai.hpp"
#include "viennacl/linalg/detail/spai/spai_tag.hpp"
#include "viennacl/linalg/opencl/kernels/spai.hpp"

namespace viennacl
{
namespace linalg
{
namespace detail
{
namespace spai
{

/** @brief Helper functor for comparing std::pair<> objects by their second member, in descending order. */
struct CompareSecond
{
  template<typename T1, typename T2>
  bool operator()(std::pair<T1, T2> const & left, std::pair<T1, T2> const & right)
  {
    return static_cast<double>(left.second) > static_cast<double>(right.second);
  }
};
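
// Used with std::sort() in buildAugmentedIndexSet() below, so that candidate
// column indices end up ordered by descending residual-reduction value.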


/** @brief Composition of a new matrix R, to be used in least-squares problem solving
 *
 * @param A    matrix Q'*A(I, \\tilde J), where \\tilde J is the set of new column indices
 * @param R_n  matrix A_Iu_J_u after QR factorization
 * @param R    previously composed matrix R
 */
template<typename MatrixT>
void composeNewR(MatrixT const & A,
                 MatrixT const & R_n,
                 MatrixT & R)
{
  typedef typename MatrixT::value_type NumericType;

  vcl_size_t row_n = R_n.size1() - (A.size1() - R.size2());
  MatrixT C = boost::numeric::ublas::zero_matrix<NumericType>(R.size1() + row_n, R.size2() + A.size2());

  // write the original R to the new composite R
  boost::numeric::ublas::project(C, boost::numeric::ublas::range(0, R.size1()), boost::numeric::ublas::range(0, R.size2())) += R;
  // write the upper part of Q'*A_I_\hatJ: all columns, and as many rows as R.size2()
  boost::numeric::ublas::project(C, boost::numeric::ublas::range(0, R.size2()), boost::numeric::ublas::range(R.size2(), R.size2() + A.size2())) +=
  boost::numeric::ublas::project(A, boost::numeric::ublas::range(0, R.size2()), boost::numeric::ublas::range(0, A.size2()));

  // add the QR-decomposed block to the composite R
  if (R_n.size1() > 0 && R_n.size2() > 0)
    boost::numeric::ublas::project(C,
                                   boost::numeric::ublas::range(R.size2(), R.size1() + row_n),
                                   boost::numeric::ublas::range(R.size2(), R.size2() + A.size2())) += R_n;
  R = C;
}
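
// Shape sketch (hypothetical sizes, not taken from a concrete SPAI run): let R
// be the 3x2 result of factoring A(I, J) with |I| = 3, |J| = 2, let A = Q'*A(I, \tilde J)
// hold one new column (3x1), and let R_n be the 3x1 result of factoring the
// stacked block built by QRBlockComposition() below, which has
// (|I| - |J|) + |\tilde I| = 1 + 2 = 3 rows for |\tilde I| = 2. Then
// row_n = 3 - (3 - 2) = 2, and the composite R comes out as 5x3:
//
//   boost::numeric::ublas::matrix<double> R(3, 2), A(3, 1), R_n(3, 1);
//   // ... fill R, A, R_n ...
//   composeNewR(A, R_n, R);   // R is now 5x3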

/** @brief Composition of a new vector of coefficients beta from QR factorizations (necessary for Q recovery)
 *
 * @param v_n  new vector from the last QR factorization
 * @param v    composition of the previous vectors from QR factorizations
 */
template<typename VectorT>
void composeNewVector(VectorT const & v_n,
                      VectorT & v)
{
  typedef typename VectorT::value_type NumericType;

  VectorT w = boost::numeric::ublas::zero_vector<NumericType>(v.size() + v_n.size());
  boost::numeric::ublas::project(w, boost::numeric::ublas::range(0, v.size())) += v;
  boost::numeric::ublas::project(w, boost::numeric::ublas::range(v.size(), v.size() + v_n.size())) += v_n;
  v = w;
}
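
// In effect v = (v, v_n), i.e. plain concatenation. A minimal sketch with
// hypothetical values:
//
//   boost::numeric::ublas::vector<double> v(2), v_n(1);
//   v(0) = 0.5; v(1) = 0.25; v_n(0) = 0.75;
//   composeNewVector(v_n, v);   // v is now (0.5, 0.25, 0.75)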

/** @brief Computation of the Euclidean norm of a sparse vector
 *
 * @param v     initial sparse vector
 * @param norm  scalar receiving the Euclidean norm; accumulated into, so the caller must zero-initialize it
 */
template<typename SparseVectorT, typename NumericT>
void sparse_norm_2(SparseVectorT const & v,
                   NumericT & norm)
{
  for (typename SparseVectorT::const_iterator vec_it = v.begin(); vec_it != v.end(); ++vec_it)
    norm += (vec_it->second)*(vec_it->second);

  norm = std::sqrt(norm);
}
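
// Minimal usage sketch, assuming a std::map-based sparse vector (any type with
// the (index, value) pair iterators used above works):
//
//   std::map<unsigned int, double> v;
//   v[0] = 3.0; v[4] = 4.0;
//   double norm = 0;
//   sparse_norm_2(v, norm);   // norm == 5.0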

/** @brief Dot product of two sparse vectors
 *
 * @param v1     the first sparse vector
 * @param v2     the second sparse vector
 * @param res_v  scalar receiving the dot product; accumulated into, so the caller must zero-initialize it
 */
template<typename SparseVectorT, typename NumericT>
void sparse_inner_prod(SparseVectorT const & v1,
                       SparseVectorT const & v2,
                       NumericT & res_v)
{
  typename SparseVectorT::const_iterator v_it1 = v1.begin();
  typename SparseVectorT::const_iterator v_it2 = v2.begin();

  while ((v_it1 != v1.end()) && (v_it2 != v2.end()))
  {
    if (v_it1->first == v_it2->first)
    {
      res_v += (v_it1->second)*(v_it2->second);
      ++v_it1;
      ++v_it2;
    }
    else if (v_it1->first < v_it2->first)
      ++v_it1;
    else
      ++v_it2;
  }
}
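
// Minimal usage sketch (again with hypothetical std::map-based vectors). The
// merge-style traversal above assumes entries sorted by index, which std::map
// guarantees:
//
//   std::map<unsigned int, double> v1, v2;
//   v1[0] = 2.0; v1[3] = 1.0;
//   v2[3] = 5.0; v2[7] = 1.0;
//   double res = 0;
//   sparse_inner_prod(v1, v2, res);   // res == 5.0 (only index 3 overlaps)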

/** @brief Building a new set of column indices J_u, cf. Kallischko dissertation p.31
 *
 * @param A_v_c  vectorized column-wise initial matrix
 * @param res    residual vector
 * @param J      set of column indices
 * @param J_u    set of new column indices
 * @param tag    SPAI tag with parameters
 */
template<typename SparseVectorT, typename NumericT>
bool buildAugmentedIndexSet(std::vector<SparseVectorT> const & A_v_c,
                            SparseVectorT const & res,
                            std::vector<unsigned int> & J,
                            std::vector<unsigned int> & J_u,
                            spai_tag const & tag)
{
  std::vector<std::pair<unsigned int, NumericT> > p;
  vcl_size_t cur_size = 0;
  NumericT inprod, norm2;

  for (typename SparseVectorT::const_iterator res_it = res.begin(); res_it != res.end(); ++res_it)
  {
    if (!isInIndexSet(J, res_it->first) && (std::fabs(res_it->second) > tag.getResidualThreshold()))
    {
      inprod = norm2 = 0;
      sparse_inner_prod(res, A_v_c[res_it->first], inprod);
      sparse_norm_2(A_v_c[res_it->first], norm2);
      p.push_back(std::pair<unsigned int, NumericT>(res_it->first, (inprod*inprod)/(norm2*norm2)));
    }
  }

  std::sort(p.begin(), p.end(), CompareSecond());
  while ((cur_size < J.size()) && (p.size() > 0))
  {
    J_u.push_back(p[0].first);
    p.erase(p.begin());
    cur_size++;
  }
  p.clear();
  return (cur_size > 0);
}
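
// Selection criterion implemented above (cf. Kallischko p.31): every candidate
// column index j outside J whose residual entry exceeds the threshold is ranked
// by the one-dimensional residual reduction
//
//   rho_j = (res, A e_j)^2 / ||A e_j||^2,
//
// and at most J.size() of the highest-ranked candidates are appended to J_u.
// The return value signals whether any new index was found at all.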

/** @brief Building a set I_n of new row indices to augment the current set, cf. Kallischko dissertation p.32
 *
 * @param A_v_c  vectorized column-wise initial matrix
 * @param I      set of previously determined row indices
 * @param J_n    set of new column indices
 * @param I_n    set of new row indices
 */
template<typename SparseVectorT>
void buildNewRowSet(std::vector<SparseVectorT> const & A_v_c,
                    std::vector<unsigned int> const & I,
                    std::vector<unsigned int> const & J_n,
                    std::vector<unsigned int> & I_n)
{
  for (vcl_size_t i = 0; i < J_n.size(); ++i)
  {
    for (typename SparseVectorT::const_iterator col_it = A_v_c[J_n[i]].begin(); col_it != A_v_c[J_n[i]].end(); ++col_it)
    {
      if (!isInIndexSet(I, col_it->first) && !isInIndexSet(I_n, col_it->first))
        I_n.push_back(col_it->first);
    }
  }
}
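
// In effect, I_n collects the row indices of all nonzeros in the new columns
// A(:, J_n) that are covered neither by I nor by the part of I_n gathered so
// far, so the augmented block A(I u I_n, J u J_n) again contains every nonzero
// row of the selected columns.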

/** @brief Composition of a new block for QR factorization, cf. Kallischko dissertation p.82, figure 4.7
 *
 * @param A_I_J      previously composed block
 * @param A_I_J_u    matrix Q'*A(I, \\tilde J), where \\tilde J is the set of new column indices
 * @param A_I_u_J_u  composition of the lower part of A(I, \\tilde J) and of A(\\tilde I, \\tilde J); the new block for QR decomposition
 */
template<typename MatrixT>
void QRBlockComposition(MatrixT const & A_I_J,
                        MatrixT const & A_I_J_u,
                        MatrixT & A_I_u_J_u)
{
  typedef typename MatrixT::value_type NumericType;

  vcl_size_t row_n1 = A_I_J_u.size1() - A_I_J.size2();
  vcl_size_t row_n2 = A_I_u_J_u.size1();
  vcl_size_t row_n = row_n1 + row_n2;
  vcl_size_t col_n = A_I_J_u.size2();

  MatrixT C = boost::numeric::ublas::zero_matrix<NumericType>(row_n, col_n);
  boost::numeric::ublas::project(C,
                                 boost::numeric::ublas::range(0, row_n1),
                                 boost::numeric::ublas::range(0, col_n))
  += boost::numeric::ublas::project(A_I_J_u,
                                    boost::numeric::ublas::range(A_I_J.size2(), A_I_J_u.size1()),
                                    boost::numeric::ublas::range(0, col_n));

  boost::numeric::ublas::project(C,
                                 boost::numeric::ublas::range(row_n1, row_n1 + row_n2),
                                 boost::numeric::ublas::range(0, col_n)) += A_I_u_J_u;
  A_I_u_J_u = C;
}
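
// Block layout produced above: with r = A_I_J.size2() (the number of columns
// already factored),
//
//   C = [ A_I_J_u(r : A_I_J_u.size1()-1, :) ]   // leftover rows of Q'*A(I, \tilde J)
//       [ A_I_u_J_u                         ]   // rows for the new indices \tilde I
//
// This is exactly the block handed to single_qr() in block_update() below.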

/** @brief CPU-based dynamic update for the SPAI preconditioner
 *
 * @param A            initial sparse matrix
 * @param A_v_c        vectorized column-wise initial matrix
 * @param g_res        container of residuals for all columns
 * @param g_is_update  container of indicators that show which blocks should be modified
 * @param g_I          container of row index sets for all columns
 * @param g_J          container of column index sets for all columns
 * @param g_b_v        container of vectors of beta for Q recovery (cf. Golub and Van Loan, "Matrix Computations", 3rd edition, p.211)
 * @param g_A_I_J      container of block matrices from the previous update
 * @param tag          SPAI configuration tag
 */
template<typename SparseMatrixT,
         typename SparseVectorT,
         typename DenseMatrixT,
         typename VectorT>
void block_update(SparseMatrixT const & A,
                  std::vector<SparseVectorT> const & A_v_c,
                  std::vector<SparseVectorT> & g_res,
                  std::vector<bool> & g_is_update,
                  std::vector<std::vector<unsigned int> > & g_I,
                  std::vector<std::vector<unsigned int> > & g_J,
                  std::vector<VectorT> & g_b_v,
                  std::vector<DenseMatrixT> & g_A_I_J,
                  spai_tag const & tag)
{
  typedef typename DenseMatrixT::value_type NumericType;

  std::vector<std::vector<unsigned int> > g_J_u(g_J.size());   // set of new column indices
  std::vector<std::vector<unsigned int> > g_I_u(g_J.size());   // set of new row indices
  std::vector<DenseMatrixT> g_A_I_J_u(g_J.size());             // matrix A(I, \tilde J), cf. Kallischko p.31-32
  std::vector<DenseMatrixT> g_A_I_u_J_u(g_J.size());           // matrix A(\tilde I, \tilde J), cf. Kallischko
  std::vector<VectorT> g_b_v_u(g_J.size());                    // new vector of beta coefficients from QR factorization

#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for
#endif
  for (long i = 0; i < static_cast<long>(g_J.size()); ++i)
  {
    if (g_is_update[static_cast<vcl_size_t>(i)])
    {
      if (buildAugmentedIndexSet<SparseVectorT, NumericType>(A_v_c, g_res[static_cast<vcl_size_t>(i)], g_J[static_cast<vcl_size_t>(i)], g_J_u[static_cast<vcl_size_t>(i)], tag))
      {
        // initialize matrix A_I_\hatJ
        initProjectSubMatrix(A, g_J_u[static_cast<vcl_size_t>(i)], g_I[static_cast<vcl_size_t>(i)], g_A_I_J_u[static_cast<vcl_size_t>(i)]);
        // multiplication Q'*A_I_\hatJ
        apply_q_trans_mat(g_A_I_J[static_cast<vcl_size_t>(i)], g_b_v[static_cast<vcl_size_t>(i)], g_A_I_J_u[static_cast<vcl_size_t>(i)]);
        // build the new row index set \hatI
        buildNewRowSet(A_v_c, g_I[static_cast<vcl_size_t>(i)], g_J_u[static_cast<vcl_size_t>(i)], g_I_u[static_cast<vcl_size_t>(i)]);
        initProjectSubMatrix(A, g_J_u[static_cast<vcl_size_t>(i)], g_I_u[static_cast<vcl_size_t>(i)], g_A_I_u_J_u[static_cast<vcl_size_t>(i)]);
        // composition of the block for the new QR factorization
        QRBlockComposition(g_A_I_J[static_cast<vcl_size_t>(i)], g_A_I_J_u[static_cast<vcl_size_t>(i)], g_A_I_u_J_u[static_cast<vcl_size_t>(i)]);
        // QR factorization
        single_qr(g_A_I_u_J_u[static_cast<vcl_size_t>(i)], g_b_v_u[static_cast<vcl_size_t>(i)]);
        // composition of the new R and the new vector b_v
        composeNewR(g_A_I_J_u[static_cast<vcl_size_t>(i)], g_A_I_u_J_u[static_cast<vcl_size_t>(i)], g_A_I_J[static_cast<vcl_size_t>(i)]);
        composeNewVector(g_b_v_u[static_cast<vcl_size_t>(i)], g_b_v[static_cast<vcl_size_t>(i)]);
        // composition of the new sets I and J
        g_J[static_cast<vcl_size_t>(i)].insert(g_J[static_cast<vcl_size_t>(i)].end(), g_J_u[static_cast<vcl_size_t>(i)].begin(), g_J_u[static_cast<vcl_size_t>(i)].end());
        g_I[static_cast<vcl_size_t>(i)].insert(g_I[static_cast<vcl_size_t>(i)].end(), g_I_u[static_cast<vcl_size_t>(i)].begin(), g_I_u[static_cast<vcl_size_t>(i)].end());
      }
      else
      {
        g_is_update[static_cast<vcl_size_t>(i)] = false;
      }
    }
  }
}


/**************************************************** GPU SPAI Update ****************************************************************/


/** @brief Performs the multiplication Q'*A(I, \\tilde J) on the GPU
 *
 * @param g_J_u          container of the sets of new column indices
 * @param g_I            container of row indices
 * @param g_A_I_J_vcl    block matrix composed of the previous blocks; these are the blocks of R
 * @param g_bv_vcl       block of beta vectors
 * @param g_A_I_J_u_vcl  block of matrices A(I, \\tilde J)
 * @param g_is_update    indicators that show whether a certain block should be processed
 * @param ctx            Optional context in which the auxiliary data is created (one out of multiple OpenCL contexts, CUDA, host)
 */
template<typename NumericT>
void block_q_multiplication(std::vector<std::vector<unsigned int> > const & g_J_u,
                            std::vector<std::vector<unsigned int> > const & g_I,
                            block_matrix & g_A_I_J_vcl,
                            block_vector & g_bv_vcl,
                            block_matrix & g_A_I_J_u_vcl,
                            std::vector<cl_uint> & g_is_update,
                            viennacl::context ctx)
{
  viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());
  unsigned int local_r_n = 0;
  unsigned int local_c_n = 0;
  unsigned int sz_blocks = 0;

  get_max_block_size(g_I,   local_r_n);
  get_max_block_size(g_J_u, local_c_n);

  // block size bookkeeping
  std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
  std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
  compute_blocks_size(g_I, g_J_u, sz_blocks, blocks_ind, matrix_dims);
  //std::vector<ScalarType> con_A_I_J(sz_blocks, static_cast<ScalarType>(0));

  viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                                           static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
                                                                           &(g_is_update[0]));
  viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx);
  viennacl::ocl::kernel & block_q_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_q_mult");

  block_q_kernel.local_work_size(0, local_c_n);
  block_q_kernel.global_work_size(0, 128*local_c_n);
  viennacl::ocl::enqueue(block_q_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(),
                                        g_bv_vcl.handle(),
                                        g_bv_vcl.handle1(), g_A_I_J_vcl.handle1(), g_A_I_J_u_vcl.handle1(), g_is_update_vcl,
                                        viennacl::ocl::local_mem(static_cast<unsigned int>(sizeof(NumericT)*(local_r_n*local_c_n))),
                                        static_cast<cl_uint>(g_I.size())));
}

/** @brief Assembly of the container of index row sets I_q: the row indices for the new "QR blocks"
 *
 * @param g_I    container of row indices
 * @param g_J    container of column indices
 * @param g_I_u  container of new row indices
 * @param g_I_q  container of row indices for the new QR blocks
 */
template<typename SizeT>
void assemble_qr_row_inds(std::vector<std::vector<SizeT> > const & g_I,
                          std::vector<std::vector<SizeT> > const & g_J,
                          std::vector<std::vector<SizeT> > const & g_I_u,
                          std::vector<std::vector<SizeT> > & g_I_q)
{
#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for
#endif
  for (long i = 0; i < static_cast<long>(g_I.size()); ++i)
  {
    for (vcl_size_t j = g_J[static_cast<vcl_size_t>(i)].size(); j < g_I[static_cast<vcl_size_t>(i)].size(); ++j)
      g_I_q[static_cast<vcl_size_t>(i)].push_back(g_I[static_cast<vcl_size_t>(i)][j]);

    for (vcl_size_t j = 0; j < g_I_u[static_cast<vcl_size_t>(i)].size(); ++j)
      g_I_q[static_cast<vcl_size_t>(i)].push_back(g_I_u[static_cast<vcl_size_t>(i)][j]);
  }
}
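
// For each block this yields I_q = ( I[J.size():], I_u ): the rows of the old
// index set lying below the already factored triangle, followed by all newly
// added rows. These are exactly the rows entering the next QR factorization.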

/** @brief Performs the assembly of the new QR blocks
 *
 * @param g_J                container of column indices
 * @param g_I                container of row indices
 * @param g_J_u              container of new column indices
 * @param g_I_u              container of new row indices
 * @param g_I_q              container of row indices for the new QR blocks
 * @param g_A_I_J_u_vcl      blocks of Q'*A(I, \\tilde J)
 * @param matrix_dimensions  array with the matrix dimensions of all blocks
 * @param g_A_I_u_J_u_vcl    blocks A(\\tilde I, \\tilde J)
 * @param g_is_update        container with update indicators
 * @param is_empty_block     indicates whether all previous blocks A(\\tilde I, \\tilde J) are empty; if so, a kernel with a smaller number of arguments is used
 * @param ctx                Optional context in which the matrix is created (one out of multiple OpenCL contexts, CUDA, host)
 */
template<typename NumericT>
void assemble_qr_block(std::vector<std::vector<unsigned int> > const & g_J,
                       std::vector<std::vector<unsigned int> > const & g_I,
                       std::vector<std::vector<unsigned int> > const & g_J_u,
                       std::vector<std::vector<unsigned int> > const & g_I_u,
                       std::vector<std::vector<unsigned int> > & g_I_q,
                       block_matrix & g_A_I_J_u_vcl,
                       viennacl::ocl::handle<cl_mem> & matrix_dimensions,
                       block_matrix & g_A_I_u_J_u_vcl,
                       std::vector<cl_uint> & g_is_update,
                       bool is_empty_block,
                       viennacl::context ctx)
{
  viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());

  //std::vector<std::vector<unsigned int> > g_I_q(g_I.size());
  assemble_qr_row_inds(g_I, g_J, g_I_u, g_I_q);
  unsigned int sz_blocks;
  std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
  std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));

  compute_blocks_size(g_I_q, g_J_u, sz_blocks, blocks_ind, matrix_dims);

  std::vector<NumericT> con_A_I_J_q(sz_blocks, static_cast<NumericT>(0));

  block_matrix g_A_I_J_q_vcl;
  // allocate memory for the QR blocks
  g_A_I_J_q_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                    static_cast<unsigned int>(sizeof(NumericT)*sz_blocks),
                                                    &(con_A_I_J_q[0]));
  g_A_I_J_q_vcl.handle().context(opencl_ctx);

  g_A_I_J_q_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                     static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<unsigned int>(g_I.size())),
                                                     &(matrix_dims[0]));
  g_A_I_J_q_vcl.handle1().context(opencl_ctx);

  g_A_I_J_q_vcl.handle2() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                     static_cast<unsigned int>(sizeof(cl_uint)*static_cast<unsigned int>(g_I.size() + 1)),
                                                     &(blocks_ind[0]));
  g_A_I_J_q_vcl.handle2().context(opencl_ctx);

  viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                                           static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
                                                                           &(g_is_update[0]));

  viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx);
  if (!is_empty_block)
  {
    viennacl::ocl::kernel & qr_assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_qr_assembly");
    qr_assembly_kernel.local_work_size(0, 1);
    qr_assembly_kernel.global_work_size(0, 256);
    viennacl::ocl::enqueue(qr_assembly_kernel(matrix_dimensions,
                                              g_A_I_J_u_vcl.handle(),
                                              g_A_I_J_u_vcl.handle2(),
                                              g_A_I_J_u_vcl.handle1(),
                                              g_A_I_u_J_u_vcl.handle(),
                                              g_A_I_u_J_u_vcl.handle2(),
                                              g_A_I_u_J_u_vcl.handle1(),
                                              g_A_I_J_q_vcl.handle(),
                                              g_A_I_J_q_vcl.handle2(),
                                              g_A_I_J_q_vcl.handle1(),
                                              g_is_update_vcl,
                                              static_cast<unsigned int>(g_I.size())));
  }
  else
  {
    viennacl::ocl::kernel & qr_assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_qr_assembly_1");
    qr_assembly_kernel.local_work_size(0, 1);
    qr_assembly_kernel.global_work_size(0, 256);
    viennacl::ocl::enqueue(qr_assembly_kernel(matrix_dimensions, g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(),
                                              g_A_I_J_u_vcl.handle1(),
                                              g_A_I_J_q_vcl.handle(),
                                              g_A_I_J_q_vcl.handle2(), g_A_I_J_q_vcl.handle1(),
                                              g_is_update_vcl,
                                              static_cast<unsigned int>(g_I.size())));
  }
  g_A_I_u_J_u_vcl.handle()  = g_A_I_J_q_vcl.handle();
  g_A_I_u_J_u_vcl.handle1() = g_A_I_J_q_vcl.handle1();
  g_A_I_u_J_u_vcl.handle2() = g_A_I_J_q_vcl.handle2();
}

/** @brief Performs the assembly of the new R matrix on the GPU
 *
 * @param g_I              container of row indices
 * @param g_J              container of column indices
 * @param g_A_I_J_vcl      container of block matrices from the previous update
 * @param g_A_I_J_u_vcl    container of block matrices Q'*A(I, \\tilde J)
 * @param g_A_I_u_J_u_vcl  container of block matrices QR-factored in the current iteration
 * @param g_bv_vcl         block of beta vectors from the previous iteration
 * @param g_bv_vcl_u       block of updated beta vectors obtained from the most recent QR factorization
 * @param g_is_update      container of indicators that show which blocks should be modified
 * @param ctx              Optional context in which the auxiliary data is created (one out of multiple OpenCL contexts, CUDA, host)
 */
template<typename NumericT>
void assemble_r(std::vector<std::vector<unsigned int> > & g_I,
                std::vector<std::vector<unsigned int> > & g_J,
                block_matrix & g_A_I_J_vcl,
                block_matrix & g_A_I_J_u_vcl,
                block_matrix & g_A_I_u_J_u_vcl,
                block_vector & g_bv_vcl,
                block_vector & g_bv_vcl_u,
                std::vector<cl_uint> & g_is_update,
                viennacl::context ctx)
{
  viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());
  std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
  std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
  std::vector<cl_uint> start_bv_r_inds(g_I.size() + 1, 0);
  unsigned int sz_blocks, bv_size;

  compute_blocks_size(g_I, g_J, sz_blocks, blocks_ind, matrix_dims);
  get_size(g_J, bv_size);
  init_start_inds(g_J, start_bv_r_inds);

  std::vector<NumericT> con_A_I_J_r(sz_blocks, static_cast<NumericT>(0));
  std::vector<NumericT> b_v_r(bv_size, static_cast<NumericT>(0));

  block_matrix g_A_I_J_r_vcl;
  block_vector g_bv_r_vcl;
  g_A_I_J_r_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                    static_cast<unsigned int>(sizeof(NumericT)*sz_blocks),
                                                    &(con_A_I_J_r[0]));
  g_A_I_J_r_vcl.handle().context(opencl_ctx);

  g_A_I_J_r_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                     static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<unsigned int>(g_I.size())),
                                                     &(matrix_dims[0]));
  g_A_I_J_r_vcl.handle1().context(opencl_ctx);

  g_A_I_J_r_vcl.handle2() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                     static_cast<unsigned int>(sizeof(cl_uint)*static_cast<unsigned int>(g_I.size() + 1)),
                                                     &(blocks_ind[0]));
  g_A_I_J_r_vcl.handle2().context(opencl_ctx);

  g_bv_r_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                 static_cast<unsigned int>(sizeof(NumericT)*bv_size),
                                                 &(b_v_r[0]));
  g_bv_r_vcl.handle().context(opencl_ctx);

  g_bv_r_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                  static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
                                                  &(start_bv_r_inds[0]));
  g_bv_r_vcl.handle1().context(opencl_ctx);

  viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                                           static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
                                                                           &(g_is_update[0]));
  viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx);
  viennacl::ocl::kernel & r_assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_r_assembly");
  r_assembly_kernel.local_work_size(0, 1);
  r_assembly_kernel.global_work_size(0, 256);

  viennacl::ocl::enqueue(r_assembly_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_A_I_J_vcl.handle1(),
                                           g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(), g_A_I_J_u_vcl.handle1(),
                                           g_A_I_u_J_u_vcl.handle(), g_A_I_u_J_u_vcl.handle2(), g_A_I_u_J_u_vcl.handle1(),
                                           g_A_I_J_r_vcl.handle(), g_A_I_J_r_vcl.handle2(), g_A_I_J_r_vcl.handle1(),
                                           g_is_update_vcl, static_cast<cl_uint>(g_I.size())));

  viennacl::ocl::kernel & bv_assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_bv_assembly");
  bv_assembly_kernel.local_work_size(0, 1);
  bv_assembly_kernel.global_work_size(0, 256);
  viennacl::ocl::enqueue(bv_assembly_kernel(g_bv_vcl.handle(), g_bv_vcl.handle1(), g_A_I_J_vcl.handle1(), g_bv_vcl_u.handle(),
                                            g_bv_vcl_u.handle1(), g_A_I_J_u_vcl.handle1(),
                                            g_bv_r_vcl.handle(), g_bv_r_vcl.handle1(), g_A_I_J_r_vcl.handle1(), g_is_update_vcl,
                                            static_cast<cl_uint>(g_I.size())));
  g_bv_vcl.handle()  = g_bv_r_vcl.handle();
  g_bv_vcl.handle1() = g_bv_r_vcl.handle1();

  g_A_I_J_vcl.handle()  = g_A_I_J_r_vcl.handle();
  g_A_I_J_vcl.handle2() = g_A_I_J_r_vcl.handle2();
  g_A_I_J_vcl.handle1() = g_A_I_J_r_vcl.handle1();
}
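
// Since block_matrix and block_vector only wrap OpenCL memory handles, the
// final assignments above swap the freshly assembled R blocks and beta vectors
// into place without copying any data back to the host.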

/** @brief GPU-based block update
 *
 * @param A            sparse matrix
 * @param A_v_c        vectorized column-wise initial matrix
 * @param g_is_update  container of indicators that show which blocks should be modified
 * @param g_res        container of residuals for all columns
 * @param g_J          container of column index sets for all columns
 * @param g_I          container of row index sets for all columns
 * @param g_A_I_J_vcl  container of block matrices from the previous update
 * @param g_bv_vcl     block of beta vectors from the previous iteration
 * @param tag          SPAI configuration tag
 */
template<typename NumericT, unsigned int AlignmentV, typename SparseVectorT>
void block_update(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
                  std::vector<SparseVectorT> const & A_v_c,
                  std::vector<cl_uint> & g_is_update,
                  std::vector<SparseVectorT> & g_res,
                  std::vector<std::vector<unsigned int> > & g_J,
                  std::vector<std::vector<unsigned int> > & g_I,
                  block_matrix & g_A_I_J_vcl,
                  block_vector & g_bv_vcl,
                  spai_tag const & tag)
{
  viennacl::context ctx = viennacl::traits::context(A);
  // updated index set for columns
  std::vector<std::vector<unsigned int> > g_J_u(g_J.size());
  // updated index set for rows
  std::vector<std::vector<unsigned int> > g_I_u(g_J.size());
  // mixed index set of old and updated row indices
  std::vector<std::vector<unsigned int> > g_I_q(g_J.size());
  // GPU memory for A_I_\hatJ
  block_matrix g_A_I_J_u_vcl;
  // GPU memory for A_\hatI_\hatJ
  block_matrix g_A_I_u_J_u_vcl;
  bool is_empty_block;
  // GPU memory for the new b_v
  block_vector g_bv_u_vcl;

#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for
#endif
  for (long i = 0; i < static_cast<long>(g_J.size()); ++i)
  {
    if (g_is_update[static_cast<vcl_size_t>(i)])
    {
      if (buildAugmentedIndexSet<SparseVectorT, NumericT>(A_v_c, g_res[static_cast<vcl_size_t>(i)], g_J[static_cast<vcl_size_t>(i)], g_J_u[static_cast<vcl_size_t>(i)], tag))
        buildNewRowSet(A_v_c, g_I[static_cast<vcl_size_t>(i)], g_J_u[static_cast<vcl_size_t>(i)], g_I_u[static_cast<vcl_size_t>(i)]);
    }
  }
  // assemble the new A_I_J_u blocks on the GPU and multiply them with Q'
  block_assembly(A, g_J_u, g_I, g_A_I_J_u_vcl, g_is_update, is_empty_block);
  // matrix A_I_J_u is now ready
  block_q_multiplication<NumericT>(g_J_u, g_I, g_A_I_J_vcl, g_bv_vcl, g_A_I_J_u_vcl, g_is_update, ctx);
  // assemble A_\hatI_\hatJ
  block_assembly(A, g_J_u, g_I_u, g_A_I_u_J_u_vcl, g_is_update, is_empty_block);
  assemble_qr_block<NumericT>(g_J, g_I, g_J_u, g_I_u, g_I_q, g_A_I_J_u_vcl, g_A_I_J_vcl.handle1(),
                              g_A_I_u_J_u_vcl, g_is_update, is_empty_block, ctx);

  block_qr<NumericT>(g_I_q, g_J_u, g_A_I_u_J_u_vcl, g_bv_u_vcl, g_is_update, ctx);
  // concatenation of the new and old indices
#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for
#endif
  for (long i = 0; i < static_cast<long>(g_J.size()); ++i)
  {
    g_J[static_cast<vcl_size_t>(i)].insert(g_J[static_cast<vcl_size_t>(i)].end(), g_J_u[static_cast<vcl_size_t>(i)].begin(), g_J_u[static_cast<vcl_size_t>(i)].end());
    g_I[static_cast<vcl_size_t>(i)].insert(g_I[static_cast<vcl_size_t>(i)].end(), g_I_u[static_cast<vcl_size_t>(i)].begin(), g_I_u[static_cast<vcl_size_t>(i)].end());
  }
  assemble_r<NumericT>(g_I, g_J, g_A_I_J_vcl, g_A_I_J_u_vcl, g_A_I_u_J_u_vcl, g_bv_vcl, g_bv_u_vcl, g_is_update, ctx);
}

} // namespace spai
} // namespace detail
} // namespace linalg
} // namespace viennacl

#endif