1 /*
2 
3 Copyright (C) 2008-2021 Michele Martone
4 
5 This file is part of librsb.
6 
7 librsb is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published
9 by the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
11 
12 librsb is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15 License for more details.
16 
17 You should have received a copy of the GNU Lesser General Public
18 License along with librsb; see the file COPYING.
19 If not, see <http://www.gnu.org/licenses/>.
20 
21 */
22 /* @cond INNERDOC  */
23 /*!
24  * @file
25  * @author Michele Martone
26  * @brief
27  * This source file contains functions for sparse recursive multicore triangular solve.
28  */
29 /*
30  * FIXME: the submatrices sorting routines are buggy.
31  * */
32 #include "rsb_internals.h"		/* */
33 #include "rsb_lock.h"		/* */
34 #include "rsb_spsv.h"		/* */
35 
36 #define RSB_WANT_VERBOSE_SPSV	0
37 /* #define RSB_CBLAS_X_SCAL_SPSV rsb__cblas_Xscal */
38 #define RSB_CBLAS_X_SCAL_SPSV rsb__cblas_Xscal_parallel
39 
40 RSB_INTERNALS_COMMON_HEAD_DECLS
41 
rsb_do_spsv_terminal(const struct rsb_mtx_t * mtxAp,const void * x,void * y,const void * alphap,rsb_coo_idx_t incx,rsb_coo_idx_t incy,rsb_trans_t transl RSB_INNER_NRHS_SPSV_ARGS)42 static rsb_err_t rsb_do_spsv_terminal(const struct rsb_mtx_t * mtxAp, const void * x, void * y, const void * alphap, rsb_coo_idx_t incx, rsb_coo_idx_t incy, rsb_trans_t transl RSB_INNER_NRHS_SPSV_ARGS)
43 {
44 	/**
45 	  	\ingroup gr_internals
46 		Entry function for SPSV.
47 		alphap can be NULL.
48 	*/
49 	rsb_err_t errval = RSB_ERR_NO_ERROR;
50 
51 	if(!mtxAp || !y || !x || transl == RSB_INVALID_FLAGS || !rsb__is_square(mtxAp) || !RSB_IS_VALID_INCX_VALUE(incx) || !RSB_IS_VALID_INCX_VALUE(incy))
52 	{
53 		errval = RSB_ERR_BADARGS;
54 		goto ret;
55 	}
56 
57 	/*
58 		FIXME : should handle alphap in a more specialized fashion.
59 	*/
60 
61 #if 0
62 	if(betap && !RSB_IS_ELEMENT_ONE(betap,mtxAp->typecode))
63 	{
64 		if(incy>1)
65 			rsb__cblas_Xscal(mtxAp->typecode,rsb__do_get_columns_of(mtxAp,transl),betap,y,incy);
66 		else
67 		{
68 			/* if should zero the output vector */
69 			if(RSB_IS_ELEMENT_ZERO(betap,mtxAp->typecode))
70 				rsb__cblas_Xscal(mtxAp->typecode,rsb__do_get_columns_of(mtxAp,transl),NULL,y,incy);
71 			else
72 			/* if should scale the output vector */
73 			if(!RSB_IS_ELEMENT_ONE(betap,mtxAp->typecode))
74 				rsb_vector_scale(y,betap,mtxAp->typecode,rsb__do_get_columns_of(mtxAp,transl));
75 		}
76 	}
77 #endif
78 
79 
80 #if RSB_ENABLE_INNER_NRHS_SPSV
81 	{
82 	const size_t lenx=(mtxAp->el_size*rhsnri);
83 	const size_t leny=(mtxAp->el_size*outnri);
84 	rsb_int_t nrhsi=0;
85 	for(nrhsi=0;nrhsi<nrhs;++nrhsi)
86 #endif
87 	{
88 #if RSB_ENABLE_INNER_NRHS_SPSV
89 		void      *out=((      rsb_byte_t*)y)+(leny*nrhsi);
90 		const void*rhs=((const rsb_byte_t*)x)+(lenx*nrhsi);
91 #else
92 		void      *out=((      rsb_byte_t*)y);
93 		const void*rhs=((const rsb_byte_t*)x);
94 #endif /* RSB_ENABLE_INNER_NRHS_SPMV */
95 	if(!alphap || RSB_IS_ELEMENT_ONE(alphap,mtxAp->typecode))
96 	{
97 		if(incy==1 && incx==1)
98 			errval = rsb__do_spsv_uxua(mtxAp,rhs,out,transl);
99 		else
100 			errval = rsb__do_spsv_sxsx(mtxAp,rhs,y,alphap,incx,incy,transl);
101 	}
102 	else
103 		errval = rsb__do_spsv_sxsx(mtxAp,rhs,out,alphap,incx,incy,transl);
104 	}
105 #if RSB_ENABLE_INNER_NRHS_SPSV
106 	}
107 #endif /* RSB_ENABLE_INNER_NRHS_SPMV */
108 ret:
109 	return errval;
110 }
111 
static rsb_err_t rsb_do_spsv_recursive_serial(const struct rsb_mtx_t * mtxAp, const void * x, void * y, const void * alphap, rsb_coo_idx_t incx, rsb_coo_idx_t incy, rsb_trans_t transl, enum rsb_op_flags_t op_flags RSB_INNER_NRHS_SPSV_ARGS)
{
	/**
	  	\ingroup gr_internals
	 *
	 *	Serial recursive triangular solve.
	 *	At the quad-tree root, y is first scaled by alpha; then each
	 *	recursive node is handled by solving the two diagonal children
	 *	recursively (with alpha=+1) and applying the off-diagonal child
	 *	as an SPMV update with coefficient -1, in the order dictated by
	 *	the triangle orientation (upper/lower) and the transposition.
	 *	Leaf nodes are delegated to rsb_do_spsv_terminal.
	 *
	 *	FIXME : document
	 * */
	rsb_err_t errval = RSB_ERR_NO_ERROR;
	rsb_aligned_t mone[RSB_CONST_ENOUGH_ALIGNED_FOR_ANY_TYPE];	/* constant -1, used for the off-diagonal SPMV updates */
	rsb_aligned_t pone[RSB_CONST_ENOUGH_ALIGNED_FOR_ANY_TYPE];	/* constant +1, used for the recursive diagonal solves */
	rsb_int_t nrhsi = 0;
	rsb__util_set_area_to_converted_integer(&mone[0],mtxAp->typecode,-1);
	rsb__util_set_area_to_converted_integer(&pone[0],mtxAp->typecode,+1);

	/* the alpha scaling of y is applied once only, at the root of the recursion */
	if(mtxAp->roff == mtxAp->coff)
	if(rsb__is_root_matrix(mtxAp))
#if RSB_ENABLE_INNER_NRHS_SPSV
	for (nrhsi=0;nrhsi<nrhs;++nrhsi)
#endif /* RSB_ENABLE_INNER_NRHS_SPSV */
		RSB_CBLAS_X_SCAL_SPSV(mtxAp->typecode,rsb__do_get_rows_of(mtxAp,transl),alphap,RSB_TYPED_OFF_PTR(mtxAp->typecode,y,nrhsi*(outnri)*incy),incy);

	if( rsb__is_recursive_matrix(mtxAp->flags))
#if 1
	{
		void*offy=NULL; const void *offx=NULL;
//		rsb_coo_idx_t scoff=submatrix->coff; rsb_coo_idx_t sroff=submatrix->roff;
		rsb_bool_t isupp = rsb__is_upper_triangle(mtxAp->flags);
		rsb_bool_t istrans = (RSB_DOES_NOT_TRANSPOSE(transl))?0:1;
		rsb_coo_idx_t half;	/* row/column offset of the (1,1) diagonal child, relative to this node */
//		offy=((rsb_byte_t*)y)+(mtxAp->el_size*sroff)*incy,offx=((const rsb_byte_t*)x)+(mtxAp->el_size*scoff)*incx;
		if(mtxAp->coff!=mtxAp->roff)
		{
			RSB_ERROR("!\n");
			errval = RSB_ERR_BADARGS;goto err;
		}
		/* a triangular recursive node must have both diagonal children */
		if(!(RSB_SUBMATRIX_INDEX(mtxAp,0,0)) || !(RSB_SUBMATRIX_INDEX(mtxAp,1,1)))
		{
			RSB_ERROR("@ %d %d and with null diagonal elements, %p %p %p %p\n",mtxAp->roff,mtxAp->coff,
					mtxAp->sm[0],
					mtxAp->sm[1],
					mtxAp->sm[2],
					mtxAp->sm[3]
					);
			errval = RSB_ERR_BADARGS;goto err;
		}
		half = RSB_SUBMATRIX_INDEX(mtxAp,1,1)->roff-mtxAp->roff;
		offy=((      rsb_byte_t*)y)+(mtxAp->el_size*half)*incy;
		offx=((const rsb_byte_t*)x)+(mtxAp->el_size*half)*incx;

		switch(isupp)
		{
		/* upper triangle: the off-diagonal child is (0,1) */
		case RSB_BOOL_TRUE:
		switch(istrans)
		{
		case RSB_BOOL_TRUE:
			/* transposed upper: solve (0,0) first, update, then (1,1) */
			RSB_DO_ERROR_CUMULATE(errval,rsb_do_spsv_recursive_serial(RSB_SUBMATRIX_INDEX(mtxAp,0,0),x,y,&pone[0],incx,incy,transl,op_flags RSB_INNER_NRHS_SPSV_ARGS_IDS));
			if(RSB_SUBMATRIX_INDEX(mtxAp,0,1) && op_flags != RSB_OP_FLAG_INFINITE_PARALLELISM_EMULATE)
				RSB_DO_ERROR_CUMULATE(errval,rsb__do_spmv_general(transl,&mone[0],RSB_SUBMATRIX_INDEX(mtxAp,0,1),offx,incx,NULL,y,incy,RSB_OP_FLAG_DIAGONAL_OVERRIDE_EXPLICIT_SERIAL RSB_INNER_NRHS_SPSV_ARGS_IDS));
			RSB_DO_ERROR_CUMULATE(errval,rsb_do_spsv_recursive_serial(RSB_SUBMATRIX_INDEX(mtxAp,1,1),offx,offy,&pone[0],incx,incy,transl,op_flags RSB_INNER_NRHS_SPSV_ARGS_IDS));
		break;
		default:
			/* non-transposed upper: backward order, (1,1) first */
			RSB_DO_ERROR_CUMULATE(errval,rsb_do_spsv_recursive_serial(RSB_SUBMATRIX_INDEX(mtxAp,1,1),offx,offy,&pone[0],incx,incy,transl,op_flags RSB_INNER_NRHS_SPSV_ARGS_IDS));
			if(RSB_SUBMATRIX_INDEX(mtxAp,0,1) && op_flags != RSB_OP_FLAG_INFINITE_PARALLELISM_EMULATE)
				RSB_DO_ERROR_CUMULATE(errval,rsb__do_spmv_general(transl,&mone[0],RSB_SUBMATRIX_INDEX(mtxAp,0,1),offx,incx,NULL,y,incy,RSB_OP_FLAG_DIAGONAL_OVERRIDE_EXPLICIT_SERIAL RSB_INNER_NRHS_SPSV_ARGS_IDS));
			RSB_DO_ERROR_CUMULATE(errval,rsb_do_spsv_recursive_serial(RSB_SUBMATRIX_INDEX(mtxAp,0,0),x,y,&pone[0],incx,incy,transl,op_flags RSB_INNER_NRHS_SPSV_ARGS_IDS));
		break;
		}
		break;
		/* lower triangle: the off-diagonal child is (1,0) */
		case RSB_BOOL_FALSE:
		switch(istrans)
		{
		case RSB_BOOL_TRUE:
			/* transposed lower: backward order, (1,1) first */
			RSB_DO_ERROR_CUMULATE(errval,rsb_do_spsv_recursive_serial(RSB_SUBMATRIX_INDEX(mtxAp,1,1),offx,offy,&pone[0],incx,incy,transl,op_flags RSB_INNER_NRHS_SPSV_ARGS_IDS));
			if(RSB_SUBMATRIX_INDEX(mtxAp,1,0) && op_flags != RSB_OP_FLAG_INFINITE_PARALLELISM_EMULATE)
				RSB_DO_ERROR_CUMULATE(errval,rsb__do_spmv_general(transl,&mone[0],RSB_SUBMATRIX_INDEX(mtxAp,1,0),x,incx,NULL,offy,incy,RSB_OP_FLAG_DIAGONAL_OVERRIDE_EXPLICIT_SERIAL RSB_INNER_NRHS_SPSV_ARGS_IDS));
			RSB_DO_ERROR_CUMULATE(errval,rsb_do_spsv_recursive_serial(RSB_SUBMATRIX_INDEX(mtxAp,0,0),x,y,&pone[0],incx,incy,transl,op_flags RSB_INNER_NRHS_SPSV_ARGS_IDS));
		break;
		default:
			/* non-transposed lower: forward order, (0,0) first */
			RSB_DO_ERROR_CUMULATE(errval,rsb_do_spsv_recursive_serial(RSB_SUBMATRIX_INDEX(mtxAp,0,0),x,y,&pone[0],incx,incy,transl,op_flags RSB_INNER_NRHS_SPSV_ARGS_IDS));
			if(RSB_SUBMATRIX_INDEX(mtxAp,1,0) && op_flags != RSB_OP_FLAG_INFINITE_PARALLELISM_EMULATE)
				RSB_DO_ERROR_CUMULATE(errval,rsb__do_spmv_general(transl,&mone[0],RSB_SUBMATRIX_INDEX(mtxAp,1,0),x,incx,NULL,offy,incy,RSB_OP_FLAG_DIAGONAL_OVERRIDE_EXPLICIT_SERIAL RSB_INNER_NRHS_SPSV_ARGS_IDS));
			RSB_DO_ERROR_CUMULATE(errval,rsb_do_spsv_recursive_serial(RSB_SUBMATRIX_INDEX(mtxAp,1,1),offx,offy,&pone[0],incx,incy,transl,op_flags RSB_INNER_NRHS_SPSV_ARGS_IDS));
		break;
		}
		break;
		default:
			errval = RSB_ERR_INTERNAL_ERROR;goto err;
		break;
		}
	}
#else
	/* NOTE(review): dead code — this branch is disabled by the #if 1 above and
	 * does not even compile (stray parentheses in the two conditions of the
	 * reverse-foreach loop below); kept only as historical reference. */
	{
		rsb_submatrix_idx_t i,j;
		struct rsb_mtx_t * submatrix=NULL;
		void*offy=NULL; const void *offx=NULL;

		if( RSB_DOES_NOT_TRANSPOSE( transl ) )
		{
		RSB_SUBMATRIX_FOREACH(mtxAp,submatrix,i,j)
		if(submatrix)
		{
//			RSB_STDOUT("%d %d \n",i,j);
			rsb_coo_idx_t scoff=submatrix->coff; rsb_coo_idx_t sroff=submatrix->roff;

			RSB_DEBUG_ASSERT(scoff>=0);
			RSB_DEBUG_ASSERT(sroff>=0);

			//RSB_ERROR("-> 0x%p %d %d (%d) (%d)\n",submatrix,submatrix->roff,submatrix->coff,submatrix->nnz, rsb__is_recursive_matrix(mtxAp->flags));

				offy=((rsb_byte_t*)y)+(mtxAp->el_size*sroff)*incy,offx=((const rsb_byte_t*)x)+(mtxAp->el_size*scoff)*incx;
			if(submatrix->roff==submatrix->coff && (i==(RSB_DOES_NOT_TRANSPOSE(transl))?0:1) )
			{
				RSB_DO_ERROR_CUMULATE(errval,rsb_do_spsv_recursive_serial(submatrix,x,y,alphap,incx,incy,transl,op_flags RSB_INNER_NRHS_SPSV_ARGS_IDS));
			}
			else
			if(submatrix->roff==submatrix->coff && (i==(RSB_DOES_NOT_TRANSPOSE(transl))?1:0) )
			{
				RSB_DO_ERROR_CUMULATE(errval,rsb_do_spsv_recursive_serial(submatrix,x,y,alphap,incx,incy,transl,op_flags RSB_INNER_NRHS_SPSV_ARGS_IDS));
			}
			else
			//if(i==((RSB_DOES_NOT_TRANSPOSE(transl))?1:0))
			if(i==1 && op_flags != RSB_OP_FLAG_INFINITE_PARALLELISM_EMULATE)
			{
			//	RSB_STDOUT("offx %g offy %g\n",*(double*)offx,*(double*)offy);
//				RSB_STDOUT("spmv %d %d\n",submatrix->roff,submatrix->coff);
				/* FIXME : DOES NOT TAKE INTO ACCOUNT INCX,INCY */
				/* transposition is not relevant, as long as we work with square matrices everywhere */
			//	if(y!=x)
			//		RSB_DO_ERROR_CUMULATE(errval,rsb__xcopy(offy,x,0,0,sroff,mtxAp->el_size));
				// FIXME : the following lines should be equivalent, but they aren't . why ?
				RSB_DO_ERROR_CUMULATE(errval,rsb_do_spmv_recursive_serial(submatrix,offx,offy,&mone[0],NULL,1,1,transl RSB_INNER_NRHS_SPMV_ARGS_IDS));
			//	RSB_DO_ERROR_CUMULATE(errval,rsb_do_spmv_recursive_parallel(submatrix,offx,offy,&mone[0],NULL,1,1,transl));
				//RSB_DO_ERROR_CUMULATE(errval,rsb_spmv_unua(submatrix,offx,offy,transl));
			//	RSB_STDOUT("offx %g offy %g\n",*(double*)offx,*(double*)offy);
			}
			if(RSB_SOME_ERROR(errval))
				goto err;
		}}
		else
		{
		RSB_SUBMATRIX_FOREACH_REVERSE(mtxAp,submatrix,i,j)
		{
		//	RSB_STDOUT("%d %d \n",i,j);
		if(submatrix)
		{
//			RSB_STDOUT("%d %d \n",i,j);
			rsb_coo_idx_t scoff=submatrix->coff; rsb_coo_idx_t sroff=submatrix->roff;

			RSB_DEBUG_ASSERT(scoff>=0);
			RSB_DEBUG_ASSERT(sroff>=0);

			offy=((rsb_byte_t*)y)+(mtxAp->el_size*sroff)*incy,offx=((const rsb_byte_t*)x)+(mtxAp->el_size*scoff)*incx;
			if(submatrix->roff==submatrix->coff && (i==(RSB_DOES_NOT_TRANSPOSE(transl)))?0:1) )
				RSB_DO_ERROR_CUMULATE(errval,rsb_do_spsv_recursive_serial(submatrix,x,y,alphap,incx,incy,transl,op_flags RSB_INNER_NRHS_SPSV_ARGS_IDS));
			else
			if(submatrix->roff==submatrix->coff && (i==RSB_DOES_NOT_TRANSPOSE(transl)))?1:0) )
				RSB_DO_ERROR_CUMULATE(errval,rsb_do_spsv_recursive_serial(submatrix,x,y,alphap,incx,incy,transl,op_flags RSB_INNER_NRHS_SPSV_ARGS_IDS));
			else
			if(i==1 && op_flags != RSB_OP_FLAG_INFINITE_PARALLELISM_EMULATE)
			{
				RSB_DO_ERROR_CUMULATE(errval,rsb_do_spmv_recursive_serial(submatrix,offx,offy,&mone[0],NULL,1,1,transl RSB_INNER_NRHS_SPMV_ARGS_IDS));
			}
			if(errval != RSB_ERR_NO_ERROR)goto err;
		}}
		}
	}
#endif
	else
	{
		/* non-recursive (leaf) node: delegate to the terminal solver */
		void*offy=NULL; const void *offx=NULL;
		rsb_coo_idx_t scoff=0;
		rsb_coo_idx_t sroff=0;
		RSB_DEBUG_ASSERT(scoff>=0);
		RSB_DEBUG_ASSERT(sroff>=0);
		offy=((rsb_byte_t*)y)+(mtxAp->el_size*sroff)*incy,offx=((const rsb_byte_t*)x)+(mtxAp->el_size*scoff)*incx;
		RSB_DO_ERROR_CUMULATE(errval,rsb_do_spsv_terminal(mtxAp,offx,offy,alphap,incx,incy,transl RSB_OUTER_NRHS_SPSV_ARGS_IDS));
	}
err:
	RSB_DO_ERR_RETURN(errval)
}
292 
rsb__do_get_submatrices_block_for_get_csr(const struct rsb_mtx_t * mtxAp,struct rsb_translated_matrix_t ** all_leaf_matricesp,rsb_submatrix_idx_t * all_leaf_matrices_np)293 rsb_err_t rsb__do_get_submatrices_block_for_get_csr(const struct rsb_mtx_t * mtxAp, struct rsb_translated_matrix_t ** all_leaf_matricesp, rsb_submatrix_idx_t * all_leaf_matrices_np)
294 {
295 	/**
296 	 * \ingroup gr_internals
297 	 * FIXME: rename : csr -> csc
298 	 * */
299 	rsb_err_t errval = RSB_ERR_NO_ERROR;
300 	rsb_submatrix_idx_t all_leaf_matrices_n=0;
301 	struct rsb_translated_matrix_t * all_leaf_matrices=NULL;
302 
303 	all_leaf_matrices_n=mtxAp->all_leaf_matrices_n;
304 	all_leaf_matrices = rsb__clone_area(mtxAp->all_leaf_matrices,sizeof(struct rsb_translated_matrix_t)*all_leaf_matrices_n);
305 	errval = rsb__sort_array_of_leaf_matrices(NULL,all_leaf_matrices,all_leaf_matrices_n,rsb_op_get_csr);
306 
307 	*all_leaf_matrices_np=all_leaf_matrices_n;
308 	*all_leaf_matricesp=all_leaf_matrices;
309 	RSB_DO_ERR_RETURN(errval)
310 }
311 
rsb__do_get_submatrices_for_ussv(const struct rsb_mtx_t * mtxAp,struct rsb_translated_matrix_t ** all_leaf_matricesp,rsb_submatrix_idx_t * all_leaf_matrices_np,rsb_trans_t transT)312 rsb_err_t rsb__do_get_submatrices_for_ussv( const struct rsb_mtx_t * mtxAp, struct rsb_translated_matrix_t ** all_leaf_matricesp, rsb_submatrix_idx_t * all_leaf_matrices_np, rsb_trans_t transT)
313 {
314 	/**
315 	  	\ingroup gr_internals
316 	*/
317 	rsb_err_t errval = RSB_ERR_NO_ERROR;
318 	rsb_submatrix_idx_t all_leaf_matrices_n=0;
319 	struct rsb_translated_matrix_t * all_leaf_matrices=NULL;
320 
321 	all_leaf_matrices_n=mtxAp->all_leaf_matrices_n;
322 	all_leaf_matrices = rsb__clone_area(mtxAp->all_leaf_matrices,sizeof(struct rsb_translated_matrix_t)*all_leaf_matrices_n);
323 	rsb__submatrices_exclude_nontriangular(all_leaf_matrices,&all_leaf_matrices_n,mtxAp);
324 	errval = rsb__sort_array_of_leaf_matrices_for_ussv(mtxAp,all_leaf_matrices,all_leaf_matrices_n,transT);
325 
326 	*all_leaf_matrices_np=all_leaf_matrices_n;
327 	*all_leaf_matricesp=all_leaf_matrices;
328 	RSB_DO_ERR_RETURN(errval)
329 }
330 
void rsb__submatrices_exclude_nontriangular(struct rsb_translated_matrix_t * all_leaf_matrices, rsb_submatrix_idx_t * all_leaf_matrices_np, const struct rsb_mtx_t * mtxAp)
{
	/**
	  	\ingroup gr_internals
		Compacts into all_leaf_matrices only those leaves of mtxAp
		lying in the triangle of interest: for an upper triangular
		matrix the leaves with roff<=coff, otherwise those with
		roff>=coff. Writes the count of kept leaves to
		*all_leaf_matrices_np.
	*/
	rsb_submatrix_idx_t si;
	rsb_submatrix_idx_t kept = 0;
	rsb_bool_t keep_upper;

	RSB_DEBUG_ASSERT(mtxAp);
	RSB_DEBUG_ASSERT(all_leaf_matrices);
	RSB_DEBUG_ASSERT(all_leaf_matrices_np);

	keep_upper = rsb__is_upper_triangle(mtxAp->flags);

	/* single pass: keep a leaf when it lies on the requested side of the diagonal */
	for(si=0;si<mtxAp->all_leaf_matrices_n;++si)
	{
		const rsb_coo_idx_t roff = mtxAp->all_leaf_matrices[si].roff;
		const rsb_coo_idx_t coff = mtxAp->all_leaf_matrices[si].coff;

		if( keep_upper ? (roff<=coff) : (roff>=coff) )
			all_leaf_matrices[kept++] = mtxAp->all_leaf_matrices[si];
	}
	*all_leaf_matrices_np = kept;
}
357 
static rsb_err_t rsb__do_spsv_uxua_recursive_parallel(const struct rsb_mtx_t * mtxAp, const void * x, void * y, const void * alphap, rsb_coo_idx_t incx, rsb_coo_idx_t incy, rsb_trans_t transl, enum rsb_op_flags_t op_flags RSB_INNER_NRHS_SPSV_ARGS)
{
	/**
	  	\ingroup gr_internals
	 * triangular solve for recursive
	 *
	 * Multithreaded (OpenMP) triangular solve over the flattened list of
	 * leaf submatrices, in place (requires x==y). A shared rows lock and
	 * a solved-rows frontier (lock.dr) serialize the diagonal solves,
	 * while off-diagonal SPMV updates behind the frontier proceed in
	 * parallel.
	 *
	 * each submatrix starting at (i,j) depends on :

	   i if i=j

	   1 2 3 4 5 6
	  +-+---------+
	  +-+-+       | 1
	  +-+-+-+     | 2
	  +-+-+-+-+   | 3
	  +-+-+-+-+-+ | 4
	  +-+-+-+-+-+-+ 5
	  +-+-+-+-+-+-+ 6

	  The active rows in the spmv are locked with an interval information.
	  Since the rows interval active in trsv is not under spmv, there is no need for another lock.

	  We need a ...

	  alphap NULL means alpha = 1

	 * */
	rsb_err_t errval = RSB_ERR_NO_ERROR;
#if RSB_WANT_OMP_RECURSIVE_KERNELS
#if	RSB_WANT_VERBOSE_SPSV
	rsb_time_t sv_time = RSB_CONST_IMPOSSIBLY_BIG_TIME,dp_time = RSB_CONST_IMPOSSIBLY_BIG_TIME;
#endif /* RSB_WANT_VERBOSE_SPSV */
	struct rsb_translated_matrix_t * all_leaf_matrices=NULL;	/** NEW, EXPERIMENTAL */
	struct rsb_rows_lock_struct_t lock;	/* shared rows-interval lock and progress state */
	rsb_submatrix_idx_t * deps=NULL;	/** NEW, EXPERIMENTAL */
	rsb_submatrix_idx_t all_leaf_matrices_n=0;
	rsb_bool_t backdeps,isupptri;	/* backdeps: dependencies run backwards (transposed XOR upper) */
	rsb_aligned_t alpha_inv[RSB_CONST_ENOUGH_ALIGNED_FOR_ANY_TYPE];
	rsb_aligned_t mone[RSB_CONST_ENOUGH_ALIGNED_FOR_ANY_TYPE];	/* constant -1, for the update SPMVs */
	rsb_aligned_t pone[RSB_CONST_ENOUGH_ALIGNED_FOR_ANY_TYPE];	/* constant +1, for the diagonal solves */
	rsb__util_set_area_to_converted_integer(&mone[0],mtxAp->typecode,-1);
	rsb__util_set_area_to_converted_integer(&pone[0],mtxAp->typecode,+1);

	RSB_BZERO_P(&lock);
	if( !mtxAp)
	{errval = RSB_ERR_GENERIC_ERROR;goto err;}
	if( !rsb__is_recursive_matrix(mtxAp->flags) || !mtxAp)
	{errval = RSB_ERR_GENERIC_ERROR;goto err;}
	if(x!=y)/* FIXME */
	{errval = RSB_ERR_GENERIC_ERROR;goto err;}

	isupptri = rsb__is_upper_triangle(mtxAp->flags);
	backdeps = RSB_BOOL_XOR(RSB_DOES_TRANSPOSE(transl),isupptri);
	rsb__util_set_area_to_negated_fraction(alpha_inv,alphap,mtxAp->typecode);

#if	RSB_WANT_VERBOSE_SPSV
	dp_time = rsb_time();
#endif /* RSB_WANT_VERBOSE_SPSV */

	/* obtain the triangle's leaves, sorted in solve order, plus a dependency index array */
	errval = rsb__do_get_submatrices_for_ussv(mtxAp,&all_leaf_matrices,&all_leaf_matrices_n,transl);
	deps = rsb__malloc(sizeof(rsb_submatrix_idx_t)*all_leaf_matrices_n);
	if(RSB_SOME_ERROR(errval) || !all_leaf_matrices || !deps)
	{errval = RSB_ERR_ENOMEM;goto err;}

#if 0
	{
		/* printout */
		rsb_submatrix_idx_t n;
			for(n=0;n<all_leaf_matrices_n;++n)
				RSB_STDOUT("got %d/%d:%d~%d,%d~%d\n",
					n,all_leaf_matrices_n,
					all_leaf_matrices[n].roff,all_leaf_matrices[n].mtxlp->nr+all_leaf_matrices[n].roff,
					all_leaf_matrices[n].coff,all_leaf_matrices[n].mtxlp->nc+all_leaf_matrices[n].coff);
	}
#endif
#if 0
	{
	rsb_submatrix_idx_t n;
	rsb_coo_idx_t s=0;
	for(n=0;n<all_leaf_matrices_n;++n)
		if(all_leaf_matrices[n].roff==all_leaf_matrices[n].coff)
		{
			s+=all_leaf_matrices[n].mtxlp->nr;
//			RSB_STDOUT("%d/%d [%d~%d,%d~%d] (on diag)\n",n,all_leaf_matrices_n,
//				all_leaf_matrices[n].roff, all_leaf_matrices[n].roff+all_leaf_matrices[n].mtxlp->nr-1,
//				all_leaf_matrices[n].coff, all_leaf_matrices[n].coff+all_leaf_matrices[n].mtxlp->nc-1);
		}
		else
			;
//			RSB_STDOUT("%d/%d [%d~%d,%d~%d] (not on diag)\n",n,all_leaf_matrices_n,
//				all_leaf_matrices[n].roff, all_leaf_matrices[n].roff+all_leaf_matrices[n].mtxlp->nr-1,
//				all_leaf_matrices[n].coff, all_leaf_matrices[n].coff+all_leaf_matrices[n].mtxlp->nc-1);
	if(mtxAp->nr != s)
	{	RSB_STDOUT("FATAL : sum of diagonal matrices rows %d != %d \n",s,mtxAp->nr);
		goto err;
}
	}
#endif
#if 0
	if(RSB_DOES_TRANSPOSE(transl))
	{
		rsb_submatrix_idx_t n; for(n=0;n<all_leaf_matrices_n;++n) {
			RSB_SWAP(rsb_coo_idx_t,all_leaf_matrices[n].roff,all_leaf_matrices[n].coff);
		}
	}
#endif

	if(0)
	if(RSB_DOES_TRANSPOSE(transl))
	{
		rsb_submatrix_idx_t n;
		for(n=0;n<all_leaf_matrices_n;++n)
		{
			all_leaf_matrices[n].roff=mtxAp->nc-(all_leaf_matrices[n].coff+all_leaf_matrices[n].mtxlp->nc*1);
			all_leaf_matrices[n].coff=mtxAp->nr-(all_leaf_matrices[n].roff+all_leaf_matrices[n].mtxlp->nr*1);
			all_leaf_matrices[n].nr=all_leaf_matrices[n].mtxlp->nc;
			all_leaf_matrices[n].nc=all_leaf_matrices[n].mtxlp->nr;
		}
	}

	if(errval != RSB_ERR_NO_ERROR)
		goto err;
#if 0
	{
		/* printout */
		rsb_submatrix_idx_t n;
			for(n=0;n<all_leaf_matrices_n;++n)
				RSB_STDOUT("got %d/%d:%d~%d,%d~%d\n",
					n,all_leaf_matrices_n,
//					all_leaf_matrices[n].mtxlp->roff,all_leaf_matrices[n].mtxlp->nr+all_leaf_matrices[n].mtxlp->roff,
//					all_leaf_matrices[n].mtxlp->coff,all_leaf_matrices[n].mtxlp->nc+all_leaf_matrices[n].mtxlp->coff);
					all_leaf_matrices[n].roff,all_leaf_matrices[n].nr+all_leaf_matrices[n].roff,
					all_leaf_matrices[n].coff,all_leaf_matrices[n].nc+all_leaf_matrices[n].coff);
	}
#endif
#if 1
	/* build deps[]: for each leaf, the index of the preceding diagonal leaf it waits on */
	{
		rsb_submatrix_idx_t n;
		for(n=1;n<all_leaf_matrices_n;++n)
		{
			rsb_submatrix_idx_t np=n-1;
			if(all_leaf_matrices[n].roff==all_leaf_matrices[n].coff)
			{
				/* diagonal leaf: depends on the previous diagonal leaf */
				while(np>0 && all_leaf_matrices[np].roff!=all_leaf_matrices[np].coff)
					--np;
				//for(;np<n;++np)
				//	RSB_STDOUT("%d vs %d ? %d\n",n,np,rsb__compar_rcsr_matrix_for_spsvl(all_leaf_matrices+np,all_leaf_matrices+n));
			}
			else
			{
				/* off-diagonal leaf: depends on the nearest preceding diagonal leaf */
				while(np>0
				 && !(
					(  all_leaf_matrices[np].roff==all_leaf_matrices[np].coff ) /*&&
					( (all_leaf_matrices[np].coff+ all_leaf_matrices[np].mtxlp->nc)>=
					  (all_leaf_matrices[n ].coff+ all_leaf_matrices[n ].mtxlp->nc) )*/
					))
					--np;
			}
			deps[n]=np;
//#define RSB__TRSV_OUT__ 1
			if(RSB__TRSV_OUT__)RSB_STDOUT("dep: %d ->  %d\n",n,np);
		}
	}
#endif
#if 0
	if(RSB_DOES_TRANSPOSE(transl))
	{
		rsb_submatrix_idx_t n; for(n=0;n<all_leaf_matrices_n;++n) {
			RSB_SWAP(rsb_coo_idx_t,all_leaf_matrices[n].roff,all_leaf_matrices[n].coff);
		}
#if 1
		for(n=0;n<all_leaf_matrices_n/2;++n) {	RSB_SWAP(struct rsb_translated_matrix_t,all_leaf_matrices[n],all_leaf_matrices[(all_leaf_matrices_n-1)-n]);}
	}
#endif
#endif
#if 0
	{
	rsb_submatrix_idx_t n;
	for(n=0;n<all_leaf_matrices_n;++n)
		if(all_leaf_matrices[n].roff<all_leaf_matrices[n].coff)
		{
			RSB_ERROR("all_leaf_matrices[n].roff<all_leaf_matrices[n].coff (%d<%d) in a lower triangular matrix!\n",
			all_leaf_matrices[n].roff,all_leaf_matrices[n].coff);
			{errval = RSB_ERR_GENERIC_ERROR;goto err;}
		}
	}
#endif
#if 0
	{
	rsb_submatrix_idx_t n,ad=0,d=0,pad=0,pda=0;
//	int cppad=0,ncppad=0;
	for(n=0;n<mtxAp->all_leaf_matrices_n;++n)
	{
		if(all_leaf_matrices[n].roff==all_leaf_matrices[n].coff)
		{
			rsb_submatrix_idx_t nn;
			rsb_submatrix_idx_t dr=all_leaf_matrices[n].roff+all_leaf_matrices[n].mtxlp->nr;
			++d;
			for(nn=n;nn<mtxAp->all_leaf_matrices_n;++nn)
			if(
				all_leaf_matrices[nn].roff >= dr &&
				all_leaf_matrices[nn].roff > all_leaf_matrices[nn].coff &&
				all_leaf_matrices[nn].coff + all_leaf_matrices[nn].mtxlp->nc <= dr &&
				1
			)
			{
				pad++;	/* parallelizable anti diagonal */
			}
			for(nn=n+1;nn<mtxAp->all_leaf_matrices_n &&
				all_leaf_matrices[nn].roff > all_leaf_matrices[nn].coff ;++nn)
				pda++;
		}
	}
		RSB_STDOUT(
			"    diagonal blocks : %d\n"
			"antidiagonal blocks : %d\n"
			"prediagonal blocks : %d\n"
	//		"antidiagonal blocks on the critical path : %d\n"
	//		"antidiagonal blocks not on the critical path : %d\n"
			,d,pad,pda//,cppad,ncppad
			);
	}
#endif

	/* initialize the shared rows lock for the worker threads */
	errval = rsb__do_lock_init(&lock,rsb_global_session_handle.rsb_want_threads,all_leaf_matrices_n,mtxAp,op_flags);
	if(errval != RSB_ERR_NO_ERROR)
		goto err;


{
#if RSB_ENABLE_INNER_NRHS_SPSV
	const size_t leny=(mtxAp->el_size*outnri);	/* byte distance between successive solution vectors */
	rsb_int_t nrhsi=0;
	for(nrhsi=0;nrhsi<nrhs;++nrhsi)
#endif /* RSB_ENABLE_INNER_NRHS_SPSV */
	{
#if RSB_ENABLE_INNER_NRHS_SPSV
		void      *out=((      rsb_byte_t*)y)+(leny*nrhsi);
#else /* RSB_ENABLE_INNER_NRHS_SPSV */
		void      *out=((      rsb_byte_t*)y);
#endif /* RSB_ENABLE_INNER_NRHS_SPSV */
		/* apply the alpha scaling once, before the parallel sweep */
		RSB_CBLAS_X_SCAL_SPSV(mtxAp->typecode,rsb__do_get_rows_of(mtxAp,transl),alphap,out,incy);

#if 1
	/* corner triangle solve: process the first (diagonal) leaf serially to seed the frontier */
	if(all_leaf_matrices_n)
	{
		rsb_submatrix_idx_t n=0;
		void * offy=((rsb_byte_t*)out)+(mtxAp->el_size*all_leaf_matrices[n].roff)*incy;
		RSB_DO_ERROR_CUMULATE(errval,rsb_do_spsv_terminal(all_leaf_matrices[n].mtxlp,offy,offy,&pone[0],incy,incy,transl RSB_DEFAULT_OUTER_NRHS_SPMV_ARGS));
		rsb__do_lock_get(&lock,0,all_leaf_matrices[n].mtxlp->nr+all_leaf_matrices[n].roff,all_leaf_matrices[n].mtxlp->nr,all_leaf_matrices[n].coff,all_leaf_matrices[n].mtxlp->nc,n,transl);
		rsb__do_lock_release(&lock,0);
		lock.dm=1; /* first matrix processed */
		if(!backdeps)
			lock.dr=all_leaf_matrices[n].mtxlp->nr+all_leaf_matrices[n].roff;
		else
			lock.dr=all_leaf_matrices[n].roff;
	}
#else
	if(all_leaf_matrices_n)
	{
		if(backdeps)
			lock.dr=all_leaf_matrices[0].mtxlp->nr+all_leaf_matrices[0].roff;
		else
			lock.dr=all_leaf_matrices[0].mtxlp->nr;
	}
#endif
	}
}

#if 	RSB_WANT_VERBOSE_SPSV
	sv_time = rsb_time();
	dp_time=-dp_time+sv_time;
#endif /* RSB_WANT_VERBOSE_SPSV */
	#pragma omp parallel reduction(|:errval) shared(lock,all_leaf_matrices,mtxAp)  RSB_NTC
	{
	const rsb_thr_t th_id = omp_get_thread_num();
	rsb_submatrix_idx_t n=0;
	rsb_submatrix_idx_t dm=0;	/* local snapshot of lock.dm (leaves done so far) */
	#pragma omp barrier

	if(th_id >= rsb_global_session_handle.rsb_want_threads)
		goto skip;

	#pragma omp critical (rsb_spsv_crs)
	{ dm=lock.dm; }

	/* each thread repeatedly scans all leaves, picking any whose dependencies are satisfied */
again:
	for(n=0;n<all_leaf_matrices_n;++n)
	//if(!RSB_BITMAP_GET(lock.bmap,1,lock.subms,0,n))
	{
		struct rsb_mtx_t *submatrix=all_leaf_matrices[n].mtxlp;
		const rsb_byte_t* trhs=((rsb_byte_t*)y)+mtxAp->el_size*all_leaf_matrices[n].coff*incx;
		rsb_byte_t* tout=((rsb_byte_t*)y)+mtxAp->el_size*all_leaf_matrices[n].roff*incy;
		enum rsb_op_t op = rsb_op_nop;
		rsb_coo_idx_t roff=all_leaf_matrices[n].roff;
		rsb_coo_idx_t coff=all_leaf_matrices[n].coff;

//#define RSB__TRSV_OUT__ 1
		#pragma omp critical (rsb_spsv_crs)
		{
		/* only thread 0 may claim the next diagonal solve, once the frontier lock.dr reaches it */
		if( th_id==0 )
		{
			if(
			 	(
				 	(( backdeps) && coff+all_leaf_matrices[n].mtxlp->nc==lock.dr ) ||
				 	((!backdeps) && roff==lock.dr )
				)
				&& roff==coff  &&
					(!RSB_BITMAP_GET(lock.bmap,1,lock.subms,0,n))
				)
				{
					/* all leaves between the dependency deps[n] and n must be done */
					rsb_submatrix_idx_t np=deps[n];
					while(RSB_BITMAP_GET(lock.bmap,1,lock.subms,0,np))
					{
						++np;
						if(RSB__TRSV_OUT__) RSB_STDOUT("%d -> %d\n",n,np);
					}
					if(np==n && (rsb__do_lock_get(&lock,th_id,roff,all_leaf_matrices[n].mtxlp->nr,coff,all_leaf_matrices[n].mtxlp->nc,n,transl)==RSB_BOOL_TRUE))
						op = rsb_op_spsvl;
				}
		}

		/* any thread may claim an off-diagonal SPMV update lying behind the solved frontier */
		if(op == rsb_op_nop)
		{
//			if(RSB__TRSV_OUT)RSB_STDOUT("%d@%d %d %d %d %d\n",n,th_id,op,all_leaf_matrices[n].roff,all_leaf_matrices[n].coff,omp_get_num_threads());
			if(
			 	(((!backdeps) &&
				(
				 (roff >=lock.dr && !isupptri &&
				roff != coff &&
				coff + all_leaf_matrices[n].mtxlp->nc <= lock.dr) ||
				 (coff >=lock.dr && isupptri &&
				roff != coff &&
				roff + all_leaf_matrices[n].mtxlp->nr <= lock.dr)
				)
			       	)
				 ||
			  	(( backdeps) &&
				(( isupptri &&(coff >= lock.dr)) ||
				((!isupptri)&&(roff >= lock.dr))) &&
			       	roff != coff //&&
				//coff + all_leaf_matrices[n].mtxlp->nc <=lock.dr
				))
			       	&&
#if RSB_WANT_BOUNDED_BOXES_SPSV
				(rsb__do_lock_get(&lock,th_id,all_leaf_matrices[n].mtxlp->broff,all_leaf_matrices[n].mtxlp->bm,all_leaf_matrices[n].mtxlp->bcoff,all_leaf_matrices[n].mtxlp->bk,n,transl)==RSB_BOOL_TRUE)&&
#else /* RSB_WANT_BOUNDED_BOXES_SPSV */
				(rsb__do_lock_get(&lock,th_id,all_leaf_matrices[n].roff,all_leaf_matrices[n].mtxlp->nr,all_leaf_matrices[n].coff,all_leaf_matrices[n].mtxlp->nc,n,transl)==RSB_BOOL_TRUE)&&
#endif /* RSB_WANT_BOUNDED_BOXES_SPSV */
				1
				)
				op = rsb_op_spmv ;
		}
		}
//			if(RSB__TRSV_OUT)RSB_STDOUT("%d@%d %d %d %d %d\n",n,th_id,op,all_leaf_matrices[n].roff,all_leaf_matrices[n].coff,omp_get_num_threads());
		if(RSB__TRSV_OUT__)dm=lock.dm;
		if(RSB__TRSV_OUT__)
		if(
				(!RSB_BITMAP_GET(lock.bmap,1,lock.subms,0,n) && op == rsb_op_nop)||
				( RSB_BITMAP_GET(lock.bmap,1,lock.subms,0,n) && op != rsb_op_nop)
				)
		RSB_STDOUT("%d/%d [%d~%d,%d~%d] on th.%d -> op %d (dr:%d) (done:%d)\n",n,all_leaf_matrices_n,
				all_leaf_matrices[n].roff, all_leaf_matrices[n].roff+submatrix->nr-1,
				all_leaf_matrices[n].coff, all_leaf_matrices[n].coff+submatrix->nc-1,
				th_id,op,lock.dr,dm);

		switch(op){
		case rsb_op_spsvl:
		{
			/* diagonal blocks */
			if(RSB__TRSV_OUT__)RSB_STDOUT("spsv on %d on %d \n",n,th_id);
			RSB_DO_ERROR_CUMULATE(errval,rsb_do_spsv_terminal(submatrix,trhs,tout,&pone[0],incx,incy,transl RSB_OUTER_NRHS_SPSV_ARGS_IDS));
			//RSB_DO_ERROR_CUMULATE(errval,rsb_do_spsv_terminal(submatrix,trhs,tout,alphap,incx,incy,transl RSB_OUTER_NRHS_SPSV_ARGS_IDS));
			#pragma omp critical (rsb_spsv_crs)
			{
				/* advance the solved-rows frontier past this diagonal block */
				if(!backdeps)
			       		lock.dr=submatrix->nr+roff;
				else
			       		lock.dr=submatrix->roff;
				rsb__do_lock_release(&lock,th_id); ++lock.dm; dm=lock.dm;
			}
		}
		break;
		case rsb_op_spmv:
		{
			/* antidiagonal blocks */
			if(RSB__TRSV_OUT__)RSB_STDOUT("spmv on %d on %d \n",n,th_id);
			//RSB_DO_ERROR_CUMULATE(errval,rsb_spmv_unua(submatrix,trhs,tout,transl));
			RSB_DO_ERROR_CUMULATE(errval,rsb_do_spmv_non_recursive(submatrix,trhs,tout,&mone[0],NULL,incx,incy,transl RSB_OUTER_NRHS_SPSV_ARGS_IDS));
			#pragma omp critical (rsb_spsv_crs)
			{rsb__do_lock_release(&lock,th_id);++lock.dm;dm=lock.dm;}
		}
		break;
		case rsb_op_nop:
		{
		}
		break;
		default: //rsb_op_spsvlt, rsb_op_spsvu, rsb_op_spsvut, rsb_op_get_csr
			RSB_DO_ERROR_CUMULATE(errval,RSB_ERR_INTERNAL_ERROR);
		}
		//if(errval != RSB_ERR_NO_ERROR)
		//	break;
		//if(op != rsb_op_nop)

	}

	#pragma omp critical (rsb_spsv_crs)
	{ dm=lock.dm; }
	if(RSB__TRSV_OUT__)RSB_STDOUT("on thread %d : done %d/%d \n",th_id,lock.dm,all_leaf_matrices_n);
	/* keep sweeping until all leaves are done (optionally letting excess threads leave early) */
	if(dm<all_leaf_matrices_n
#if RSB_WANT_EARLY_PARALLEL_REGION_JUMPOUT_SPSV
			&& ((all_leaf_matrices_n-dm)>th_id)
#endif /* RSB_WANT_EARLY_PARALLEL_REGION_JUMPOUT_SPSV */
	)goto again;
skip:
		RSB_NULL_STATEMENT_FOR_COMPILER_HAPPINESS;
		/* FIXME : could place a barrier here. */
		#pragma omp barrier
		/* now we can leave the parallel region safely */
	}
err:
	RSB_CONDITIONAL_FREE(all_leaf_matrices);
	RSB_CONDITIONAL_FREE(deps);
	RSB_DO_ERROR_CUMULATE(errval,rsb__do_lock_free(&lock));
#if	RSB_WANT_VERBOSE_SPSV
	sv_time=-sv_time+rsb_time();
	RSB_INFO("SPSV: solve time:%lg   deps time:%lg\n",sv_time,dp_time);
#endif /* RSB_WANT_VERBOSE_SPSV */
#else /* RSB_WANT_OMP_RECURSIVE_KERNELS */
	errval = RSB_ERR_UNSUPPORTED_OPERATION;
#endif /* RSB_WANT_OMP_RECURSIVE_KERNELS */
	RSB_DO_ERR_RETURN(errval)
}
792 
rsb__do_spsv_general(rsb_trans_t transl,const void * alphap,const struct rsb_mtx_t * mtxAp,const void * x,rsb_coo_idx_t incx,void * y,rsb_coo_idx_t incy,enum rsb_op_flags_t op_flags RSB_INNER_NRHS_SPSV_ARGS)793 rsb_err_t rsb__do_spsv_general(rsb_trans_t transl, const void * alphap, const struct rsb_mtx_t * mtxAp, const void * x, rsb_coo_idx_t incx, void * y, rsb_coo_idx_t incy, enum rsb_op_flags_t op_flags RSB_INNER_NRHS_SPSV_ARGS)
794 {
795 	/**
796 	  	\ingroup gr_internals
797 	 	computes \f$y \leftarrow \alpha op(A)^{-1} \cdot y \f$
798 
799 		Entry function for SPSV.
800 		alphap can be NULL.
801 
802 		FIXME : incx and incy check should be stricter
803 		FIXME : x!=y are disabled
804 	 * \return RSB_ERR_NO_ERROR on correct operation, an error code (see \ref errors_section) otherwise.
805 	*/
806 	rsb_err_t errval = RSB_ERR_NO_ERROR;
807 
808 	if(!mtxAp || !y || !x || transl == RSB_INVALID_FLAGS || !rsb__is_square(mtxAp) || incx<1 || incy<1)
809 	{
810 		errval = RSB_ERR_BADARGS;goto err;
811 	}
812 	if(!RSB_DO_FLAG_HAS(mtxAp->flags,RSB_FLAG_TRIANGULAR))
813 	{
814 		errval = RSB_ERR_BADARGS;goto err;
815 	}
816 	if(RSB__FLAG_HAS_UNSPECIFIED_TRIANGLE(mtxAp->flags))
817 	{
818 		errval = RSB_ERR_BADARGS;
819 		RSB_PERR_GOTO(err,RSB_ERRM_BOH_TRI);
820 	}
821 #if 1
822 	if(x!=y)
823 	{
824 			// FIXME: should parallelize this
825 		rsb_int_t nrhsi = 0;
826 #if RSB_ENABLE_INNER_NRHS_SPSV
827 		for (nrhsi=0;nrhsi<nrhs;++nrhsi)
828 #endif /* RSB_ENABLE_INNER_NRHS_SPSV */
829 		{
830 			//RSB_DO_ERROR_CUMULATE(errval,rsb__xcopy_strided_typed(y,x,0,0,mtxAp->nr,mtxAp->typecode,incy,incx));
831 			RSB_DO_ERROR_CUMULATE(errval,rsb__cblas_Xcopy(mtxAp->typecode,mtxAp->nr,RSB_TYPED_OFF_PTR(mtxAp->typecode,x,nrhsi*(rhsnri)*incx),incx,RSB_TYPED_OFF_PTR(mtxAp->typecode,y,nrhsi*(outnri)*incy),incy));
832 		}
833 	}
834 #endif
835 
836 	if(op_flags == RSB_OP_FLAG_INFINITE_PARALLELISM_EMULATE)
837 	{
838 		errval = rsb_do_spsv_recursive_serial(mtxAp,y,y,alphap,incy,incy,transl,op_flags RSB_INNER_NRHS_SPSV_ARGS_IDS);
839 		goto done;
840 	}
841 	if( !rsb__is_recursive_matrix(mtxAp->flags))
842 		errval = rsb_do_spsv_terminal(mtxAp,y,y,alphap,incy,incy,transl RSB_OUTER_NRHS_SPSV_ARGS_IDS);
843 	else
844 	{
845 #if RSB_WANT_OMP_RECURSIVE_KERNELS
846 		if(op_flags == RSB_OP_FLAG_WANT_SERIAL)
847 			errval = rsb_do_spsv_recursive_serial(mtxAp,y,y,alphap,incy,incy,transl,op_flags RSB_INNER_NRHS_SPSV_ARGS_IDS);
848 		else
849 			errval = rsb__do_spsv_uxua_recursive_parallel(mtxAp,y,y,alphap,incy,incy,transl,op_flags RSB_INNER_NRHS_SPSV_ARGS_IDS);
850 #else /* RSB_WANT_OMP_RECURSIVE_KERNELS */
851 		errval = rsb_do_spsv_recursive_serial(mtxAp,y,y,alphap,incy,incy,transl,op_flags RSB_INNER_NRHS_SPSV_ARGS_IDS);
852 #endif /* RSB_WANT_OMP_RECURSIVE_KERNELS */
853 	}
854 	goto done;
855 done:
856 #if 0
857                 {
858 			/* FIXME : won't work with when incx or incy is not 1 */
859 			rsb_aligned_t checksum[RSB_CONST_ENOUGH_ALIGNED_FOR_ANY_TYPE];
860 			RSB_CBLAS_X_SCAL_SPSV(mtxAp->typecode,1,NULL,checksum,1);
861 			rsb_nnz_idx_t n;
862 			rsb__util_vector_sum(checksum,y,mtxAp->typecode,mtxAp->nr);
863 			RSB_STDOUT("#spsv checksum:\n");
864 			rsb__debug_print_value(checksum,typecode);
865 			RSB_STDOUT("\n");
866                 }
867 #endif
868 err:
869 	RSB_DO_ERR_RETURN(errval)
870 }
871 
rsb__do_spsv(rsb_trans_t transT,const void * alphap,const struct rsb_mtx_t * mtxTp,const void * Xp,rsb_coo_idx_t incX,void * Yp,rsb_coo_idx_t incY)872 rsb_err_t rsb__do_spsv(rsb_trans_t transT, const void * alphap, const struct rsb_mtx_t * mtxTp, const void * Xp, rsb_coo_idx_t incX, void * Yp, rsb_coo_idx_t incY)
873 {
874 	return rsb__do_spsv_general(transT,alphap,mtxTp,Xp,incX,Yp,incY,RSB_OP_FLAG_DEFAULT RSB_DEFAULT_OUTER_NRHS_SPSV_ARGS	);
875 }
876 
877 /* @endcond */
878