1 /*
2 
3 Copyright (C) 2008-2021 Michele Martone
4 
5 This file is part of librsb.
6 
7 librsb is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published
9 by the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
11 
12 librsb is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15 License for more details.
16 
17 You should have received a copy of the GNU Lesser General Public
18 License along with librsb; see the file COPYING.
19 If not, see <http://www.gnu.org/licenses/>.
20 
21 */
22 /* @cond INNERDOC  */
23 /*!
24  * @file
25  * @brief  Recursive Sparse matrices assembling code.
26  * @author Michele Martone
27  * */
28 /*
29  * TODO: improve this code, because there is a number of unclean practices which could break a build.
30  * */
31 #include "rsb_common.h"
32 
33 #ifndef RSB_C2R_ASSERT
34 //#define RSB_C2R_ASSERT(e) assert(e)		// uncomment this to use   asserts
35 #define RSB_C2R_ASSERT(e)			// uncomment this to avoid asserts
36 #else /* RSB_C2R_ASSERT */
37 #undef RSB_C2R_ASSERT
38 #define RSB_C2R_ASSERT(e)
39 #endif /* RSB_C2R_ASSERT */
40 
41 #define RSB_DO_ENOUGHNNZFORINDEXBASEDBUILD(M) (!RSB_DO_TOOFEWNNZFORRCSR((M)->nnz,(M)->nr))
42 #define RSB_C2R_IF_VERBOSE 0	/* activates output which is useful for debugging */
43 #define RSB_C2R_PARANOIA 0
44 
45 #define RSB_MEMCPY_SMALL_GENERAL(ID,IS,DOFF,SOFF,NNZ,TYPE) \
46 	{ \
47 		TYPE*dp = ((TYPE*)(ID))+(DOFF),*ld = dp+(NNZ); \
48 		const register TYPE*sp = ((TYPE*)(IS))+(SOFF); \
49 		for(;dp<ld;++sp,++dp)*dp = *sp; \
50        	}
51 
52 #define RSB_C2R_WANT_MAYBE_FASTER 0
53 
54 #if RSB_C2R_WANT_MAYBE_FASTER
55 #define RSB_COA_MEMCPY_SMALL(ID,IS,DOFF,SOFF,NNZ) RSB_MEMCPY_SMALL_GENERAL(ID,IS,DOFF,SOFF,NNZ,rsb_coo_idx_t)
56 #define RSB_COA_MEMCPY_ROWSZ(ID,IS,DOFF,SOFF,NNZ) RSB_COA_MEMCPY_parallel(ID,IS,DOFF,SOFF,NNZ)
57 //#define RSB_A_MEMCPY_SMALL(ID,IS,DOFF,SOFF,NNZ,ES) RSB_MEMCPY_SMALL_GENERAL(ID,IS,DOFF,SOFF,NNZ,double)	/* FIXME */
58 #define RSB_A_MEMCPY_SMALL(ID,IS,DOFF,SOFF,NNZ,ES) RSB_A_MEMCPY(ID,IS,DOFF,SOFF,NNZ,ES)
59 #else /* RSB_C2R_WANT_MAYBE_FASTER */
60 #define RSB_COA_MEMCPY_SMALL(ID,IS,DOFF,SOFF,NNZ) RSB_COA_MEMCPY(ID,IS,DOFF,SOFF,NNZ)
61 #define RSB_A_MEMCPY_SMALL(ID,IS,DOFF,SOFF,NNZ,ES) RSB_A_MEMCPY(ID,IS,DOFF,SOFF,NNZ,ES)
62 #endif /* RSB_C2R_WANT_MAYBE_FASTER */
63 
64 #define RSB_TIC(T) (T) = -rsb_time()
65 #define RSB_TOC(T) (T) += rsb_time()
66 #define RSB_TOC_TIC(T,U) {rsb_time_t t = rsb_time();(T) += t;(U) = -t;}
67 #define RSB_TIC_TOC(U,T) {rsb_time_t t = rsb_time();(T) += t;(U) = -t;}
68 
69 #define RSB_WANT_BINSEARCH_MIN_NZPR 8	/* FIXME */
70 #define RSB_WANT_VERBOSE_TIMINGS 0
71 #define RSB_WANT_VERBOSE_SUBDIVISION 0	/* */
72 #define RSB_WANT_VERBOSE_SUBDIVISION2 0	/* */
73 #define RSB_WANT_MORE_PARALLELISM 1	/* FIXME: EXPERIMENTAL, BY DEFAULT TURNED OFF (0) */
74 #define RSB_WANT_FIRST_VERSION 0	/* FIXME: EXPERIMENTAL, BY DEFAULT TURNED ON (1) */
75 #define RSB_WANT_LITTLE_IMPROVED 1	/* FIXME: EXPERIMENTAL, BY DEFAULT TURNED OFF (0) */
76 #if RSB_WANT_OMP_RECURSIVE_KERNELS
77 #define RSB_WANT_PARALLEL_SUBDIVISION 1	/* FIXME: EXPERIMENTAL, BY DEFAULT TURNED OFF (0) */
78 #else /* RSB_WANT_OMP_RECURSIVE_KERNELS */
79 #define RSB_WANT_PARALLEL_SUBDIVISION 0	/* FIXME: EXPERIMENTAL, BY DEFAULT TURNED OFF (0) */
80 #endif /* RSB_WANT_OMP_RECURSIVE_KERNELS */
81 #define RSB_WANT_QUADRANT_QUICK_DETECT 0/* FIXME: EXPERIMENTAL, BY DEFAULT TURNED OFF (0) */
82 #define RSB_WANT_SUBDIVISION_FIXES_20101120 1 /* FIXME: EXPERIMENTAL, BY DEFAULT TURNED OFF (0) */
83 #define RSB_WANT_SUBDIVISION_FIXES_20101213 0 /* FIXME: EXPERIMENTAL ) */
84 #define RSB_WANT_FIX_BUG_DISCOVERED_20121210 1	/* this bug prevents from HCSR usage */
85 #if RSB_WANT_VERBOSE_SUBDIVISION2
86 //#define RSB_MTXASM_INFO RSB_INFO	/* NEW */
87 #define RSB_MTXASM_INFO printf	/* NEW */
88 #else /* RSB_MTXASM_INFO */
89 #define RSB_MTXASM_INFO 	/* NEW */
90 #endif /* RSB_MTXASM_INFO */
91 
92 #define RSB_SUBDIVISION_SKEW_MAX  (RSB_FLOAT_ONE/2.0)
93 #define RSB_MAX_QUADRANTS_UNBALANCE (4-1)
94 #define RSB_SUBDIVISION_BUG_EXTRA (4)		/* incorrect behaviour is encountered if setting this to 0, as it should (experienced on a 12 core machine). proper bugfix remains unknown to me. */
95 
96 #define RSB_WANT_FASTER_EXPERIMENTAL_CONSTRUCTOR 0 /* FIXME: this is experimental code and shall be finished ! */
97 RSB_INTERNALS_COMMON_HEAD_DECLS
98 
99 #if RSB_WANT_FASTER_EXPERIMENTAL_CONSTRUCTOR
100 #define RSB_POW2(P) (1<<(P))
101 //#define RSB_MUL2(P) ((P) *= 2)
102 #define RSB_MUL2(P) ((P)<<=1)
103 //#define RSB_HALF(V) (((V)+1)/2)
104 #define RSB_HALF(V) (((V)+1)>>1)
105 #define RSB_POW4(P) (RSB_POW2(P)*RSB_POW2(P))
rsb_coo_index_bit_interleave(rsb_coo_idx_t o,rsb_coo_idx_t e)106 static inline rsb_nnz_idx_t rsb_coo_index_bit_interleave(rsb_coo_idx_t o, rsb_coo_idx_t e)
107 {
108 	/* FIXME: this is DUPLICATE code !!! */
109 	rsb_nnz_idx_t i = 0, O = o, E = e;
110 	RSB_DEBUG_ASSERT(O>=0);
111 	RSB_DEBUG_ASSERT(E>=0);
112 	if (sizeof(rsb_nnz_idx_t)==1)
113 	{
114 		E = (E | (E << 2)) & 0x33;
115 		E = (E | (E << 1)) & 0x55;
116 		O = (O | (O << 2)) & 0x33;
117 		O = (O | (O << 1)) & 0x55;
118 	}
119 	else
120 	if (sizeof(rsb_nnz_idx_t)==2)
121 	{
122 		E = (E | (E << 4)) & 0x0F0F;
123 		E = (E | (E << 2)) & 0x3333;
124 		E = (E | (E << 1)) & 0x5555;
125 		O = (O | (O << 4)) & 0x0F0F;
126 		O = (O | (O << 2)) & 0x3333;
127 		O = (O | (O << 1)) & 0x5555;
128 	}
129 	else
130 	if (sizeof(rsb_nnz_idx_t)==4)
131 	{
132 		E = (E | (E << 8)) & 0x00FF00FF;
133 		E = (E | (E << 4)) & 0x0F0F0F0F;
134 		E = (E | (E << 2)) & 0x33333333;
135 		E = (E | (E << 1)) & 0x55555555;
136 		O = (O | (O << 8)) & 0x00FF00FF;
137 		O = (O | (O << 4)) & 0x0F0F0F0F;
138 		O = (O | (O << 2)) & 0x33333333;
139 		O = (O | (O << 1)) & 0x55555555;
140 	}
141 	else
142 	if (sizeof(rsb_nnz_idx_t)==8)
143 	{
144 		E = (E | (E <<16)) & 0x0000FFFF0000FFFF;
145 		E = (E | (E << 8)) & 0x00FF00FF00FF00FF;
146 		E = (E | (E << 4)) & 0x0F0F0F0F0F0F0F0F;
147 		E = (E | (E << 2)) & 0x3333333333333333;
148 		E = (E | (E << 1)) & 0x5555555555555555;
149 		O = (O | (O <<16)) & 0x0000FFFF0000FFFF;
150 		O = (O | (O << 8)) & 0x00FF00FF00FF00FF;
151 		O = (O | (O << 4)) & 0x0F0F0F0F0F0F0F0F;
152 		O = (O | (O << 2)) & 0x3333333333333333;
153 		O = (O | (O << 1)) & 0x5555555555555555;
154 	}
155 	else
156 	{
157 		RSB_ERROR(RSB_ERRM_FYRYNS);
158 		/* FIXME : fatal! */
159 	}
160 
161 	i = (E | (O << 1));
162 	RSB_DEBUG_ASSERT((i & ~-1)>=0);
163 	return i;
164 }
165 
rsb_assign_subm__(rsb_coo_idx_t * RSB_RESTRICT IA,rsb_coo_idx_t * RSB_RESTRICT JA,rsb_coo_idx_t m,rsb_coo_idx_t k,rsb_nnz_idx_t nnz)166 rsb_err_t rsb_assign_subm__(rsb_coo_idx_t * RSB_RESTRICT IA, rsb_coo_idx_t * RSB_RESTRICT JA, rsb_coo_idx_t m, rsb_coo_idx_t k, rsb_nnz_idx_t nnz)
167 {
168 	rsb_err_t errval = RSB_ERR_NO_ERROR;
169 	const rsb_coo_idx_t nlev = 4;
170 	/* FIXME: irm, jrm fit together in a single byte! (1byte/nnz!) */
171 	rsb_coo_idx_t nlevi,nlevj;
172 	rsb_coo_idx_t mink = RSB_POW2(nlev),maxk = mink;
173 	rsb_nnz_idx_t scnt[16*16], nzi;
174 	rsb_coo_idx_t idiv[   16];
175 	rsb_coo_idx_t jdiv[   16];
176 	rsb_nnz_idx_t nzoff = 0;
177 	//rsb_coo_idx_t * zIA, * zJA;
178 	rsb_byte_t * zIA = NULL, * zJA = NULL;
179 	rsb_coo_idx_t * oIA = NULL, * oJA = NULL;
180 
181 	RSB_BZERO_P(&scnt);
182 	oJA = rsb__malloc(nnz*sizeof(rsb_coo_idx_t));
183 	oIA = rsb__malloc(nnz*sizeof(rsb_coo_idx_t));
184 	zJA = rsb__malloc(nnz*sizeof(rsb_coo_idx_t));
185 	zIA = rsb__malloc(nnz*sizeof(rsb_coo_idx_t));
186 	if(!oJA || !oIA) goto err;
187 	if(!zJA || !zIA) goto err;
188 	for(nzi=0;RSB_LIKELY(nzi<nnz);++nzi)
189 	{
190 		const rsb_coo_idx_t i = IA[nzi],j = JA[nzi];
191 		rsb_coo_idx_t irm = 0,jrm = 0;
192 		rsb_coo_idx_t sm = m,sk = k;
193 		rsb_coo_idx_t om = 0,ok = 0;
194 		rsb_int sidx;
195 		for(nlevi=0;nlevi<nlev;++nlevi)
196 		{
197 			rsb_coo_idx_t hm = RSB_HALF(sm),hk = RSB_HALF(sk);
198 			RSB_MUL2(irm);RSB_MUL2(jrm);
199 			if(i>=hm+om){irm += 1;sm-=hm;om += hm;}else{sm = hm;}
200 			if(j>=hk+ok){jrm += 1;sk-=hk;ok += hk;}else{sk = hk;}
201 		}
202 		zIA[nzi] = irm;
203 		zJA[nzi] = jrm;
204 		//sidx = 16*irm+jrm;
205 		sidx = rsb_coo_index_bit_interleave(irm,jrm);
206 		//scnt[sidx]++;
207 		//printf("hm:%d sm:%d\n",hm,sm); printf("hk:%d sk:%d\n",hk,sk);
208 		//printf("%d %d -> %d %d (%d)    at %d %d  sized %d %d\n",i,j,irm,jrm,sidx,om,ok,sm,sk);
209 	}
210 	if(0)
211 	for(nlevi=0;nlevi<nlev;++nlevi)
212 	for(nlevj=0;nlevj<nlev;++nlevj)
213 	{
214 		printf("%d %d : %d\n",nlevi,nlevj,scnt[16*nlevi+nlevj]);
215 	}
216 	for(nlevi=1;nlevi<nlev*nlev;++nlevi)
217 		scnt[nlev*nlev-nlevi] = scnt[nlev*nlev-nlevi-1];
218 	scnt[0] = 0;
219 	for(nlevi=1;nlevi<nlev*nlev;++nlevi)
220 		scnt[nlevi] += scnt[nlevi-1]; /* FIXME: shall use rsb__do_prefix_sum_coo_idx_t */
221 	for(nzi=0;RSB_LIKELY(nzi<nnz);++nzi)
222 	{
223 		rsb_int sidx;
224 		rsb_coo_idx_t irm = 0,jrm = 0;
225 		irm = zIA[nzi];
226 	       	jrm = zJA[nzi];
227 		sidx = rsb_coo_index_bit_interleave(irm,jrm);
228 		//sidx = 0;
229 		oIA[ scnt[sidx]  ] = IA[nzi];
230 		oJA[ scnt[sidx]++] = JA[nzi];
231 	}
232 	rsb__memcpy(IA,oIA,sizeof(rsb_coo_idx_t)*nnz);
233 	rsb__memcpy(JA,oJA,sizeof(rsb_coo_idx_t)*nnz);
234 	//for(nzi=0;RSB_LIKELY(nzi<nnz);++nzi)
235 	//printf("please ignore this value: %d\n",scnt[0]);
236 err:
237 	RSB_CONDITIONAL_FREE(oJA);
238 	RSB_CONDITIONAL_FREE(oIA);
239 	RSB_CONDITIONAL_FREE(zJA);
240 	RSB_CONDITIONAL_FREE(zIA);
241 	return errval;
242 }
243 
rsb_allocate_new__(void * RSB_RESTRICT VA,rsb_coo_idx_t * RSB_RESTRICT IA,rsb_coo_idx_t * RSB_RESTRICT JA,rsb_coo_idx_t m,rsb_coo_idx_t k,rsb_nnz_idx_t nnz,rsb_type_t typecode,const struct rsb_mtx_partitioning_info_t * pinfop,rsb_flags_t flags,rsb_err_t * errvalp)244 static void rsb_allocate_new__(void *RSB_RESTRICT VA, rsb_coo_idx_t * RSB_RESTRICT IA, rsb_coo_idx_t * RSB_RESTRICT JA, rsb_coo_idx_t m, rsb_coo_idx_t k, rsb_nnz_idx_t nnz, rsb_type_t typecode, const struct rsb_mtx_partitioning_info_t * pinfop, rsb_flags_t flags, rsb_err_t *errvalp)
245 {
246 	rsb_time_t dt;
247 	long cs = rsb__get_first_level_c_size();
248 	//long cs = rsb__get_lastlevel_c_size();
249 	rsb_nnz_idx_t bnz = RSB_MIN(cs/(4*sizeof(rsb_coo_idx_t)),nnz),fnz;
250 	//rsb_nnz_idx_t bnz = nnz,fnz;
251 	if(!getenv("RSB_CB"))bnz = nnz;
252 	RSB_TIC(dt);
253 	for(fnz=0;fnz<nnz;fnz+=bnz)
254 	{
255 		rsb_assign_subm__(IA+fnz,JA+fnz,m,k,RSB_MIN(bnz,nnz-fnz));
256 	}
257 	RSB_TOC(dt);
258 	printf("%d cache blocks\n",(nnz+bnz-1)/bnz);
259 	printf("EXPERIMENTAL: processed indices at %lf Mnnz/s in %lf s\n",(RSB_FPINV(dt)*nnz)/RSB_MILLION_F,dt);
260 	exit(0);
261 }
262 #endif /* RSB_WANT_FASTER_EXPERIMENTAL_CONSTRUCTOR */
263 
rsb__do_set_in_place_submatrices_offsets(struct rsb_mtx_t * RSB_RESTRICT submatrices,rsb_submatrix_idx_t cmc,rsb_char_t * RSB_RESTRICT VA,rsb_coo_idx_t * RSB_RESTRICT IA,rsb_coo_idx_t * RSB_RESTRICT JA,size_t el_size)264 void rsb__do_set_in_place_submatrices_offsets(struct rsb_mtx_t *RSB_RESTRICT submatrices, rsb_submatrix_idx_t cmc, rsb_char_t *RSB_RESTRICT  VA, rsb_coo_idx_t *RSB_RESTRICT  IA, rsb_coo_idx_t *RSB_RESTRICT JA, size_t el_size)
265 {
266 	/**
267 		\ingroup gr_internals
268 		\note: if nnz==0 and diagonal implicit, this could be dangerous
269 	 */
270 	rsb_submatrix_idx_t smi;
271 	for(smi=0;smi<cmc;++smi)
272 	{
273 		struct rsb_mtx_t * submatrix = submatrices+smi;
274 		if(!RSB_DO_FLAG_HAS(submatrix->flags,RSB_FLAG_QUAD_PARTITIONING))
275 		{
276 			submatrix->bpntr = IA+submatrix->nzoff;
277 			submatrix->bindx = JA+submatrix->nzoff;
278 			submatrix->VA = ((rsb_char_t*)VA)+el_size*submatrix->nzoff;
279 		}
280 	}
281 }
282 
rsb__do_switch_recursive_matrix_to_fullword_storage(struct rsb_mtx_t * mtxAp)283 rsb_err_t rsb__do_switch_recursive_matrix_to_fullword_storage(struct rsb_mtx_t * mtxAp)
284 {
285 	/**
286 		\ingroup gr_internals
287 		TODO: move somewhere else
288 		FIXME: may be UNFINISHED
289 	 */
290 	rsb_err_t errval = RSB_ERR_NO_ERROR;
291 	if(!mtxAp)
292 	{
293 		RSB_ERROR(RSB_ERRM_E_MTXAP);
294 		return RSB_ERR_BADARGS;
295 	}
296 
297 	if(rsb__is_recursive_matrix(mtxAp->flags))
298 	{
299 		rsb_submatrix_idx_t i,j;
300 		struct rsb_mtx_t * submatrix;
301 		RSB_SUBMATRIX_FOREACH(mtxAp,submatrix,i,j)
302 			if(submatrix)
303 				RSB_DO_ERROR_CUMULATE(errval,rsb__do_switch_recursive_matrix_to_fullword_storage(submatrix));
304 	}
305 	else
306 	{
307 //		if(rsb__do_is_candidate_for_halfword_coo(mtxAp))
308 //			RSB_DO_ERROR_CUMULATE(errval,rsb__do_switch_to_fullword_coo(mtxAp));
309 //		else
310 //		if(rsb__do_is_candidate_for_halfword_csr(mtxAp))
311 //			RSB_DO_ERROR_CUMULATE(errval,rsb__do_switch_to_fullword_csr(mtxAp));
312 //		else
313 //		if(!rsb__is_root_matrix(mtxAp) || rsb__is_terminal_recursive_matrix(mtxAp)) /* root recursive or root nonrec. */
314 //			RSB_DO_FLAG_DEL(mtxAp->flags,RSB_FLAG_MUTUALLY_EXCLUSIVE_SWITCHES);
315 //		else
316 //		;/* for root matrices, we keep the flags, because some of the leaves MAY have it */
317 		if( mtxAp->matrix_storage == RSB_MATRIX_STORAGE_BCOR )
318 		{
319 		       	if(RSB_DO_FLAG_HAS(mtxAp->flags,RSB_FLAG_USE_HALFWORD_INDICES))
320 			{
321 				rsb__do_switch_array_to_fullword_coo((rsb_half_idx_t*)(mtxAp->bpntr),mtxAp->nnz,0);
322 				rsb__do_switch_array_to_fullword_coo((rsb_half_idx_t*)(mtxAp->bindx),mtxAp->nnz,0);
323 			       	RSB_DO_FLAG_DEL(mtxAp->flags,RSB_FLAG_USE_HALFWORD_INDICES);
324 			}
325 		}
326 		else
327 		if( mtxAp->matrix_storage == RSB_MATRIX_STORAGE_BCSR )
328 		{
329 		       	if(RSB_DO_FLAG_HAS(mtxAp->flags,RSB_FLAG_USE_HALFWORD_INDICES))
330 			{
331 				rsb__do_switch_array_to_fullword_coo((rsb_half_idx_t*)(mtxAp->bindx),mtxAp->nnz,0);
332 			       	RSB_DO_FLAG_DEL(mtxAp->flags,RSB_FLAG_USE_HALFWORD_INDICES);
333 			}
334 		}
335 		else
336 			errval = RSB_ERR_BADARGS;
337 	}
338 
339 	RSB_DO_ERR_RETURN(errval)
340 }
341 
rsb_do_switch_fresh_terminal_matrix_to_halfword_storages(struct rsb_mtx_t * mtxAp)342 static rsb_err_t rsb_do_switch_fresh_terminal_matrix_to_halfword_storages(struct rsb_mtx_t * mtxAp)
343 {
344 	/**
345 		\ingroup gr_unfinished
346 		TODO: move somewhere else
347 	 */
348 	rsb_err_t errval = RSB_ERR_NO_ERROR;
349 	if(!mtxAp)
350 	{
351 		RSB_ERROR(RSB_ERRM_E_MTXAP);
352 		return RSB_ERR_BADARGS;
353 	}
354 
355 	if(rsb__is_recursive_matrix(mtxAp->flags))
356 	{
357 		RSB_ERROR(RSB_ERRM_ES);
358 		return RSB_ERR_BADARGS;
359 	}
360 	else
361 	{
362 		if(RSB_C2R_IF_VERBOSE && 0)
363 			RSB_INFO_MATRIX_SUMMARY(mtxAp),
364 			RSB_INFO(" -switch.."),
365 			RSB_INFO("HCOO?(%d)..",rsb__do_is_candidate_for_halfword_coo(mtxAp)),
366 			RSB_INFO("HCSR?(%d)",rsb__do_is_candidate_for_halfword_csr(mtxAp)),
367 			RSB_INFO("\n");
368 		if(RSB_DO_FLAG_HAS(mtxAp->flags,RSB_FLAG_WANT_COO_STORAGE))
369 		{
370 			if(rsb__do_is_candidate_for_halfword_coo(mtxAp))
371 			{
372 				if(RSB_C2R_IF_VERBOSE)
373 					RSB_INFO("to halfword COO:"),RSB_INFO_MATRIX_SUMMARY(mtxAp),RSB_INFO("\n");
374 				RSB_DO_ERROR_CUMULATE(errval,rsb__do_switch_to_halfword_coo(mtxAp));
375 			}
376 			else
377 				RSB_DO_FLAG_DEL(mtxAp->flags,RSB_FLAG_USE_HALFWORD_INDICES);
378 		}
379 		else
380 //		if(rsb_do_is_candidate_for_fullword_coo(mtxAp))
381 //		{
382 //			RSB_DO_ERROR_CUMULATE(errval,rsb__do_switch_recursive_in_place_matrix_to_in_place_rcoo(mtxAp,RSB_BOOL_FALSE));
383 //			RSB_DO_ERROR_CUMULATE(errval,rsb__do_switch_to_fullword_coo(mtxAp));// FIXME: wrong naming :(
384 //		}
385 //		else
386 #if RSB_WANT_FIX_BUG_DISCOVERED_20121210
387 		if(!RSB_DO_FLAG_HAS(mtxAp->flags,RSB_FLAG_WANT_COO_STORAGE))
388 #else /* RSB_WANT_FIX_BUG_DISCOVERED_20121210 */
389 		if( RSB_DO_FLAG_HAS(mtxAp->flags,RSB_FLAG_WANT_COO_STORAGE))
390 #endif /* RSB_WANT_FIX_BUG_DISCOVERED_20121210 */
391 		{
392 			if(RSB_SOME_ERROR(rsb__do_is_candidate_for_halfword_csr(mtxAp)))
393 			{
394 				if(RSB_C2R_IF_VERBOSE)
395 					RSB_INFO("to halfword CSR:"),RSB_INFO_MATRIX_SUMMARY(mtxAp),RSB_INFO("\n");
396 				RSB_DO_ERROR_CUMULATE(errval,rsb__do_switch_to_halfword_csr(mtxAp));
397 			}
398 			else
399 				RSB_DO_FLAG_DEL(mtxAp->flags,RSB_FLAG_USE_HALFWORD_INDICES);
400 		}
401 		else
402 		//if(!rsb__is_root_matrix(mtxAp) || rsb__is_terminal_recursive_matrix(mtxAp)) /* root recursive or root nonrec. */
403 		//	RSB_DO_FLAG_DEL(mtxAp->flags,RSB_FLAG_MUTUALLY_EXCLUSIVE_SWITCHES);
404 		//else
405 			RSB_DO_FLAG_DEL(mtxAp->flags,RSB_FLAG_USE_HALFWORD_INDICES);
406 		;/* for root matrices, we keep the flags, because some of the leaves MAY have it */
407 	}
408 
409 	RSB_DO_ERR_RETURN(errval)
410 }
411 
412 #if !RSB_WANT_MORE_PARALLELISM
rsb_do_switch_fresh_recursive_matrix_to_halfword_storages(struct rsb_mtx_t * mtxAp)413 static rsb_err_t rsb_do_switch_fresh_recursive_matrix_to_halfword_storages(struct rsb_mtx_t * mtxAp)
414 {
415 	/**
416 		\ingroup gr_unfinished
417 TODO: move somewhere else
418 	 */
419 	rsb_err_t errval = RSB_ERR_NO_ERROR;
420 	if(!mtxAp)
421 	{
422 		RSB_ERROR(RSB_ERRM_E_MTXAP);
423 		return RSB_ERR_BADARGS;
424 	}
425 
426 	if(rsb__is_recursive_matrix(mtxAp->flags))
427 	{
428 		rsb_submatrix_idx_t i,j;
429 		const struct rsb_mtx_t * submatrix;
430 		RSB_SUBMATRIX_FOREACH(mtxAp,submatrix,i,j)
431 			if(submatrix)
432 				RSB_DO_ERROR_CUMULATE(errval,rsb_do_switch_fresh_recursive_matrix_to_halfword_storages(submatrix));
433 	}
434 	else
435 		errval = rsb_do_switch_fresh_terminal_matrix_to_halfword_storages(mtxAp);
436 
437 	RSB_DO_ERR_RETURN(errval)
438 }
439 #endif /* RSB_WANT_MORE_PARALLELISM */
440 
rsb__check_bounds(struct rsb_mtx_t * mtxAp)441 rsb_err_t rsb__check_bounds(struct rsb_mtx_t * mtxAp)
442 {
443 	/* FIXME: need checks on Mdim, mdim, ... */
444 	rsb_err_t errval = RSB_ERR_NO_ERROR;
445 
446 	if(
447 			mtxAp->broff<0 || mtxAp->bcoff<0 ||
448 			/* mtxAp->broff<mtxAp->roff+broff || mtxAp->bcoff<mtxAp->coff+bcoff ||  */
449 			mtxAp->bm>mtxAp->nr ||
450 			mtxAp->bk>mtxAp->nc
451 	  )
452 	{
453 		RSB_ERROR(RSB_PRINTF_MATRIX_BOUNDS_SUMMARY_ARGS(mtxAp)); RSB_ERROR("\n");
454 
455 		RSB_ASSERT(! ( mtxAp->broff<0) );
456 		RSB_ASSERT(! ( mtxAp->bcoff<0) );
457 	/*	RSB_ASSERT(! ( mtxAp->broff<mtxAp->roff+broff) );
458 		RSB_ASSERT(! ( mtxAp->bcoff<mtxAp->coff+bcoff) ); */
459 		RSB_ASSERT(! ( mtxAp->bm>mtxAp->nr) );
460 		RSB_ASSERT(! ( mtxAp->bk>mtxAp->nc) );
461 
462 		errval = RSB_ERR_INTERNAL_ERROR;
463 		RSB_ERROR(RSB_ERRM_BCE);
464 		RSB_ERROR(RSB_ERRM_BM),RSB_ERROR_MATRIX_SUMMARY(mtxAp),RSB_ERROR(RSB_ERRM_NL);
465 		return errval;
466 	}
467 	return errval;
468 }
469 
rsb__compute_bounded_box(struct rsb_mtx_t * mtxAp)470 rsb_err_t rsb__compute_bounded_box(struct rsb_mtx_t * mtxAp)
471 {
472 	/*
473 	 * Set have to be: nr, nc.
474 	 * Will compute: ...
475 	 *
476 	 * TODO: make sure it does not depend on RSB_FLAG_QUAD_PARTITIONING.
477 	 * */
478 	rsb_err_t errval = RSB_ERR_NO_ERROR;
479 	rsb_coo_idx_t broff = RSB_INVALID_COO_IDX_VAL,bcoff = RSB_INVALID_COO_IDX_VAL,bm = RSB_INVALID_COO_IDX_VAL,bk = RSB_INVALID_COO_IDX_VAL;
480 
481 	if(rsb__is_coo_matrix(mtxAp))
482 	{
483 		//rsb_nnz_idx_t nnz0 = 0,nnz1 = mtxAp->nnz;
484 		if(RSB_DO_FLAG_HAS(mtxAp->flags,RSB_FLAG_USE_HALFWORD_INDICES))
485 		{
486 			RSB_DECLARE_CONST_HALFCOO_ARRAYS_FROM_MATRIX(IA,JA,mtxAp)
487 			rsb_half_idx_t li, ui;
488 			rsb_half_idx_t lj, uj;
489 			// FIXME: could optimize, since IA is sorted
490 			rsb__util_find_extremal_half_index_val(IA,mtxAp->nnz,0,mtxAp->nr,&li,&ui);
491 			rsb__util_find_extremal_half_index_val(JA,mtxAp->nnz,0,mtxAp->nc,&lj,&uj);
492 			bk = 1;bk += uj; bm = 1;bm += ui; broff = li; bcoff = lj;
493 		}
494 		else
495 		{
496 			RSB_DECLARE_CONST_FULLCOO_ARRAYS_FROM_MATRIX(IA,JA,mtxAp)
497 			rsb_coo_idx_t li, ui;
498 			rsb_coo_idx_t lj, uj;
499 			// FIXME: could optimize, since IA is sorted
500 			rsb__util_find_extremal_full_index_val(IA,mtxAp->nnz,0,mtxAp->nr,&li,&ui);
501 			rsb__util_find_extremal_full_index_val(JA,mtxAp->nnz,0,mtxAp->nc,&lj,&uj);
502 			bk = 1;bk += uj; bm = 1;bm += ui; broff = li; bcoff = lj;
503 		}
504 	}
505 	else
506 	if(rsb__is_csr_matrix(mtxAp))
507 	{
508 		if(RSB_DO_FLAG_HAS(mtxAp->flags,(RSB_FLAG_USE_HALFWORD_INDICES)))
509 		{
510 			RSB_DECLARE_CONST_HALFCSR_ARRAYS_FROM_MATRIX(PA,JA,mtxAp)
511 			rsb_half_idx_t lj, uj;
512 			rsb_coo_idx_t li, ui;
513 			rsb__util_find_extremal_half_index_val(JA,mtxAp->nnz,0,mtxAp->nr,&lj,&uj);
514 			ui = rsb__nnz_split_nnz_bsearch(PA,mtxAp->nnz,mtxAp->nr+1);
515 			li = rsb__nnz_split_nnz_bsearch(PA,1,mtxAp->nr+1)-1;
516 			bk = 1;bk += uj; bm = ui; broff = li; bcoff = lj;
517 		}
518 		else
519 		{
520 			RSB_DECLARE_CONST_FULLCSR_ARRAYS_FROM_MATRIX(PA,JA,mtxAp)
521 			rsb_coo_idx_t lj, uj;
522 			rsb_coo_idx_t li, ui;
523 			rsb__util_find_extremal_full_index_val(JA,mtxAp->nnz,0,mtxAp->nr,&lj,&uj);
524 			ui = rsb__nnz_split_nnz_bsearch(PA,mtxAp->nnz,mtxAp->nr+1);
525 			li = rsb__nnz_split_nnz_bsearch(PA,1,mtxAp->nr+1)-1;
526 			bk = 1;bk += uj; bm = ui; broff = li; bcoff = lj;
527 		}
528 	}
529 	else
530 		RSB_ERROR(RSB_ERRMSG_BADFORMAT);
531 
532 	mtxAp->broff = mtxAp->roff+broff;
533 	mtxAp->bcoff = mtxAp->coff+bcoff;
534 	mtxAp->bm = bm;
535 	mtxAp->bk = bk;
536 
537 	errval = rsb__check_bounds(mtxAp);
538 #if 0
539 	RSB_INFO("bounding box of "),RSB_INFO_MATRIX_SUMMARY(mtxAp),
540 		RSB_INFO(": %.2f%% x  %.2f %%\n",(100.0f*(float)(bm-broff))/mtxAp->nr,(100.0f*(float)(bk-bcoff))/mtxAp->nc);
541 		RSB_INFO(": %d,%d %d,%d\n",mtxAp->roff+broff,mtxAp->coff+bcoff,bm,bk);
542 		RSB_INFO(": %d,%d %d,%d\n",mtxAp->roff+broff,mtxAp->coff+bcoff,bm,bk);
543 #endif
544 	return errval;
545 }
546 
rsb_do_compute_bounded_boxes(struct rsb_mtx_t * mtxAp)547 static rsb_err_t rsb_do_compute_bounded_boxes(struct rsb_mtx_t * mtxAp)
548 {
549 	/**
550 		\ingroup gr_unfinished
551 	 */
552 	rsb_err_t errval = RSB_ERR_NO_ERROR;
553 	rsb_submatrix_idx_t smi = 0;
554 	rsb_bool_t want_really = 0;
555 #if RSB_WANT_BOUNDED_BOXES
556 	want_really = (rsb_global_session_handle.want_bounded_box!=0);
557 #else /* RSB_WANT_BOUNDED_BOXES */
558 	mtxAp->broff = roff;
559 	mtxAp->bcoff = coff;
560 	mtxAp->bm = m;
561 	mtxAp->bk = k;
562 	goto err;
563 #endif /* RSB_WANT_BOUNDED_BOXES */
564 
565 	if(!mtxAp)
566 	{
567 		RSB_ERROR(RSB_ERRM_E_MTXAP);
568 		return RSB_ERR_BADARGS;
569 	}
570 
571 	if(mtxAp->nnz==0)
572 		goto err;
573 
574 	if(want_really)
575 	{
576 		if(rsb__is_terminal_recursive_matrix(mtxAp)) // fix for serial 20101206
577 		{
578 			RSB_DO_ERROR_CUMULATE(errval,rsb__compute_bounded_box(mtxAp));
579 			goto err;
580 		}
581 		#pragma omp parallel for schedule(static,1) reduction(|:errval)  shared(mtxAp) RSB_NTC
582 		for(smi=0;smi<mtxAp->all_leaf_matrices_n;++smi)
583 		{
584 			struct rsb_mtx_t * submatrix = mtxAp->all_leaf_matrices[smi].mtxlp;
585 			RSB_DO_ERROR_CUMULATE(errval,rsb__compute_bounded_box(submatrix));
586 		}
587 		#pragma omp barrier
588 	}
589 	else
590 	{
591 		#pragma omp parallel for schedule(static,1) reduction(|:errval)  shared(mtxAp) RSB_NTC
592 		for(smi=0;smi<mtxAp->all_leaf_matrices_n;++smi)
593 		{
594 			struct rsb_mtx_t * submatrix = mtxAp->all_leaf_matrices[smi].mtxlp;
595 			submatrix->bm = submatrix->nr;
596 			submatrix->bk = submatrix->nc;
597 			submatrix->broff = submatrix->roff;
598 			submatrix->bcoff = submatrix->coff;
599 		}
600 	}
601 err:
602 	RSB_DO_ERR_RETURN(errval)
603 }
604 
rsb_do_switch_fresh_recursive_matrix_to_halfword_storages_parallel(struct rsb_mtx_t * mtxAp)605 static rsb_err_t rsb_do_switch_fresh_recursive_matrix_to_halfword_storages_parallel(struct rsb_mtx_t * mtxAp)
606 {
607 	/**
608 		\ingroup gr_unfinished
609 TODO: move somewhere else
610 	 */
611 	rsb_err_t errval = RSB_ERR_NO_ERROR;
612 	rsb_submatrix_idx_t smi = 0;
613 	if(!mtxAp)
614 	{
615 		errval = RSB_ERR_BADARGS;
616 		RSB_PERR_GOTO(err,RSB_ERRM_E_MTXAP);
617 	}
618 
619 	/* FIXME: 20100809 it seems that 'switching' a 0-nnz matrix overwrites something which should not be overwritten  */
620 	if(mtxAp->nnz==0)
621 		goto err;
622 
623 	if(rsb__is_terminal_recursive_matrix(mtxAp)) // fix for serial 20101206
624 	{
625 		RSB_DO_ERROR_CUMULATE(errval,rsb_do_switch_fresh_terminal_matrix_to_halfword_storages(mtxAp));
626 		goto err;
627 	}
628 	#pragma omp parallel for schedule(static,1) reduction(|:errval)  shared(mtxAp) RSB_NTC
629 	for(smi=0;smi<mtxAp->all_leaf_matrices_n;++smi)
630 	{
631 		struct rsb_mtx_t * submatrix = mtxAp->all_leaf_matrices[smi].mtxlp;
632 		RSB_DO_ERROR_CUMULATE(errval,rsb_do_switch_fresh_terminal_matrix_to_halfword_storages(submatrix));
633 	}
634 	#pragma omp barrier
635 err:
636 	RSB_DO_ERR_RETURN(errval)
637 }
638 
639 #if 0
640 static rsb_err_t rsb_do_shuffle_left_and_right_rows_inner(rsb_coo_idx_t * RSB_RESTRICT IA, rsb_coo_idx_t m, rsb_coo_idx_t m0, rsb_nnz_idx_t nnz, rsb_nnz_idx_t nnz0, rsb_coo_idx_t * RSB_RESTRICT IL, rsb_coo_idx_t * IM, rsb_coo_idx_t * RSB_RESTRICT WA, size_t sz)
641 {
642 	/**
643 		\ingroup gr_unfinished
644 		FIXME: UNFINISHED, EXPERIMENTAL
645 	 */
646 		rsb_err_t errval = RSB_ERR_NO_ERROR;
647 		rsb_coo_idx_t iu = m0,id = m-1;
648 		rsb_coo_idx_t wl = nnz,wr = 0,ns = 0,nu = 0,ws = 0,nd = nnz;
649 
650 		if( sz<1 || !IA || !IM || !IL || !WA || RSB_INVALID_NNZ_INDEX(nnz) || RSB_INVALID_COO_INDEX(m) )
651 		{
652 			errval = RSB_ERR_BADARGS;
653 			RSB_PERR_GOTO(err,RSB_ERRM_E_MTXAP);
654 		}
655 		if(RSB_UNLIKELY(IL[m]!=nnz))
656 		{
657 			errval = RSB_ERR_INTERNAL_ERROR;
658 			RSB_PERR_GOTO(err,RSB_ERRM_ES);
659 		}
660 		if(iu>=id)
661 		{
662 			errval = RSB_ERR_INTERNAL_ERROR;
663 			RSB_PERR_GOTO(err,RSB_ERRM_ES);
664 		}
665 		nu = IL[iu];
666 
667 		while(RSB_LIKELY(iu<=id))
668 		{
669 			/* compute the left subrow length */
670 			ns = IM[iu]-IL[iu];
671 			/* shift left the left subrow */
672 			RSB_A_MEMMOVE(IA,IA,nu,IL[iu],ns,sz);
673 			/* update the counter of left subrows elements in IA */
674 			nu += ns;
675 			/* compute the right subrow length */
676 			ws = (IL[iu+1]-IM[iu]);
677 			/* buffer the right subrow */
678 			RSB_A_MEMCPY(WA,IA,wr,IM[iu],ws,sz);
679 			/* update the (complementary) counter of right subrows elements in the buffer */
680 			wr += ws;
681 
682 			if(RSB_UNLIKELY(iu>=id))
683 			{
684 				/* skip row, as id was already done */
685 				++id;
686 				goto done;
687 			}
688 			/* compute the right subrow length */
689 			ns = IL[id+1]-IM[id];
690 			/* update the (complementary) counter of right subrows elements in IA */
691 			nd -= ns;
692 			/* shift right the right subrow */
693 			RSB_A_MEMMOVE(IA,IA,nd,IM[id],ns,sz);
694 			/* compute the left subrow length */
695 			ws = IM[id]-IL[id];
696 			/* update the counter of right subrows elements in the buffer */
697 			wl -= ws;
698 			/* buffer the left subrow */
699 			RSB_A_MEMCPY(WA,IA,wl,IL[id],ws,sz);
700 
701 			++iu,--id;
702 		}
703 		/* IA has definitive elements, from left at  0..nu-1 and from right at (nnz-nd)..nnz-1  */
704 		{
705 			//rsb_nnz_idx_t
706 		}
707 		/* WA has definitive elements, from right at  0..wr-1 and from left at  wl..nnz-1  */
708 		/* it should be : nnz == nu + nnz-wl+nd-wr */
709 		if(nu+((nnz)-wl)!=nd-wr)
710 		{
711 			errval = RSB_ERR_INTERNAL_ERROR;
712 			RSB_PERR_GOTO(err,RSB_ERRM_ES);
713 		}
714 done:
715 		/* compute the number of left submatrix elements in the buffer */
716 		ns = (nnz)-wl;
717 		/* copy the partial left submatrix from the buffer to the array */
718 		RSB_A_MEMMOVE(IA,WA,nu,wl,ns,sz);
719 		/* update the counter of left subrows elements in IA */
720 		nu += ns;
721 		/* compute the number of right submatrix elements in the buffer */
722 		ns = wr;
723 		/* copy the partial right submatrix from the buffer to the array */
724 		RSB_A_MEMMOVE(IA,WA,nu,0,ns,sz);
725 		/* update the counter to all subrows elements in IA (those already present, too) */
726 		nd -= ns;
727 
728 		/* minimal coherence check */
729 err:
730 		if(RSB_UNLIKELY(nu!=nd))
731 		{
732 			RSB_ERROR("nnz=%d != nu+nd = %d; nu=%d, wl=%d, wr=%d, nd=%d\n",nnz,nu+nd,nu,wl,wr,nd);
733 //			RSB_ERROR("nnz=%d != nu+nnz-wl+nd = %d; nu=%d, wl=%d, wr=%d, nd=%d\n",nnz,nu+nnz+wl+nd,nu,wl,wr,nd);
734 			errval = RSB_ERR_INTERNAL_ERROR;
735 		}
736 		/* the buffer is empty now, and the arrays are left-right partitioned */
737 		RSB_DO_ERR_RETURN(errval)
738 }
739 #endif
740 
741 #if 0
742 static rsb_err_t rsb_do_shuffle_left_and_right_rows(void * RSB_RESTRICT VA, rsb_coo_idx_t * RSB_RESTRICT IA, rsb_coo_idx_t * RSB_RESTRICT JA, rsb_coo_idx_t m, rsb_coo_idx_t m0, rsb_nnz_idx_t nnz, rsb_nnz_idx_t nnz0, rsb_type_t typecode, rsb_coo_idx_t * RSB_RESTRICT IL, rsb_coo_idx_t * IM, rsb_coo_idx_t * RSB_RESTRICT WA)
743 {
744 	/**
745 		\ingroup gr_unfinished
746 		FIXME: UNFINISHED, EXPERIMENTAL
747 	 */
748 	rsb_err_t errval = RSB_ERR_NO_ERROR;
749 	const size_t sz = sizeof(rsb_coo_idx_t);
750 	if(!IL || !IM || !WA)
751 	{
752 		errval = RSB_ERR_BADARGS;
753 		RSB_PERR_GOTO(err,RSB_ERRM_E_MTXAP);
754 	}
755 	RSB_DO_ERROR_CUMULATE(errval,rsb_do_shuffle_left_and_right_rows_inner(IA,m,m0,nnz,nnz0,IL,IM,WA,sz));
756 	RSB_DO_ERROR_CUMULATE(errval,rsb_do_shuffle_left_and_right_rows_inner(JA,m,m0,nnz,nnz0,IL,IM,WA,sz));
757 	RSB_DO_ERROR_CUMULATE(errval,rsb_do_shuffle_left_and_right_rows_inner(VA,m,m0,nnz,nnz0,IL,IM,WA,RSB_SIZEOF(typecode)));
758 err:
759 	RSB_DO_ERR_RETURN(errval)
760 }
761 #endif
762 
rsb_do_compute_vertical_split_search_only(const rsb_coo_idx_t * RSB_RESTRICT IA,const rsb_coo_idx_t * RSB_RESTRICT JA,rsb_coo_idx_t roff,rsb_coo_idx_t coff,rsb_coo_idx_t m,rsb_coo_idx_t k,rsb_coo_idx_t hm,rsb_coo_idx_t hk,rsb_nnz_idx_t nnz,const rsb_coo_idx_t * IB,rsb_nnz_idx_t * ulp,rsb_nnz_idx_t * urp,rsb_nnz_idx_t * llp,rsb_nnz_idx_t * lrp)763 static rsb_err_t rsb_do_compute_vertical_split_search_only(
764 		const rsb_coo_idx_t * RSB_RESTRICT IA, const rsb_coo_idx_t * RSB_RESTRICT JA,
765 	       	rsb_coo_idx_t roff, rsb_coo_idx_t coff, rsb_coo_idx_t m, rsb_coo_idx_t k,
766 	       	rsb_coo_idx_t hm, rsb_coo_idx_t hk, rsb_nnz_idx_t nnz,
767 	       	const rsb_coo_idx_t * IB, rsb_nnz_idx_t *ulp, rsb_nnz_idx_t *urp, rsb_nnz_idx_t *llp, rsb_nnz_idx_t *lrp)
768 {
769 	/**
770 	\ingroup gr_unfinished
771 
772 
773 	 */
774 	rsb_nnz_idx_t ul = 0,ur = 0,ll = 0,lr = 0;
775 	rsb_err_t errval = RSB_ERR_NO_ERROR;
776 	rsb_nnz_idx_t dnnz = 0/*,wdnnz = 0,rnnz = 0,hrnnz = 0*/;
777 	register rsb_coo_idx_t i;
778 	//rsb_nnz_idx_t nnz0 = 0;
779 	//rsb_coo_idx_t xroff = 0;
780 
781 
782 	if(nnz>m || 1)
783 	//if(nnz>m)
784 	{
785 	for(i = roff;RSB_LIKELY(i<roff+m);++i)
786 	{
787 		// offset of line i in the global line pointers array
788 		rsb_nnz_idx_t nnz0 = IB[i];
789 		// nnz1..nnz0 are the boundaries of line i
790 		rsb_nnz_idx_t nnz1 = IB[i+1];
791 		rsb_nnz_idx_t nnz2 = 0;
792 		// check
793 		RSB_C2R_ASSERT(nnz0>=IB[i]);
794 		RSB_C2R_ASSERT(nnz1<=IB[i+1]);
795 		// skip line if empty
796 		if(nnz1-nnz0<1)continue;
797 		// find first element of line i also in the submatrix
798 		nnz0 += rsb__nnz_split_coo_bsearch(JA+nnz0,coff,nnz1-nnz0);
799 		// skip line if empty in the submatrix
800 		if(nnz1-nnz0<1)continue;
801 		// find the length of the subrow i in the submatrix
802 		nnz1 = nnz0+rsb__nnz_split_coo_bsearch(JA+nnz0,coff+k,nnz1-nnz0);
803 		//check
804 		RSB_C2R_ASSERT(JA[nnz0+0]>=coff);
805 		// skip line if empty in the submatrix
806 		if(nnz1-nnz0<1)continue;
807 		nnz2 = nnz0+rsb__nnz_split_coo_bsearch(JA+nnz0,coff+hk,nnz1-nnz0);
808 	       	RSB_C2R_ASSERT(nnz1<=IB[i+1]);
809 		RSB_C2R_ASSERT(JA[nnz0+0]>=coff);
810 		RSB_C2R_ASSERT(JA[nnz1-1]< coff+k);
811 		dnnz += nnz1-nnz0;
812 		if(i<roff+hm)
813 			ul += nnz2-nnz0,
814 			ur += nnz1-nnz2;
815 		else
816 			ll += nnz2-nnz0,
817 			lr += nnz1-nnz2;
818 	}
819 	}
820 	else
821 	{
822 		// FIXME: UNFINISHED
823 		rsb_nnz_idx_t nnz0,nnz1,n;
824 		//RSB_INFO("almost empty matrix !\n");
825 		for(n=0;n<nnz;++n)
826 		{
827 			rsb_nnz_idx_t nnz2 = 0;
828 			i = IA[n];
829 			nnz0 = IB[i];
830 			nnz1 = IB[i+1];
831 			// ...
832 #if 1
833 		// skip line if empty
834 		if(nnz1-nnz0<1)continue;
835 		// find first element of line i also in the submatrix
836 		nnz0 += rsb__nnz_split_coo_bsearch(JA+nnz0,coff,nnz1-nnz0);
837 		// skip line if empty in the submatrix
838 		if(nnz1-nnz0<1)continue;
839 		// find the length of the subrow i in the submatrix
840 		nnz1 = nnz0+rsb__nnz_split_coo_bsearch(JA+nnz0,coff+k,nnz1-nnz0);
841 		//check
842 		RSB_C2R_ASSERT(JA[nnz0+0]>=coff);
843 		// skip line if empty in the submatrix
844 		if(nnz1-nnz0<1)continue;
845 		nnz2 = nnz0+rsb__nnz_split_coo_bsearch(JA+nnz0,coff+hk,nnz1-nnz0);
846 	       	RSB_C2R_ASSERT(nnz1<=IB[i+1]);
847 		RSB_C2R_ASSERT(JA[nnz0+0]>=coff);
848 		RSB_C2R_ASSERT(JA[nnz1-1]< coff+k);
849 		dnnz += nnz1-nnz0;
850 		if(i<roff+hm)
851 			ul += nnz2-nnz0,
852 			ur += nnz1-nnz2;
853 		else
854 			ll += nnz2-nnz0,
855 			lr += nnz1-nnz2;
856 #else
857 			if(nnz1-nnz0<1)continue;
858 			if(i<roff+hm)
859 			{
860 				for(;n<nnz1;++n)
861 					if(JA[n]>=coff+hk)
862 						++ur;
863 					else
864 						++ul;
865 			}
866 			else
867 			{
868 				for(;n<nnz1;++n)
869 					if(JA[n]>=coff+hk)
870 						++lr;
871 					else
872 						++ll;
873 			}
874 #endif
875 		}
876 	}
877 //done:
878 	*llp = ll;
879 	*lrp = lr;
880 	*ulp = ul;
881 	*urp = ur;
882 //err:
883 	RSB_DO_ERR_RETURN(errval)
884 }
885 
886 #if !RSB_WANT_PARALLEL_SUBDIVISION
rsb_do_compute_vertical_split(const rsb_coo_idx_t * RSB_RESTRICT IA,const rsb_coo_idx_t * RSB_RESTRICT JA,rsb_coo_idx_t roff,rsb_coo_idx_t coff,rsb_coo_idx_t m,rsb_coo_idx_t k,rsb_coo_idx_t hm,rsb_coo_idx_t hk,rsb_nnz_idx_t nnz,rsb_coo_idx_t * IL,rsb_coo_idx_t * RSB_RESTRICT IM,rsb_coo_idx_t * IR,rsb_nnz_idx_t * ulp,rsb_nnz_idx_t * urp,rsb_nnz_idx_t * llp,rsb_nnz_idx_t * lrp)887 static rsb_err_t rsb_do_compute_vertical_split(const rsb_coo_idx_t * RSB_RESTRICT IA, const rsb_coo_idx_t * RSB_RESTRICT JA, rsb_coo_idx_t roff, rsb_coo_idx_t coff, rsb_coo_idx_t m, rsb_coo_idx_t k, rsb_coo_idx_t hm, rsb_coo_idx_t hk, rsb_nnz_idx_t nnz, rsb_coo_idx_t * IL, rsb_coo_idx_t * RSB_RESTRICT IM, rsb_coo_idx_t * IR, rsb_nnz_idx_t *ulp, rsb_nnz_idx_t *urp, rsb_nnz_idx_t *llp, rsb_nnz_idx_t *lrp)
888 {
889 	/**
890 	\ingroup gr_unfinished
891 	FIXME: UNFINISHED, EXPERIMENTAL
892 
893 	Computes two arrays: IM, IL.
894 	IM[i], contains the index of the first element >= hk on line i,
895 	IL[i], contains the index of the first element on line i.
896 	Notes:
897        		IM[i]==IL[i+1] if no element >=hk exists
898        		IM[i]==IL[i]   if no IL[i] >=hk
899        		IL[i]==IL[i+1]  if line i is empty
900        		IL[0]==0
901        		IL[m]==nnz
902 		IM is valid on the 0..nr-1 range.
903 		IL is valid on the 0..nr range.
904 
905 	TODO: blocking support
906 	 */
907 	rsb_nnz_idx_t ul = 0,ur = 0,ll = 0,lr = 0;
908 	rsb_err_t errval = RSB_ERR_NO_ERROR;
909 	rsb_nnz_idx_t dnnz = 0,wdnnz = 0,rnnz = 0,hrnnz = 0;
910 	register rsb_coo_idx_t i;
911 	rsb_nnz_idx_t nnz0 = 0;
912 
913 	hk += coff;
914 
915 	if(IR==NULL && IM==NULL && IL==NULL)
916 	{
917 		/* FIXME: write me, for cases where we want subdivision on barely-COO matrices */
918 	}
919 	else
920 	if(IR==NULL && IM==NULL)
921 	{
922 		// root matrix; should compute IL
923 		IL[0] = 0;
924 //		if(nnz>100*m)// TODO
925 		if(0)// TODO: determine which case is faster !
926 		{
927 			/* fill the row pointers array */
928 //			#pragma omp parallel for reduction(+:dnnz) RSB_NTC
929 //			for(i=0;i<m;++i)
930 			for(i=0;RSB_LIKELY(i<m);++i)
931 			{
932 				// delimit the current row
933 				rnnz = rsb__nnz_split_coo_bsearch(IA+dnnz,i+1,nnz-dnnz);
934 				/* i==m-1 || IA[dnnz+rnnz] > i */
935 				dnnz += rnnz;
936 				IL[i+1] = dnnz;
937 				RSB_C2R_ASSERT(rnnz>=0);
938 			}
939 		}
940 		else
941 		{
942 			/* for full matrices, this is faster */
943 			rsb_nnz_idx_t n = 0;
944 #if 1
945 			nnz0 = 0;
946 #if RSB_WANT_QUADRANT_QUICK_DETECT
947 			/* FIXME: UNFINISHED */
948 			if(IB[roff]==IB[roff+hm])
949 			{
950 				RSB_INFO("upper submatrix empty\n");
951 				// should do something sharp
952 			}
953 			else
954 			if(IB[roff+hm]==IB[roff+m])
955 			{
956 				RSB_INFO("lower submatrix empty\n");
957 				// should do something sharp
958 			}
959 #endif /* RSB_WANT_QUADRANT_QUICK_DETECT  */
960 			for(i=0;RSB_LIKELY(i<m);++i)
961 			{
962 				rnnz = 0;
963 				for(;RSB_LIKELY(n<nnz && IA[n]==i);++n)
964 					++rnnz;
965 				IL[i+1] = nnz0+rnnz;
966 				nnz0 += rnnz;
967 			}
968 #else
969 			for(i=0;RSB_LIKELY(i<m);++i)
970 				IL[i] = 0;
971 			for(n=0;RSB_LIKELY(n<nnz);++n)
972 				RSB_C2R_ASSERT(IA[n]>=0 && IA[n]<m);
973 			for(n=0;RSB_LIKELY(n<nnz);++n)
974 				IL[IA[n]+1]++;
975 			for(i=0;RSB_LIKELY(i<m);++i)
976 				IL[i+1] += IL[i];
977 #endif
978 		}
979 		RSB_C2R_ASSERT(IL[m]==nnz);
980 		goto err;
981 	}
982 	else
983 	if(IR==NULL)
984 	{
985 		RSB_C2R_ASSERT(0);
986 		RSB_ASSERT(ulp);
987 		RSB_ASSERT(llp);
988 		RSB_ASSERT(urp);
989 		RSB_ASSERT(lrp);
990 		// root matrix; should compute IL
991 		IL[0] = 0;
992 		/* fill the row pointers array */
993 		for(i=0;RSB_LIKELY(i<m);++i)
994 		{
995 			// delimit the current row
996 			rnnz = rsb__nnz_split_coo_bsearch(IA+dnnz,i+1,nnz-dnnz);
997 			/* i==m-1 || IA[dnnz+rnnz] > i */
998 
999 			IL[i+1] = dnnz+rnnz;
1000 
1001 			if(RSB_LIKELY(dnnz+rnnz<=nnz))
1002 			{
1003 				// the current row is non empty
1004 				hrnnz = rsb__nnz_split_coo_bsearch(JA+dnnz,hk,rnnz);
1005 				if(RSB_LIKELY(hrnnz<rnnz))
1006 					IM[i] = dnnz+hrnnz;
1007 				else
1008 					// all the elements are in the left submatrix
1009 					IM[i] = dnnz+ rnnz;
1010 			}
1011 			else
1012 				// last row
1013 				hrnnz = rnnz,
1014 				IM[i] = nnz;
1015 
1016 				if(RSB_UNLIKELY(IM[i]<IL[i]))
1017 				{
1018 					errval = RSB_ERR_INTERNAL_ERROR;
1019 					RSB_PERR_GOTO(err,RSB_ERRM_ES);
1020 				}
1021 
1022 			// TODO: split in two cycles: 0..hm-1, hm..nr-1
1023 			if(i<hm)
1024 				ul += IM[i  ]-IL[i],
1025 				ur += IL[i+1]-IM[i];
1026 			else
1027 				ll += IM[i  ]-IL[i],
1028 				lr += IL[i+1]-IM[i];
1029 			//RSB_INFO("%d @ %d~%d (%d/%d)\n",i,dnnz,dnnz+rnnz-1,hrnnz,rnnz);
1030 			dnnz += rnnz;
1031 		}
1032 		IM[m] = IL[m];
1033 	}
1034 	else
1035 	{
1036 		// compute middle pointers array, using the left and right ones
1037 		RSB_ASSERT(ulp);
1038 		RSB_ASSERT(llp);
1039 		RSB_ASSERT(urp);
1040 		RSB_ASSERT(lrp);
1041 		nnz0 = IL[0];
1042 		/* fill the middle row pointers array */
1043 		for(i=0;RSB_LIKELY(i<m);++i)
1044 		{
1045 			// delimit the current row
1046 			rsb_nnz_idx_t il = IL[i],ir = IR[i],im;
1047 			rnnz = ir-il;
1048 
1049 			RSB_C2R_ASSERT(ir>=il);
1050 			if(ir<il)
1051 			{
1052 				errval = RSB_ERR_INTERNAL_ERROR;
1053 				RSB_PERR_GOTO(err,RSB_ERRM_ES);
1054 			}
1055 			if(ir==il)
1056 			{
1057 				// empty row
1058 				IM[i] = IR[i];
1059 				continue;
1060 			}
1061 			/* i==m-1 || IA[dnnz+rnnz] > i */
1062 
1063 			// the current row is non empty
1064 			RSB_C2R_ASSERT(JA[il+0]>=coff  );
1065 			RSB_C2R_ASSERT(JA[ir-1]< coff+k);
1066 
1067 			hrnnz = rsb__nnz_split_coo_bsearch(JA+il,hk,rnnz);
1068 			im = il+hrnnz;
1069 
1070 			IM[i] = im;
1071 
1072 #if RSB_C2R_PARANOIA
1073 			if(IM[i]>IR[i])
1074 			{
1075 				errval = RSB_ERR_INTERNAL_ERROR;
1076 				RSB_PERR_GOTO(err,"i=%d, %d > %d!\n",i,IM[i],IR[i]);
1077 			}
1078 
1079 			if(IM[i]<IL[i])
1080 			{
1081 				errval = RSB_ERR_INTERNAL_ERROR;
1082 				RSB_PERR_GOTO(err,"i=%d, %d < %d!\n",i,IM[i],IL[i]);
1083 			}
1084 
1085 #endif /* RSB_C2R_PARANOIA */
1086 			// TODO: split in two cycles: 0..hm-1, hm..nr-1
1087 			if(i<hm)
1088 				ul += im-il,
1089 				ur += ir-im;
1090 			else
1091 				ll += im-il,
1092 				lr += ir-im;
1093 			//RSB_INFO("%d @ %d~%d (%d/%d)\n",i,dnnz,dnnz+rnnz-1,hrnnz,rnnz);
1094 			dnnz += rnnz;
1095 		}
1096 		IM[m] = IL[m];
1097 	}
1098 
1099 		if(RSB_C2R_PARANOIA)
1100 		{
1101 			rsb_coo_idx_t i;
1102 			rsb_nnz_idx_t lnz = 0,rnz = 0,tnz = 0;
1103 			if(IR==NULL)
1104 				IR = IL+1;
1105 
1106 /*			if(IL[m]!=nnz0+nnz)
1107 			{
1108 				RSB_DO_ERROR_CUMULATE(errval,RSB_ERR_INTERNAL_ERROR);
1109 				RSB_PERR_GOTO(err,RSB_ERRM_ES);
1110 			}*/
1111 
1112 			for(i=0;RSB_LIKELY(i<m);++i)
1113 			{
1114 				lnz += IM[i]-IL[i];
1115 				rnz += IR[i]-IM[i];
1116 				tnz += IR[i]-IL[i];
1117 
1118 				if(RSB_UNLIKELY(IM[i]<IL[i] || IL[i]>IL[i+1]))
1119 				{
1120 					RSB_DO_ERROR_CUMULATE(errval,RSB_ERR_INTERNAL_ERROR);
1121 					RSB_PERR_GOTO(err,RSB_ERRM_ES);
1122 				}
1123 			}
1124 			if(ul+ll!=lnz || ur+lr != rnz)
1125 			{
1126 				RSB_DO_ERROR_CUMULATE(errval,RSB_ERR_INTERNAL_ERROR);
1127 				RSB_PERR_GOTO(err,RSB_ERRM_ES);
1128 			}
1129 			if(tnz!=nnz || (rnz+lnz)!=nnz)
1130 			{
1131 				RSB_DO_ERROR_CUMULATE(errval,RSB_ERR_INTERNAL_ERROR);
1132 				RSB_PERR_GOTO(err,"tnz:%d, nnz:%d, rnz:%d, lnz:%d\n",tnz,nnz,rnz,lnz);
1133 			}
1134 		}
1135 		*llp = ll;
1136 		*lrp = lr;
1137 		*ulp = ul;
1138 		*urp = ur;
1139 err:
1140 		RSB_DO_ERR_RETURN(errval)
1141 }
1142 #endif /* RSB_WANT_PARALLEL_SUBDIVISION */
1143 
rsb_do_compute_vertical_split_parallel(const rsb_coo_idx_t * RSB_RESTRICT IA,const rsb_coo_idx_t * RSB_RESTRICT JA,rsb_coo_idx_t roff,rsb_coo_idx_t coff,rsb_coo_idx_t m,rsb_coo_idx_t k,rsb_coo_idx_t hm,rsb_coo_idx_t hk,rsb_nnz_idx_t nnz,rsb_coo_idx_t * IL,rsb_coo_idx_t * RSB_RESTRICT IM,rsb_coo_idx_t * IR,rsb_nnz_idx_t * ulp,rsb_nnz_idx_t * urp,rsb_nnz_idx_t * llp,rsb_nnz_idx_t * lrp)1144 static rsb_err_t rsb_do_compute_vertical_split_parallel(const rsb_coo_idx_t * RSB_RESTRICT IA, const rsb_coo_idx_t * RSB_RESTRICT JA, rsb_coo_idx_t roff, rsb_coo_idx_t coff, rsb_coo_idx_t m, rsb_coo_idx_t k, rsb_coo_idx_t hm, rsb_coo_idx_t hk, rsb_nnz_idx_t nnz, rsb_coo_idx_t * IL, rsb_coo_idx_t * RSB_RESTRICT IM, rsb_coo_idx_t * IR, rsb_nnz_idx_t *ulp, rsb_nnz_idx_t *urp, rsb_nnz_idx_t *llp, rsb_nnz_idx_t *lrp)
1145 {
1146 
1147 	/**
1148 	Binary search for the boundaries of each row.
1149 	Assign threads to rows intervals.
1150 	Perform the row pointers vector fill calling rsb_do_compute_vertical_split.
1151 	*/
1152 #if 0
1153 	return rsb_do_compute_vertical_split(IA,JA,roff,coff,m,k,hm,hk,nnz,IL,IM,IR,ulp,urp,llp,lrp);
1154 #else
1155 	rsb_err_t errval = RSB_ERR_NO_ERROR;
1156 	/* const rsb_thread_t wet = rsb_get_num_threads(); */
1157 
1158 	if(m<1)
1159 		return RSB_ERR_NO_ERROR;/* TODO: limit case */
1160 	IL[0] = 0;
1161 	if(m==1)
1162 		goto after;
1163 
1164 	#pragma omp parallel RSB_NTC
1165 	{
1166 #if RSB_WANT_OMP_RECURSIVE_KERNELS
1167 		const rsb_thread_t tn = /*wet*/ omp_get_num_threads(), tnn = RSB_MIN(tn,m);
1168 		const rsb_thread_t th_id = omp_get_thread_num();
1169 #else /* RSB_WANT_OMP_RECURSIVE_KERNELS */
1170 		const rsb_thread_t tn = 1, tnn = RSB_MIN(tn,m);
1171 		const rsb_thread_t th_id = 0;
1172 #endif /* RSB_WANT_OMP_RECURSIVE_KERNELS */
1173 		const rsb_coo_idx_t mm = ((m+tnn-1)/tnn),m0 = mm*th_id,m1 = RSB_MIN(m0+mm,m);
1174 		rsb_coo_idx_t i;
1175 		rsb_nnz_idx_t nnz0 = 0,nnz1 = nnz;
1176 		rsb_nnz_idx_t n,rnnz,fnnz,lnnz;
1177 		if(th_id>=m)
1178 			goto nowork;
1179 		/* binary search for the boundaries of each row  */
1180 		nnz0 = rsb__nnz_split_coo_bsearch(IA+nnz0,m0,nnz1-nnz0);
1181 		nnz1 = nnz0+rsb__nnz_split_coo_bsearch(IA+nnz0,m1,nnz1-nnz0);
1182 		/* assign threads to rows intervals */
1183 		if(nnz0>=nnz1)
1184 		{
1185 			//for(i=m0;RSB_LIKELY(i<m1);++i)
1186 			//	IL[i+1] = nnz0;
1187 			RSB_XCOO_VSET(IL,nnz0,m0+1,m1+1);
1188 			goto nowork;
1189 		}
1190 		//RSB_INFO("thread %d  rows %d..%d  nnz %d..%d\n",th_id,m0,m1,nnz0,nnz1);
1191 		/* perform the row pointers vector fill calling rsb_do_compute_vertical_split */
1192 		//RSB_DO_ERROR_CUMULATE(errval,rsb_do_compute_vertical_split(IA+nnz0,JA+nnz0,roff+m0,coff,m1-m0,k,hm,hk,nnz1-nnz0,IL+m0,NULL,NULL,ulp,urp,llp,lrp));
1193 		fnnz = nnz0;
1194 		n = nnz0;
1195 		for(i=m0;RSB_LIKELY(i<m1);++i)
1196 		{
1197 			if((nnz1-nnz0)/(m1-m0)<RSB_WANT_BINSEARCH_MIN_NZPR)
1198 			{
1199 				rnnz = 0;
1200 				for(;RSB_LIKELY(n<nnz1 && IA[n]==i);++n)
1201 					++rnnz;
1202 				IL[i+1] = nnz0+rnnz;
1203 				nnz0 += rnnz;
1204 			}
1205 			else
1206 			{
1207 				/* TODO : should use a smarter strategy than this one */
1208 				lnnz = fnnz+rsb__nnz_split_coo_bsearch(IA+fnnz,i+1,nnz1-fnnz);
1209 				//RSB_INFO("%d : %d\n",i,lnnz);
1210 				IL[i+1] = lnnz;
1211 				fnnz = lnnz;
1212 			}
1213 		}
1214 nowork:
1215 	RSB_NULL_STATEMENT_FOR_COMPILER_HAPPINESS
1216 	#pragma omp barrier
1217 	RSB_NULL_STATEMENT_FOR_COMPILER_HAPPINESS
1218 	}
1219 after:
1220 	IL[m] = nnz;
1221 	//int i; RSB_INFO(":::"); for(i=0;RSB_LIKELY(i<m+1);++i) RSB_INFO("%d ",IL[i]); RSB_INFO("\n");
1222 	//RSB_INFO(":::"); for(i=0;RSB_LIKELY(i<nnz);++i) RSB_INFO("%d ",IA[i]); RSB_INFO("\n");
1223 //err:
1224 	RSB_DO_ERR_RETURN(errval)
1225 #endif
1226 }
1227 
1228 #if 0
1229 static rsb_err_t rsb_do_fill_partially_rcsr_arrays_for_later(struct rsb_mtx_t * mtxAp,
1230 		const rsb_coo_idx_t * IL, const rsb_coo_idx_t * IR,
1231 		rsb_coo_idx_t * IA, rsb_coo_idx_t * JA,
1232 		//const rsb_coo_idx_t * IA, const rsb_coo_idx_t * JA,
1233 		rsb_nnz_idx_t nzoff, rsb_coo_idx_t m, rsb_coo_idx_t roff )
1234 {
1235 	/**
1236 		\ingroup gr_unfinished
1237 	 */
1238 	mtxAp->nzoff = nzoff;
1239 	mtxAp->bindx = IA+nzoff;
1240 	mtxAp->bpntr = NULL;
1241 #if RSB_WANT_FIRST_VERSION
1242 	RSB_COA_MEMMOVE(mtxAp->bindx,IL,0,roff,m+1);
1243 #endif /* RSB_WANT_FIRST_VERSION */
1244 #if RSB_WANT_FIRST_VERSION
1245 {
1246 	rsb_nnz_idx_t i;
1247 	for(i=mtxAp->roff;RSB_LIKELY(i<mtxAp->roff+mtxAp->nr);++i)
1248 	{
1249 		rsb_nnz_idx_t nnz1 = IR[i];
1250 		rsb_nnz_idx_t nnz0 = IL[i];
1251 //		RSB_C2R_ASSERT(IL[i-mtxAp->roff]>=IB[i]);
1252 //		RSB_C2R_ASSERT(IL[i-mtxAp->roff]<=IB[i+1]);
1253 //		RSB_C2R_ASSERT(IL[i-mtxAp->roff+1]>=IB[i+1]);
1254 		RSB_C2R_ASSERT(nnz0>=IL[i]);
1255 		RSB_C2R_ASSERT(nnz1<=IR[i]);
1256 		if(nnz1==nnz0)continue;
1257 //	       	RSB_C2R_ASSERT(nnz1<=IB[i+1]);
1258 		RSB_C2R_ASSERT(JA[nnz0+0]>=mtxAp->coff);
1259 		RSB_C2R_ASSERT(JA[nnz1-1]< mtxAp->coff+mtxAp->nc);
1260 	}
1261 	}
1262 #endif /* RSB_WANT_FIRST_VERSION */
1263 
1264 
1265 
1266 	return RSB_ERR_NO_ERROR;
1267 }
1268 #endif
1269 
1270 #if 0
1271 static rsb_err_t rsb_do_fill_rcsr_arrays_for_later(struct rsb_mtx_t * mtxAp,
1272 		const rsb_coo_idx_t * IL, const rsb_coo_idx_t * IR,
1273 	       	//const rsb_coo_idx_t * IA, const rsb_coo_idx_t * JA,
1274 	       	rsb_coo_idx_t * IA, rsb_coo_idx_t * JA,
1275 		rsb_nnz_idx_t nzoff, rsb_coo_idx_t m, rsb_coo_idx_t roff )
1276 {
1277 	/**
1278 		\ingroup gr_unfinished
1279 	 */
1280 	if(!IR)
1281 		IR = IL+1;
1282 	mtxAp->nzoff = nzoff;
1283 	mtxAp->bindx = IA+nzoff;
1284 	mtxAp->bpntr = IA+nzoff+m+1;
1285 #if RSB_C2R_WANT_MAYBE_FASTER
1286 	RSB_COA_MEMCPY_ROWSZ(mtxAp->bindx,IL,0,roff,m+1);
1287 	RSB_COA_MEMCPY_ROWSZ(mtxAp->bpntr,IR,0,roff,m+1);
1288 //	RSB_COA_MEMCPY(mtxAp->bindx,IL,0,roff,m+1);
1289 //	RSB_COA_MEMCPY(mtxAp->bpntr,IR,0,roff,m+1);
1290 //	RSB_COA_MEMCPY_parallel(mtxAp->bindx,IL,0,roff,m+1);
1291 //	RSB_COA_MEMCPY_parallel(mtxAp->bpntr,IR,0,roff,m+1);
1292 #else /* RSB_C2R_WANT_MAYBE_FASTER */
1293 	/* are we sure  we need MEMMOVE here ? FIXME */
1294 	RSB_COA_MEMMOVE(mtxAp->bindx,IL,0,roff,m+1);
1295 	RSB_COA_MEMMOVE(mtxAp->bpntr,IR,0,roff,m+1);
1296 #endif /* RSB_C2R_WANT_MAYBE_FASTER */
1297 
1298 #if RSB_C2R_PARANOIA
1299 	{
1300 	rsb_nnz_idx_t i;
1301 	for(i=0;i<mtxAp->nr;++i)
1302 	//for(i=mtxAp->roff;i<mtxAp->roff+mtxAp->nr;++i)
1303 	{
1304 		rsb_nnz_idx_t nnz1 = IR[i];
1305 		rsb_nnz_idx_t nnz0 = IL[i];
1306 //		RSB_C2R_ASSERT(IL[i-mtxAp->roff]>=IB[i]);
1307 //		RSB_C2R_ASSERT(IL[i-mtxAp->roff]<=IB[i+1]);
1308 //		RSB_C2R_ASSERT(IL[i-mtxAp->roff+1]>=IB[i+1]);
1309 		RSB_C2R_ASSERT(nnz0>=IL[i]);
1310 		RSB_C2R_ASSERT(nnz1<=IR[i]);
1311 		if(nnz1==nnz0)continue;
1312 //	       	RSB_C2R_ASSERT(nnz1<=IB[i+1]);
1313 		RSB_C2R_ASSERT(JA[nnz0+0]>=mtxAp->coff);
1314 		RSB_C2R_ASSERT(JA[nnz1-1]< mtxAp->coff+mtxAp->nc);
1315 	}
1316 	}
1317 #endif /* RSB_C2R_PARANOIA */
1318 	return RSB_ERR_NO_ERROR;
1319 }
1320 #endif
1321 
rsb_do_fill_early_leaf_matrix(struct rsb_mtx_t * mtxAp,struct rsb_mtx_t * submatrix,const rsb_coo_idx_t * IL,const rsb_coo_idx_t * IR,rsb_coo_idx_t * IA,rsb_coo_idx_t * JA,const rsb_coo_idx_t * VA,rsb_nnz_idx_t snzoff,rsb_nnz_idx_t nnz,rsb_coo_idx_t m,rsb_coo_idx_t k,rsb_coo_idx_t roff,rsb_coo_idx_t coff,rsb_type_t typecode,rsb_flags_t flags)1322 static rsb_err_t rsb_do_fill_early_leaf_matrix( struct rsb_mtx_t * mtxAp, struct rsb_mtx_t * submatrix,
1323 		const rsb_coo_idx_t * IL, const rsb_coo_idx_t * IR,
1324 		rsb_coo_idx_t * IA, rsb_coo_idx_t * JA, const rsb_coo_idx_t * VA,
1325 		//const rsb_coo_idx_t * IA, const rsb_coo_idx_t * JA, const rsb_coo_idx_t * VA,
1326 		rsb_nnz_idx_t snzoff, rsb_nnz_idx_t nnz, rsb_coo_idx_t m, rsb_coo_idx_t k,
1327 		rsb_coo_idx_t roff, rsb_coo_idx_t coff, rsb_type_t typecode, rsb_flags_t flags )
1328 {
1329 	/**
1330 		\ingroup gr_unfinished
1331 	 */
1332 	rsb_err_t errval = RSB_ERR_NO_ERROR;
1333 
1334 //	if(!IR)
1335 //		IR = IL+1;
1336 
1337 #if 0
1338 	/* 20131206 nowadays IR and IL are always NULL */
1339 	if(!RSB_DO_TOOFEWNNZFORRCSR(nnz,m) && IR && IL)
1340 	{
1341 		// the matrix could be split further: we fill it with info to continue, if necessary
1342 		RSB_DO_ERROR_CUMULATE(errval,rsb_do_fill_rcsr_arrays_for_later(submatrix,IL,IR,IA,JA,snzoff,m,roff));
1343 	}
1344 	else
1345 	if(!RSB_DO_TOOFEWNNZFORCSR(nnz,m) && IR && IL)
1346 	{
1347 		RSB_DO_ERROR_CUMULATE(errval,rsb_do_fill_partially_rcsr_arrays_for_later(submatrix,IL,IR,IA,JA,snzoff,m,roff));
1348 		//RSB_ERROR("nnz=%d ,m=%d ! what shall we do ?\n",nnz,m);
1349 		RSB_DO_FLAG_DEL(submatrix->flags,RSB_FLAG_WANT_BCSS_STORAGE);
1350 	}
1351 	else
1352 #endif
1353 	{
1354 		if(RSB_C2R_IF_VERBOSE)
1355 			RSB_INFO("building a very sparse recursive matrix\n");
1356 
1357 		/* no hope for CSR : however, full/half word COO will fit  */
1358 		submatrix->nzoff = snzoff;
1359 		submatrix->bindx = NULL;
1360 		submatrix->bpntr = NULL;
1361 		RSB_DO_FLAG_DEL(submatrix->flags,RSB_FLAG_WANT_BCSS_STORAGE);
1362 		//RSB_ERROR("nnz=%d ,m=%d ! what shall we do ?\n",nnz,m);
1363 	}
1364 	mtxAp->sm[roff?(coff?3:2):(coff?1:0)] = submatrix;
1365 	submatrix->roff = roff+mtxAp->roff;
1366 	submatrix->coff = coff+mtxAp->coff;
1367 	RSB_DO_ERROR_CUMULATE(errval,rsb__set_init_flags_and_stuff(submatrix,NULL,NULL,m,k,nnz,nnz,nnz,typecode,flags));
1368 //err:
1369 	RSB_DO_ERR_RETURN(errval)
1370 }
1371 
rsb_compar_rcsr_matrix_leftmost_first(const void * ap,const void * bp)1372 int rsb_compar_rcsr_matrix_leftmost_first(const void * ap, const void * bp)
1373 {
1374 	/**
1375 		\ingroup gr_internals
1376 		A compare function to be used with qsort.
1377 		TODO:RENAME: rsb_compar_rcsr_matrix_leftmost_first -> ?
1378 		Orders first by column, then by row.
1379 	*/
1380 	struct rsb_mtx_t *a = *(struct rsb_mtx_t **)ap;
1381 	struct rsb_mtx_t *b = *(struct rsb_mtx_t **)bp;
1382 	int ss = 1;	/* should swap results ? */
1383 	rsb_bool_t at;
1384 	rsb_bool_t bt;
1385 
1386 	at=!RSB_DO_FLAG_HAS(a->flags,RSB_FLAG_QUAD_PARTITIONING);
1387 	bt=!RSB_DO_FLAG_HAS(b->flags,RSB_FLAG_QUAD_PARTITIONING);
1388 
1389 	if(at && !bt)
1390 		return 1;
1391 	if(!at && bt)
1392 		return -1;
1393 
1394 	if(a->coff < b->coff)
1395 	{
1396 		RSB_SWAP(struct rsb_mtx_t *,a,b);
1397 		ss = -1;/* should swap results ! */
1398 	}
1399 
1400 	return (a->coff==b->coff)?(a->roff>b->roff?1:(a->roff<b->roff?-1:0)):1*ss;
1401 }
1402 
1403 #if 0
1404 /* 20121001 unfinished code: commented */
1405 static struct rsb_mtx_t * rsb_do_find_ffmltart(struct rsb_mtx_t ** submatricesp, rsb_submatrix_idx_t smn, struct rsb_mtx_t * submatrix, rsb_coo_idx_t off)
1406 {
1407 	/**
1408 		\ingroup gr_unfinished
1409 	*/
1410 	rsb_submatrix_idx_t smi = 0;
1411 	rsb_coo_idx_t coff = submatrix->coff;
1412 	rsb_coo_idx_t roff = submatrix->roff;
1413 	rsb_coo_idx_t m = submatrix->nr;
1414 	rsb_coo_idx_t k = submatrix->nc;
1415 	/* leftmost from right */
1416 	for(smi=0;smi<smn;++smi)
1417 		if(submatricesp[smi]->coff>=coff+k &&
1418 			       	submatricesp[smi]->roff<=off+0 && submatricesp[smi]->roff+submatricesp[smi]->nr>off)
1419 			return submatricesp[smi];
1420 	/* leftmost from left, the line after */
1421 	for(smi=0;smi<smn;++smi)
1422 		if(submatricesp[smi]->coff<coff &&
1423 			       	submatricesp[smi]->roff<=off+1 && submatricesp[smi]->roff+submatricesp[smi]->nr>off)
1424 			return submatricesp[smi];
1425 	return NULL;
1426 }
1427 #endif
1428 
rsb__should_recursively_partition_matrix(rsb_coo_idx_t mB,rsb_coo_idx_t kB,rsb_coo_idx_t m,rsb_coo_idx_t k,rsb_nnz_idx_t element_count,rsb_nnz_idx_t block_count,rsb_nnz_idx_t nnz,rsb_blk_idx_t Mdim,rsb_blk_idx_t mdim,rsb_coo_idx_t roff,rsb_coo_idx_t coff,rsb_flags_t flags,size_t el_size,rsb_thread_t wet)1429 static rsb_bool_t rsb__should_recursively_partition_matrix(
1430 	rsb_coo_idx_t mB, rsb_coo_idx_t kB,
1431 	rsb_coo_idx_t m, rsb_coo_idx_t k,
1432 	rsb_nnz_idx_t element_count,
1433 	rsb_nnz_idx_t block_count,
1434 	rsb_nnz_idx_t nnz,
1435 	rsb_blk_idx_t Mdim,
1436 	rsb_blk_idx_t mdim,
1437 	rsb_coo_idx_t roff,
1438 	rsb_coo_idx_t coff,
1439 	rsb_flags_t flags,
1440 	size_t el_size,
1441 	rsb_thread_t wet
1442 )
1443 {
1444 #if (RSB_EXPERIMENTAL_QUAD_DIVISION_POLICY == RSB_EXPERIMENTAL_QUAD_DIVISION_POLICY_NAIVE)
1445 	/*
1446 		\ingroup gr_internals
1447 		NEW: UNFINISHED
1448 		TODO : ERROR HANDLING, DOCS
1449 		TODO : should partition on high nnz per row count
1450 			although in this case if the matrix is highly
1451 			compact (almost banded) it won't help.
1452 			Therefore, some sparseness statistics in the matrix constructor
1453 			would be nice.
1454 	*/
1455 	//long cs = rsb__get_lastlevel_c_size();
1456 	//long cs = rsb__get_lastlevel_c_size_per_thread();
1457 	long cs=(rsb__get_lastlevel_c_size()/(wet>0?wet:rsb_get_num_threads()));
1458 	rsb_bool_t sp = RSB_BOOL_FALSE;	/* should partition */
1459 	rsb_fillin_t efillin=1.0;		/* FIXME */
1460 	size_t smab=0;			/* spmv memory accessed bytes */
1461 	//cs/=20000;
1462 	//cs/=20;
1463 	//cs/=4;
1464 	/* FIXME */
1465 	if(nnz<RSB_RECURSION_MIN_NNZ  || m<RSB_RECURSION_MIN_DIM  || k<RSB_RECURSION_MIN_DIM  || !RSB_DO_FLAG_HAS(flags,RSB_FLAG_QUAD_PARTITIONING))
1466 	{
1467 		sp = RSB_BOOL_FALSE;
1468 		goto done;
1469 	}
1470 
1471 	if(kB<1)kB=1;
1472 	if(mB<1)mB=1;
1473 
1474 #if 0
1475 	if(flags & RSB_FLAG_RECURSIVE_DOUBLE_DETECTED_CACHE)
1476 		cs*=2;
1477 
1478 	if(flags & RSB_FLAG_RECURSIVE_HALF_DETECTED_CACHE)
1479 		cs/=2;
1480 #endif
1481 
1482 	if( (flags & RSB_FLAG_RECURSIVE_SUBDIVIDE_MORE_ON_DIAG) && roff == coff )
1483 		cs/=2;
1484 	/* this will imply a more fine grained subdivision on the diagonal
1485 	 * FIXME : we could use a factor different than 2 !
1486 	 * */
1487 
1488 
1489 	/* subdivide at least until matrix indices can be compressed */
1490 	if((flags & RSB_FLAG_USE_HALFWORD_INDICES_CSR) && m>1 && k>1 &&
1491 //			nnz>1
1492 //			nnz>(cs/4)
1493 			nnz*el_size>2*cs
1494 			&& !rsb__do_is_candidate_size_for_halfword_csr(m,k,nnz,flags))
1495 		return RSB_BOOL_TRUE;
1496 	if((flags & RSB_FLAG_USE_HALFWORD_INDICES_COO) && m>1 && k>1 &&
1497 //			nnz>1
1498 //			nnz>(cs/4)
1499 			nnz*el_size>2*cs
1500 			&& !rsb__do_is_candidate_size_for_halfword_coo(m,k,flags))
1501 		return RSB_BOOL_TRUE;
1502 
1503 	if(cs>0)
1504 	{
1505 		smab = rsb_spmv_memory_accessed_bytes_(mB,kB,m,k,efillin*nnz,((efillin*nnz)/mB)/kB,m/mB,el_size);
1506 
1507 		if( 2*smab > 3*3*cs )	/* FIXME : overflow possible */
1508 			sp=1;
1509 		else
1510 		if(
1511 			/* FIXME! */
1512 			(((Mdim+mdim+m+k)*sizeof(rsb_coo_idx_t))
1513 			/(nnz*el_size)) > 8*cs
1514 		)
1515 			sp = RSB_BOOL_TRUE;
1516 		else
1517 			sp = RSB_BOOL_FALSE;
1518 	}
1519 	else
1520 	{
1521 		/* no cache info (FIXME: there should be no section like this one) */
1522 		if(
1523 			Mdim<8 || mdim<8 || m < 500
1524 			|| k < 500 || nnz < 200*100)
1525 			sp = RSB_BOOL_FALSE;
1526 		else
1527 			sp = RSB_BOOL_TRUE;
1528 	}
1529 #ifdef RSB_EXPERIMENTAL_ROWS_SUBDIVIDE_TO_CORES_NUM
1530 	/* STILL UNIMPLEMENTED */
1531 #endif /* RSB_EXPERIMENTAL_ROWS_SUBDIVIDE_TO_CORES_NUM */
1532 #if RSB_EXPERIMENTAL_NO_SUBDIVIDE_ON_MIN_NNZ_PER_ROW_OR_COLUMN
1533 	if(1)
1534 	{
1535 		rsb_nnz_idx_t nnzpr;
1536 		if( (flags&RSB_FLAG_WANT_COLUMN_MAJOR_ORDER) != 0 )
1537 			nnzpr=nnz/k;
1538 		else
1539 			nnzpr=nnz/m;
1540 
1541 		if( nnzpr < RSB_CONST_MIN_NNZ_PER_ROW_OR_COLUMN_PER_SUBMATRIX )
1542 			sp = RSB_BOOL_FALSE;
1543 	}
1544 #endif /* RSB_EXPERIMENTAL_NO_SUBDIVIDE_ON_MIN_NNZ_PER_ROW_OR_COLUMN */
1545 done:
1546 	return sp;
1547 #else /* (RSB_EXPERIMENTAL_QUAD_DIVISION_POLICY == RSB_EXPERIMENTAL_QUAD_DIVISION_POLICY_NAIVE) */
1548 	#error "should use a RSB_EXPERIMENTAL_QUAD_DIVISION_POLICY_NAIVE partitioning policy!"
1549 	return RSB_BOOL_FALSE;
1550 #endif /* (RSB_EXPERIMENTAL_QUAD_DIVISION_POLICY == RSB_EXPERIMENTAL_QUAD_DIVISION_POLICY_NAIVE) */
1551 }
1552 
rsb_do_copy_submatrix_coa(struct rsb_mtx_t * submatrix,void * VA,void * WA,rsb_coo_idx_t * IL,rsb_coo_idx_t * IR,size_t el_size,rsb_coo_idx_t n0,rsb_coo_idx_t i0,rsb_coo_idx_t m0)1553 static rsb_nnz_idx_t rsb_do_copy_submatrix_coa(struct rsb_mtx_t * submatrix, void * VA, void * WA, rsb_coo_idx_t * IL, rsb_coo_idx_t * IR, size_t el_size, rsb_coo_idx_t n0, rsb_coo_idx_t i0, rsb_coo_idx_t m0)
1554 {
1555 	rsb_nnz_idx_t i,n;
1556 	RSB_C2R_ASSERT(submatrix);
1557 	RSB_C2R_ASSERT(VA);
1558 	RSB_C2R_ASSERT(WA);
1559 	RSB_C2R_ASSERT(IL);
1560 	RSB_C2R_ASSERT(IR);
1561 	RSB_C2R_ASSERT(el_size>0);
1562 	RSB_C2R_ASSERT(n0>=0);
1563 	RSB_C2R_ASSERT(i0<m0);
1564 	submatrix->VA=((char*)VA)+el_size*submatrix->nzoff;
1565 	for(n=n0,i=i0;RSB_LIKELY(i<m0);n+=IR[i]-IL[i],++i)
1566 	{
1567 		RSB_C2R_ASSERT(n>=0);
1568 		RSB_A_MEMCPY_SMALL(WA,VA,submatrix->nzoff+n,IL[i],IR[i]-IL[i],el_size);
1569 	}
1570 	return n;
1571 }
1572 
1573 
rsb_do_pick_largest_open_matrix(struct rsb_mtx_t ** submatricesp,rsb_submatrix_idx_t smc)1574 static rsb_submatrix_idx_t rsb_do_pick_largest_open_matrix(struct rsb_mtx_t ** submatricesp, rsb_submatrix_idx_t smc)
1575 {
1576 	/*
1577 	 *	FIXME: NEW, UNFINISHED
1578 	 *	need a lock, too
1579 	 *	TODO:need a priority queue, here
1580 	 * */
1581 	rsb_submatrix_idx_t smi = 0,msmi = RSB_SUBM_IDX_MARKER;
1582 	rsb_nnz_idx_t maxnz = 0;
1583 	if(RSB_WANT_VERBOSE_SUBDIVISION)
1584 		if(smc==0)
1585 			RSB_INFO("warning: no largest open matrix among 0 matrices\n");
1586 	for(smi=0;smi<smc;++smi)
1587 	{
1588 		//RSB_INFO("looking %d : %d\n",smi,submatricesp[smi]->nnz);
1589 		/* FIXME: ">=" here is used to cope with diagonal implicit matrices (which could have nnz==0), too */
1590 		if(submatricesp[smi])
1591 		if(submatricesp[smi]->nnz>=maxnz)
1592 		{
1593 			maxnz = submatricesp[smi]->nnz;
1594 			msmi = smi;
1595 		}
1596 	}
1597 	return msmi;
1598 }
1599 
rsb_do_coo2rec_subdivide_parallel(void * VA,rsb_coo_idx_t * IA,rsb_coo_idx_t * JA,rsb_coo_idx_t m,rsb_coo_idx_t k,rsb_nnz_idx_t nnz,rsb_type_t typecode,const struct rsb_mtx_partitioning_info_t * pinfop,rsb_flags_t flags,rsb_err_t * errvalp,struct rsb_mtx_t ** submatricesp,struct rsb_mtx_t * mtxAp,const rsb_nnz_idx_t * IB,const rsb_nnz_idx_t * IX,rsb_coo_idx_t * IT,rsb_coo_idx_t * WA,rsb_submatrix_idx_t cmc,rsb_submatrix_idx_t omc,rsb_submatrix_idx_t tmc,rsb_thread_t wet,rsb_submatrix_idx_t * cmcp)1600 static rsb_err_t rsb_do_coo2rec_subdivide_parallel(void *VA, rsb_coo_idx_t * IA, rsb_coo_idx_t * JA, rsb_coo_idx_t m, rsb_coo_idx_t k, rsb_nnz_idx_t nnz, rsb_type_t typecode, const struct rsb_mtx_partitioning_info_t * pinfop, rsb_flags_t flags, rsb_err_t *errvalp, struct rsb_mtx_t ** submatricesp, struct rsb_mtx_t * mtxAp, const rsb_nnz_idx_t * IB, const rsb_nnz_idx_t * IX, rsb_coo_idx_t * IT, rsb_coo_idx_t * WA, rsb_submatrix_idx_t cmc, rsb_submatrix_idx_t omc, rsb_submatrix_idx_t tmc, rsb_thread_t wet, rsb_submatrix_idx_t *cmcp)
1601 {
1602 	/*
1603 	 	TODO: clean this up.
1604 		Note that rsb__set_num_threads is outlawed here.
1605 	 */
1606 	rsb_err_t errval = RSB_ERR_NO_ERROR;
1607 	size_t el_size = RSB_SIZEOF(typecode);
1608 	const rsb_nnz_idx_t ttlnz = nnz;		/* total nnz */
1609 	rsb_nnz_idx_t maxnz = nnz;			/* max encountered nnz for a leaf */
1610 	rsb_submatrix_idx_t stmc = RSB_MIN(tmc,wet);	/* submatrices total count */
1611 	rsb_submatrix_idx_t lmc = 1;			/* leaf matrix count */
1612 	rsb_time_t cpt = RSB_TIME_ZERO,dt = RSB_TIME_ZERO;	/* cpt overwrites mtxAp->cpt */
1613 	rsb_thread_t tn = rsb_get_num_threads();	/* threads number */
1614 	rsb_thread_t mtn = RSB_MAX(1,(rsb_thread_t)(rsb_global_session_handle.subdivision_multiplier*tn));
1615 	rsb_thread_t tnn = 1;				/* threads number */
1616 	rsb_float_t skew = ((rsb_float_t)(maxnz))/(nnz/wet);	/* if more than one, will limit scaling */
1617 	long cbs = rsb__get_cache_block_byte_size();
1618 
1619 	if(RSB_WANT_VERBOSE_SUBDIVISION)
1620 		RSB_INFO("serial substage subdivision of "),RSB_INFO_MATRIX_SUMMARY(mtxAp),RSB_INFO("\n");
1621 again:
1622 	#pragma omp parallel reduction(|:errval) shared(submatricesp) num_threads(tnn)
1623 	{
1624 		rsb_submatrix_idx_t smi = 0;
1625 		struct rsb_mtx_t * submatrix = NULL;
1626 #if RSB_WANT_OMP_RECURSIVE_KERNELS
1627 		rsb_thread_t th_id = omp_get_thread_num();
1628 #else /* RSB_WANT_OMP_RECURSIVE_KERNELS */
1629 		rsb_thread_t th_id = 0;
1630 #endif /* RSB_WANT_OMP_RECURSIVE_KERNELS */
1631 #if RSB_WANT_VERBOSE_SUBDIVISION2
1632 		if(th_id==0){
1633 			RSB_MTXASM_INFO("entering %d threads in subdivision phase\n",tnn);
1634 			RSB_MTXASM_INFO("entering %d threads in subdivision phase\n",tnn);}
1635 #endif /* RSB_WANT_VERBOSE_SUBDIVISION2 */
1636 iagain:
1637 		#pragma omp critical (rsb_coo2rsbsub_crs)
1638 		{
1639 			smi = rsb_do_pick_largest_open_matrix(submatricesp+cmc,omc);
1640 			if(smi != RSB_SUBM_IDX_MARKER)
1641 			{
1642 				smi += cmc;
1643 				submatrix = submatricesp[smi];
1644 				maxnz = submatrix->nnz;
1645 				/* RSB_ASSERT(nnz>=wet); */
1646 				skew = ((rsb_float_t)(maxnz))/((rsb_float_t)(nnz/wet));
1647 				RSB_ASSERT(!isinf(skew));
1648 				omc--;
1649 				if(smi!=cmc)
1650 				{
1651 #if 0
1652 			  		assert(submatricesp[smi]);
1653 			  		assert(submatricesp[cmc]);
1654 #endif
1655 					RSB_SWAP(struct rsb_mtx_t *,submatricesp[smi],submatricesp[cmc]);
1656 				}
1657 				++cmc;
1658 			}
1659 			else
1660 			{
1661 				submatrix = NULL;
1662 			}
1663 	 		if(RSB_WANT_VERBOSE_SUBDIVISION)
1664 			{
1665 				if(submatrix)
1666 					RSB_INFO("subdividing "),RSB_INFO_MATRIX_SUMMARY(submatrix),RSB_INFO(" (open:%d,closed:%d) for thread %d\n",omc,cmc,th_id);
1667 				else
1668 					RSB_INFO("no available submatrix (open:%d,closed:%d) for thread %d/%d\n",omc,cmc,th_id,tnn);
1669 			}
1670 		}
1671 	if((smi)!=RSB_SUBM_IDX_MARKER)
1672 	{
1673 #if 0
1674 	for(smi=0;RSB_LIKELY(omc>0);smi=cmc+omc-1)
1675 	while(submatrix)
1676 	if(omc>0 && ((submatrix=submatricesp[smi])!=NULL))
1677 #endif
1678 	{
1679 		rsb_coo_idx_t k = submatrix->nc;
1680 		rsb_coo_idx_t m = submatrix->nr;
1681 		rsb_coo_idx_t hk = RSB_MIDDLE(k);
1682 		rsb_coo_idx_t hm = RSB_MIDDLE(m);
1683 		rsb_nnz_idx_t ul = 0,ur = 0,ll = 0,lr = 0;
1684 		rsb_nnz_idx_t nnz = submatrix->nnz;
1685 		rsb_coo_idx_t roff = submatrix->roff;
1686 		rsb_coo_idx_t coff = submatrix->coff;
1687 		rsb_flags_t flags = submatrix->flags;
1688 		rsb_nnz_idx_t nzoff = submatrix->nzoff;
1689 		rsb_bool_t sqs = RSB_BOOL_FALSE;		/* should quad subdivide */
1690 		rsb_submatrix_idx_t smc = 0;	/* submatrices count */
1691 
1692 		if(RSB_C2R_IF_VERBOSE)
1693 			RSB_INFO("cmc:%d omc:%d smi:%d tmc=%d stmc=%d th_id=%d\n",cmc,omc,smi,tmc,stmc,th_id);
1694 
1695 		/* too few nonzeros for recursion (TODO: may change in the future) */
1696 		if(RSB_DO_TOOFEWNNZFORRCSR(nnz,m))
1697 #if RSB_WANT_SUBDIVISION_FIXES_20101120
1698 		if(!RSB_DO_FLAG_HAS(flags,RSB_FLAG_WANT_COO_STORAGE))
1699 #endif /* RSB_WANT_SUBDIVISION_FIXES_20101120 */
1700 		{
1701 			if(RSB_C2R_IF_VERBOSE)
1702 				RSB_INFO("matrix too sparse for RCSR: rejoining\n");
1703 			sqs = RSB_BOOL_FALSE;
1704 			goto nosqstest;
1705 		}
1706 
1707 		/* decide if the matrix is worth subdividing further (soft) */
1708 		sqs = rsb__should_recursively_partition_matrix(0,0,m,k,0,0,nnz,m,k,roff,coff,flags,el_size,mtn);
1709 #if RSB_WANT_SUBDIVISION_FIXES_20101120
1710 		if(nnz<RSB_RECURSION_MIN_NNZ  || m<RSB_RECURSION_MIN_DIM  || k<RSB_RECURSION_MIN_DIM  || !RSB_DO_FLAG_HAS(flags,RSB_FLAG_QUAD_PARTITIONING))
1711 		{
1712 			sqs = RSB_BOOL_FALSE;		/* a hard condition */
1713 			goto nosqstest;
1714 		}
1715 		else
1716 			if(cmc+omc<tmc)
1717 				if(skew>RSB_SUBDIVISION_SKEW_MAX)
1718 					sqs = RSB_BOOL_TRUE;	/* a soft condition */
1719 #endif /* RSB_WANT_SUBDIVISION_FIXES_20101120 */
1720 
1721 		if(!sqs)
1722 			if(RSB_DO_FLAG_HAS(flags,RSB_FLAG_RECURSIVE_MORE_LEAVES_THAN_THREADS))
1723 				if(wet>lmc)
1724 					sqs = RSB_BOOL_TRUE;
1725 
1726 		if(sqs)
1727 		{
1728 			rsb_bool_t awfcsr = RSB_BOOL_FALSE; /* all of the matrices will fit csr ? */
1729 #if RSB_WANT_SUBDIVISION_FIXES_20101120
1730 			rsb_nnz_idx_t mqnnz = RSB_MAX(RSB_MAX(ul,ur),RSB_MAX(lr,ll));
1731 #endif /* RSB_WANT_SUBDIVISION_FIXES_20101120 */
1732 
1733 			/* compute the split vector */
1734 			dt = - rsb_time();
1735 			if((errval = rsb_do_compute_vertical_split_search_only(IA,JA,roff,coff,m,k,hm,hk,nnz,IB,&ul,&ur,&ll,&lr))!=RSB_ERR_NO_ERROR)
1736 				;/* goto err; */
1737 			dt += rsb_time();
1738 			cpt += dt;
1739 			RSB_C2R_ASSERT(IR);
1740 			awfcsr = ( (ul>0 && RSB_DO_TOOFEWNNZFORCSR(ul,hm))   || (ur>0 && RSB_DO_TOOFEWNNZFORCSR(ur,hm)) || (lr>0 && RSB_DO_TOOFEWNNZFORCSR(lr,m-hm)) || (ll>0 && RSB_DO_TOOFEWNNZFORCSR(ll,m-hm)))?RSB_BOOL_TRUE:RSB_BOOL_FALSE;
1741 
1742 			if(awfcsr) /* FIXME: misleading naming ! */
1743 			{
1744 				/* if some leaf won't fit in CSR, we don't split anymore */
1745 				if(RSB_DO_FLAG_HAS(flags,RSB_FLAG_WANT_COO_STORAGE))
1746 					sqs = RSB_BOOL_TRUE;
1747 				else
1748 					sqs = RSB_BOOL_FALSE;
1749 				if(RSB_C2R_IF_VERBOSE)
1750 					RSB_INFO("no space for conversion of some leaf: rejoining ? %d\n",!sqs);
1751 			}
1752 
1753 #if RSB_WANT_SUBDIVISION_FIXES_20101120
1754 			/* an alternative would be to place this test in the branch above*/
1755 			if(	mqnnz>RSB_MAX_QUADRANTS_UNBALANCE*(nnz-mqnnz) &&
1756 				el_size*nnz<cbs &&
1757 				nnz < (ttlnz/wet) )
1758 				sqs = RSB_BOOL_FALSE;
1759 #endif /* RSB_WANT_SUBDIVISION_FIXES_20101120 */
1760 
1761 			/* how many submatrices out of four ? */
1762 			smc = (ul?1:0)+(ur?1:0)+(ll?1:0)+(lr?1:0);
1763 			if(cmc+omc+smc>tmc)
1764 			{
1765 				if(RSB_C2R_IF_VERBOSE)
1766 					RSB_INFO("too many submatrices (%d+%d>%d: rejoining\n",cmc+omc,smc,tmc);
1767 				sqs = RSB_BOOL_FALSE;
1768 				goto nosqstest;
1769 			}
1770 
1771 #if !RSB_WANT_SUBDIVISION_FIXES_20101120
1772 #ifdef RSB_FLAG_EXPERIMENTAL_NO_MICRO_LEAVES
1773 			if(RSB_DO_FLAG_HAS(flags,RSB_FLAG_EXPERIMENTAL_NO_MICRO_LEAVES))
1774 				if(wet<lmc-1)
1775 				{
1776 					if(RSB_C2R_IF_VERBOSE)
1777 						RSB_INFO("RSB_FLAG_EXPERIMENTAL_NO_MICRO_LEAVES: rejoining\n");
1778 					sqs = RSB_BOOL_FALSE;
1779 				}
1780 #endif /* RSB_FLAG_EXPERIMENTAL_NO_MICRO_LEAVES */
1781 #endif /* RSB_WANT_SUBDIVISION_FIXES_20101120 */
1782 
1783 			if(RSB_C2R_IF_VERBOSE)
1784 			RSB_ERROR("splitting %d/%d -> %d/%d %d/%d %d/%d %d/%d sqs? %d\n",nnz,m,ul,hm,ur,hm,ll,m-hm,lr,m-hm,sqs);
1785 			if(ul+ur+ll+lr != nnz)
1786 			{
1787 				if(RSB_C2R_IF_VERBOSE)
1788 					RSB_ERROR("%d ?= %d + %d + %d + %d = %d\n",nnz,ul,ur,ll,lr,ul+ur+ll+lr);
1789 				RSB_DO_ERROR_CUMULATE(errval,RSB_ERR_INTERNAL_ERROR);
1790 			}
1791 		}
1792 nosqstest:
1793 		if(sqs)
1794 		{
1795 			/* should quad-subdivide. let's take care of indices. */
1796 			rsb_nnz_idx_t snzoff = nzoff;
1797 			rsb_submatrix_idx_t smci = 0;
1798 			rsb_submatrix_idx_t smco = 0;
1799 			struct rsb_mtx_t*isms[4] = {NULL,NULL,NULL,NULL};
1800 
1801 			/*
1802 			the index arrays are copied/linked into the quadrants
1803 			some quadrants may seem ready for recursion, but they not result as such later on.
1804 			they will be made leaf later on, if necessary.
1805 			...
1806 			*/
1807 			RSB_C2R_ASSERT(ur>=0 && ul>=0 && lr>=0 && ll>=0);
1808 
1809 			#pragma omp critical (rsb_coo2rsbsub_crs)
1810 			{
1811 				if(cmc+omc+smc+RSB_SUBDIVISION_BUG_EXTRA>tmc)
1812 				{
1813 					if(RSB_C2R_IF_VERBOSE)
1814 						RSB_INFO("too many submatrices (%d+%d>%d): rejoining\n",cmc+omc,smc,tmc);
1815 					sqs = RSB_BOOL_FALSE;
1816 				}
1817 				else
1818 				{
1819 					lmc += smc;
1820 					lmc -= 1;
1821 					smco = cmc+omc;
1822 					snzoff = nzoff;
1823 					if(ul){ isms[0] = submatricesp[smco+smci];submatricesp[smco+smci] = NULL;smci++;}
1824 					if(ur){ isms[1] = submatricesp[smco+smci];submatricesp[smco+smci] = NULL;smci++;}
1825 					if(ll){ isms[2] = submatricesp[smco+smci];submatricesp[smco+smci] = NULL;smci++;}
1826 					if(lr){ isms[3] = submatricesp[smco+smci];submatricesp[smco+smci] = NULL;smci++;}
1827 					smci = 0;
1828 					omc += smc;
1829 				}
1830 			if(sqs)
1831 			{
1832 				if(ul)
1833 				RSB_DO_ERROR_CUMULATE(errval,rsb_do_fill_early_leaf_matrix(submatrix,isms[0],NULL,NULL,IA,JA,VA,snzoff,ul,hm,hk,0,0,typecode,flags)), snzoff += ul,++smci;
1834 				if(ur)
1835 				RSB_DO_ERROR_CUMULATE(errval,rsb_do_fill_early_leaf_matrix(submatrix,isms[1],NULL,NULL,IA,JA,VA,snzoff,ur,hm,k-hk,0,hk,typecode,flags)), snzoff += ur,++smci;
1836 				if(ll)
1837 				RSB_DO_ERROR_CUMULATE(errval,rsb_do_fill_early_leaf_matrix(submatrix,isms[2],NULL,NULL,IA,JA,VA,snzoff,ll,m-hm,hk,hm,0,typecode,flags)), snzoff += ll,++smci;
1838 				if(lr)
1839 				RSB_DO_ERROR_CUMULATE(errval,rsb_do_fill_early_leaf_matrix(submatrix,isms[3],NULL,NULL,IA,JA,VA,snzoff,lr,m-hm,k-hk,hm,hk,typecode,flags)), snzoff += lr,++smci;
1840 				RSB_NULL_STATEMENT_FOR_COMPILER_HAPPINESS
1841 			}
1842 
1843 			if(sqs)
1844 			{
1845 				smci = 0;
1846 				if(ul){ submatricesp[smco+smci] = isms[0];smci++;}
1847 				if(ur){ submatricesp[smco+smci] = isms[1];smci++;}
1848 				if(ll){ submatricesp[smco+smci] = isms[2];smci++;}
1849 				if(lr){ submatricesp[smco+smci] = isms[3];smci++;}
1850 			}
1851 
1852 			if(sqs)
1853 			{
1854 			if(snzoff-nzoff!=nnz)
1855 			{
1856 				/* is this a partition ? */
1857 				RSB_ERROR("%d - %d != %d ?= %d + %d + %d + %d = %d\n",snzoff,nzoff,nnz,ul,ur,ll,lr,ul+ur+ll+lr);
1858 				RSB_DO_ERROR_CUMULATE(errval,RSB_ERR_INTERNAL_ERROR);
1859 			}
1860 			if(RSB_SOME_ERROR(errval))
1861 			{
1862 				RSB_ERROR(RSB_ERRM_ES); /* goto err; */
1863 			}
1864 			RSB_DO_FLAG_ADD(submatrix->flags,RSB_FLAG_QUAD_PARTITIONING);
1865 			RSB_DO_FLAG_DEL(submatrix->flags,RSB_FLAG_NON_ROOT_MATRIX);
1866 			submatrix->bindx = NULL; submatrix->bpntr = NULL; submatrix->indptr = NULL;
1867 			}
1868 			}
1869 		}
1870 		if(!sqs)
1871 		{
1872 			RSB_DO_FLAG_SUBST(submatrix->flags,RSB_FLAG_QUAD_PARTITIONING,RSB_FLAG_NON_ROOT_MATRIX);
1873 			/* selecting a format and declaring as leaf */
1874 			if(!RSB_DO_TOOFEWNNZFORCSR(nnz,m) /*&& IR && IL*/)
1875 			{
1876 /*				RSB_INFO("CSR -> COO ?\n"); */
1877 				if(RSB_DO_FLAG_HAS(submatrix->flags,RSB_FLAG_WANT_BCSS_STORAGE))
1878 					RSB_DO_FLAG_DEL(submatrix->flags,RSB_FLAG_WANT_COO_STORAGE);
1879 				if((errval = rsb__do_set_init_storage_flags(submatrix,submatrix->flags))!=RSB_ERR_NO_ERROR)
1880 					;/* goto err; */
1881 			}
1882 			else
1883 			{
1884 /*				RSB_INFO("COO !\n"); */
1885 				rsb_flags_t sflags = flags;
1886 				RSB_DO_FLAG_SUBST(sflags,RSB_FLAG_WANT_BCSS_STORAGE,RSB_FLAG_WANT_COO_STORAGE);
1887 				if((errval = rsb__do_set_init_storage_flags(submatrix,sflags))!=RSB_ERR_NO_ERROR)
1888 					;/* goto err; */
1889 			}
1890 			if(RSB_C2R_IF_VERBOSE)
1891 				RSB_INFO("freezing %d ",smi+1),
1892 				RSB_INFO_MATRIX_SUMMARY(submatrix),
1893 				RSB_INFO("\n");
1894 		}
1895 		/* matrix is declared as 'closed'.
1896 		   sorting, in a way smi will point to the biggest open mtxAp, which will be picked up next */
1897 #if 0
1898 		qsort(submatricesp+cmc,(size_t)(omc),sizeof(struct rsb_mtx_t*),& rsb_compar_rcsr_matrix_regarding_nnz);
1899 		RSB_SWAP(struct rsb_mtx_t *,submatricesp[smi],submatricesp[cmc-1]);
1900 #endif
1901 	}
1902 		/*smi = cmc+omc-1; */
1903 	}
1904 		if(omc>0 && cmc+omc<stmc)
1905 			goto iagain;
1906 #if RSB_WANT_VERBOSE_SUBDIVISION2
1907 		if(th_id==0)
1908 		{
1909 			RSB_MTXASM_INFO("thread %d:terminating subdivision",th_id);
1910 			if(omc==0)
1911 			{RSB_MTXASM_INFO(", no more open matrices");}
1912 			RSB_MTXASM_INFO("(closed %d= %d nodes + %d leaves, out of %d available)",cmc,cmc-lmc,lmc,tmc);
1913 			RSB_MTXASM_INFO(",(maxnz=%d,skew=%g)",maxnz,skew);
1914 			if(cmc+omc>=stmc)
1915 			{RSB_MTXASM_INFO(", no room left for submatrices");}
1916 			RSB_MTXASM_INFO(".\n");
1917 		}
1918 #endif
1919 	} /* parallel */
1920 
1921 	if(RSB_SOME_ERROR(errval))
1922 		goto err;
1923 
1924 	#pragma omp barrier
1925 	if(stmc!=tmc)
1926 	{
1927 		stmc = tmc;
1928 		if(RSB_WANT_VERBOSE_SUBDIVISION)
1929 			RSB_INFO("parallel substage subdivision of "),RSB_INFO_MATRIX_SUMMARY(mtxAp),RSB_INFO("\n");
1930 		tnn = tn;
1931 		goto again;
1932 	}
1933 	else
1934 	{
1935 
1936 		if(RSB_WANT_VERBOSE_SUBDIVISION)
1937 			RSB_INFO("parallel substage subdivision of "),RSB_INFO_MATRIX_SUMMARY(mtxAp),RSB_INFO(" not required\n");
1938 	}
1939 	{
1940  		if(RSB_WANT_VERBOSE_SUBDIVISION)
1941 			RSB_INFO("subdivision of "),RSB_INFO_MATRIX_SUMMARY(mtxAp),RSB_INFO("complete \n");
1942 	}
1943 	mtxAp->cpt = cpt;
1944 
1945 	*cmcp = cmc;
1946 err:
1947 	RSB_DO_ERR_RETURN(errval)
1948 }
1949 
1950 #if 0
1951 static rsb_err_t rsb_do_coo2rec_subdivide(void *VA, rsb_coo_idx_t * IA, rsb_coo_idx_t * JA, rsb_coo_idx_t m, rsb_coo_idx_t k, rsb_nnz_idx_t nnz, rsb_type_t typecode, const struct rsb_mtx_partitioning_info_t * pinfop, rsb_flags_t flags, rsb_err_t *errvalp, struct rsb_mtx_t ** submatricesp, struct rsb_mtx_t * mtxAp, const rsb_nnz_idx_t * IB, const rsb_nnz_idx_t * IX, rsb_coo_idx_t * IT, rsb_coo_idx_t * WA, rsb_submatrix_idx_t cmc, rsb_submatrix_idx_t omc, rsb_submatrix_idx_t tmc, rsb_thread_t wet, rsb_submatrix_idx_t *cmcp)
1952 {
1953 	rsb_nnz_idx_t tdnnz = 0;
1954 	rsb_submatrix_idx_t smi = 0;/* max matrix count, done matrix count, submatrix index */
1955 	rsb_err_t errval = RSB_ERR_NO_ERROR;
1956 	size_t el_size = RSB_SIZEOF(typecode);
1957 	rsb_time_t cpt = RSB_TIME_ZERO,dt = RSB_TIME_ZERO;
1958 
1959 	for(smi=0;RSB_LIKELY(omc>0);smi=cmc+omc-1)
1960 	{
1961 		struct rsb_mtx_t * submatrix = submatricesp[smi];
1962 		rsb_coo_idx_t k = submatrix->nc;
1963 		rsb_coo_idx_t m = submatrix->nr;
1964 		rsb_coo_idx_t hk = (k+1)/2;
1965 		rsb_coo_idx_t hm = (m+1)/2;
1966 		rsb_nnz_idx_t ul = 0,ur = 0,ll = 0,lr = 0;
1967 		rsb_nnz_idx_t nnz = submatrix->nnz;
1968 		rsb_coo_idx_t roff = submatrix->roff;
1969 		rsb_coo_idx_t coff = submatrix->coff;
1970 		rsb_coo_idx_t*IL = submatrix->bindx;	// IL will be hosted here
1971 		rsb_coo_idx_t*IM = IT;			// IM will be hosted in a temporary vector
1972 		rsb_coo_idx_t*IR = submatrix->bpntr;	// IR will be hosted here
1973 		rsb_flags_t flags = submatrix->flags;
1974 		rsb_nnz_idx_t nzoff = submatrix->nzoff;
1975 		rsb_bool_t sqs = RSB_BOOL_FALSE;		// should quad subdivide
1976 		rsb_submatrix_idx_t smc = 0;	/* submatrices count */
1977 
1978 //		RSB_INFO("picked up %d/%d -> %d x %d, %d nnz, @ %d %d \n",smi+1,tmc,m,k,nnz,roff,coff);
1979 
1980 		if(!IL && !RSB_DO_FLAG_HAS(flags,RSB_FLAG_WANT_COO_STORAGE) )
1981 		{
1982 			/* if there is no line pointer, we make this submatrix leaf */
1983 			sqs = RSB_BOOL_FALSE;
1984 			if(RSB_C2R_IF_VERBOSE)
1985 				RSB_INFO("no split, as no line pointer found\n");
1986 			goto nosqstest;
1987 		}
1988 
1989 		if(/*!IL || */!IM)
1990 		{
1991 			/* if this happens, this is an error */
1992 			RSB_ERROR("IL:%p, IM:%p\n",IL,IM);
1993 			errval = RSB_ERR_INTERNAL_ERROR;
1994 			goto err;
1995 		}
1996 
1997 		/* too few nonzeros for recursion (TODO: may change in the future) */
1998 		if(RSB_DO_TOOFEWNNZFORRCSR(nnz,m)
1999 				/*
2000 				 * Uncommenting the following allows subdivision for very spare matrices
2001 				 * However, this feature is unfinished and bugful (segfault risk)
2002 				 * */
2003 			       /*	&& !RSB_DO_FLAG_HAS(flags,RSB_FLAG_WANT_COO_STORAGE) */
2004 				)
2005 		{
2006 			if(RSB_C2R_IF_VERBOSE)
2007 				RSB_INFO("matrix too sparse for RCSR: rejoining\n");
2008 
2009 			sqs = RSB_BOOL_FALSE;
2010 			goto nosqstest;
2011 		}
2012 
2013 		/* decide if the matrix is worth subdividing further */
2014 		sqs = rsb__should_recursively_partition_matrix(0,0,m,k,0,0,nnz,m,k,roff,coff,flags,el_size,0);
2015 
2016 		/* if we want subdivision  */
2017 
2018 		if(!sqs)
2019 		if(RSB_DO_FLAG_HAS(flags,RSB_FLAG_RECURSIVE_MORE_LEAVES_THAN_THREADS))
2020 		if(wet>cmc+1+smc)/* /FIXME : this may not terminate! */
2021 			sqs = RSB_BOOL_TRUE;
2022 
2023 		if(sqs)
2024 		{
2025 			rsb_bool_t awfcsr = RSB_BOOL_FALSE; /* all of the matrices will fit csr ? */
2026 
2027 			// compute the split vector
2028 			dt = - rsb_time();
2029 			if((!RSB_DO_TOOFEWNNZFORCSR(nnz,m)) && IR && IL)
2030 			{if((errval = rsb_do_compute_vertical_split(IA,JA,roff,coff,m,k,hm,hk,nnz,IL,IM,IR,&ul,&ur,&ll,&lr))!=RSB_ERR_NO_ERROR) goto err;}
2031 			else
2032 			{
2033 				if(RSB_C2R_IF_VERBOSE)
2034 					RSB_INFO("using the sparse splitter\n");
2035 				if((errval = rsb_do_compute_vertical_split_search_only(IA,JA,roff,coff,m,k,hm,hk,nnz,IB,&ul,&ur,&ll,&lr))!=RSB_ERR_NO_ERROR) goto err;
2036 			}
2037 			dt += rsb_time();
2038 			cpt += dt;
2039 			RSB_C2R_ASSERT(IR);
2040 			awfcsr = ( (ul>0 && RSB_DO_TOOFEWNNZFORCSR(ul,hm))   || (ur>0 && RSB_DO_TOOFEWNNZFORCSR(ur,hm)) || (lr>0 && RSB_DO_TOOFEWNNZFORCSR(lr,m-hm)) || (ll>0 && RSB_DO_TOOFEWNNZFORCSR(ll,m-hm)))?RSB_BOOL_TRUE:RSB_BOOL_FALSE;
2041 
2042 			// after computing the split vector, we can still resign from subdividing
2043 			// especially if some submatrix is deemed too small and the overall submatrices count is enough
2044 			// TODO: if(rsb__should_rejoin_small_leaf(...
2045 			// ...
2046 
2047 			/* is there room for these additional submatrices ? */
2048 //			if( (ul>0 && RSB_DO_TOOFEWNNZFORRCSR(ul,hm))   || (ur>0 && RSB_DO_TOOFEWNNZFORRCSR(ur,hm)) || (lr>0 && RSB_DO_TOOFEWNNZFORRCSR(lr,m-hm)) || (ll>0 && RSB_DO_TOOFEWNNZFORRCSR(ll,m-hm)))
2049 			if(awfcsr)
2050 			{
2051 				/* if some leaf won't fit in CSR, we don't split anymore */
2052 				if(RSB_DO_FLAG_HAS(flags,RSB_FLAG_WANT_COO_STORAGE))
2053 					sqs = RSB_BOOL_TRUE;
2054 				else
2055 					sqs = RSB_BOOL_FALSE;
2056 
2057 				if(RSB_C2R_IF_VERBOSE)
2058 					RSB_INFO("no space for conversion of some leaf: rejoining ? %d\n",!sqs);
2059 			}
2060 
2061 			/* how many submatrices out of four ? */
2062 			smc = (ul?1:0)+(ur?1:0)+(ll?1:0)+(lr?1:0);
2063 			if(cmc+omc+smc>tmc)
2064 			{
2065 				if(RSB_C2R_IF_VERBOSE)
2066 					RSB_INFO("too many submatrices: rejoining\n");
2067 				sqs = RSB_BOOL_FALSE;
2068 			}
2069 
2070 #ifdef RSB_FLAG_EXPERIMENTAL_NO_MICRO_LEAVES
2071 			/*
2072  			  if we want to avoid micro leaves, we could stop here
2073  			  FIXME: we need a better criteria (for proper load balancing!)
2074  			*/
2075 			if(RSB_DO_FLAG_HAS(flags,RSB_FLAG_EXPERIMENTAL_NO_MICRO_LEAVES))
2076 				if(wet<cmc)
2077 				{
2078 					if(RSB_C2R_IF_VERBOSE)
2079 						RSB_INFO("RSB_FLAG_EXPERIMENTAL_NO_MICRO_LEAVES: rejoining\n");
2080 					sqs = RSB_BOOL_FALSE;
2081 				}
2082 #endif /* RSB_FLAG_EXPERIMENTAL_NO_MICRO_LEAVES */
2083 
2084 			if(RSB_C2R_IF_VERBOSE)
2085 			RSB_INFO("splitting %d/%d -> %d/%d %d/%d %d/%d %d/%d sqs? %d\n",nnz,m,ul,hm,ur,hm,ll,m-hm,lr,m-hm,sqs);
2086 		}
2087 
2088 nosqstest:
2089 		omc--;
2090 		if(smi!=cmc)
2091 			RSB_SWAP(struct rsb_mtx_t *,submatricesp[smi],submatricesp[cmc]);
2092 		++cmc;
2093 		if(sqs)
2094 		{
2095 			/* should quad-subdivide. let's take care of indices. */
2096 			rsb_nnz_idx_t snzoff = nzoff;
2097 
2098 			// the index arrays are copied/linked into the quadrants
2099 			// some quadrants may seem ready for recursion, but they not result as such later on.
2100 			// they will be made leaf later on, if necessary.
2101 			// ...
2102 			RSB_C2R_ASSERT(ur>=0 && ul>=0 && lr>=0 && ll>=0);
2103 
2104 #if RSB_C2R_WANT_MAYBE_FASTER
2105 			if(IL)
2106 				RSB_COA_MEMCPY_ROWSZ(IX,IL,0  ,0,m+1),
2107 				IL = IX;
2108 			if(IR)
2109 				RSB_COA_MEMCPY_ROWSZ(IX,IR,m+1,0,m+1),
2110 				IR = IX+m+1;
2111 #else /* RSB_C2R_WANT_MAYBE_FASTER */
2112 			if(IL)
2113 				RSB_COA_MEMMOVE(IX,IL,0  ,0,m+1),
2114 				IL = IX;
2115 			if(IR)
2116 				RSB_COA_MEMMOVE(IX,IR,m+1,0,m+1),
2117 				IR = IX+m+1;
2118 #endif /* RSB_C2R_WANT_MAYBE_FASTER */
2119 
2120 			if(ul)
2121 				RSB_DO_ERROR_CUMULATE(errval,rsb_do_fill_early_leaf_matrix(submatrix,submatricesp[cmc+omc],IL,IM,IA,JA,VA,snzoff,ul,hm,hk,0,0,typecode,flags), ++omc, snzoff += ul);
2122 			if(ur)
2123 				RSB_DO_ERROR_CUMULATE(errval,rsb_do_fill_early_leaf_matrix(submatrix,submatricesp[cmc+omc],IM,IR,IA,JA,VA,snzoff,ur,hm,k-hk,0,hk,typecode,flags), ++omc, snzoff += ur);
2124 			if(ll)
2125 				RSB_DO_ERROR_CUMULATE(errval,rsb_do_fill_early_leaf_matrix(submatrix,submatricesp[cmc+omc],IL,IM,IA,JA,VA,snzoff,ll,m-hm,hk,hm,0,typecode,flags), ++omc, snzoff += ll);
2126 			if(lr)
2127 				RSB_DO_ERROR_CUMULATE(errval,rsb_do_fill_early_leaf_matrix(submatrix,submatricesp[cmc+omc],IM,IR,IA,JA,VA,snzoff,lr,m-hm,k-hk,hm,hk,typecode,flags), ++omc, snzoff += lr);
2128 
2129 			if(snzoff-nzoff!=nnz)
2130 			{
2131 				/* is this a partition ? */
2132 				RSB_ERROR("%d - %d != %d ?= %d + %d + %d + %d = %d\n",snzoff,nzoff,nnz,ul,ur,ll,lr,ul+ur+ll+lr);
2133 				errval = RSB_ERR_INTERNAL_ERROR; goto err;
2134 			}
2135 			if(RSB_SOME_ERROR(errval))
2136 			{
2137 				RSB_ERROR(RSB_ERRM_ES); goto err;
2138 			}
2139 			RSB_DO_FLAG_ADD(submatrix->flags,RSB_FLAG_QUAD_PARTITIONING);
2140 			RSB_DO_FLAG_DEL(submatrix->flags,RSB_FLAG_NON_ROOT_MATRIX);
2141 			submatrix->bindx = NULL; submatrix->bpntr = NULL; submatrix->indptr = NULL;
2142 		}
2143 		else
2144 		{
2145 			RSB_DO_FLAG_SUBST(submatrix->flags,RSB_FLAG_QUAD_PARTITIONING,RSB_FLAG_NON_ROOT_MATRIX);
2146 			// we should decide a format, and proceed declaring it as leaf
2147 			if(!RSB_DO_TOOFEWNNZFORRCSR(nnz,m) && IR && IL)
2148 			{
2149 				if(RSB_DO_FLAG_HAS(submatrix->flags,RSB_FLAG_WANT_BCSS_STORAGE))
2150 					RSB_DO_FLAG_DEL(submatrix->flags,RSB_FLAG_WANT_COO_STORAGE);
2151 #if RSB_WANT_LITTLE_IMPROVED
2152 				if(submatrix==mtxAp)/*  root only */
2153 					/* FIXME: TODO: IR is NOT needed AT ALL!  */
2154 					rsb_do_fill_rcsr_arrays_for_later(submatrix,IL,IR,IA,JA,nzoff,m,0);
2155 #else /* RSB_WANT_LITTLE_IMPROVED */
2156 					rsb_do_fill_rcsr_arrays_for_later(submatrix,IL,IR,IA,JA,nzoff,m,0);
2157 #endif /* RSB_WANT_LITTLE_IMPROVED */
2158 				if((errval = rsb__do_set_init_storage_flags(submatrix,submatrix->flags))!=RSB_ERR_NO_ERROR)
2159 					goto err;
2160 				submatrix->VA = VA;	// FIXME: we will place pointers to partially swapped VA, here.
2161 			}
2162 			else
2163 			if(!RSB_DO_TOOFEWNNZFORCSR(nnz,m) /*&& IR && IL*/)
2164 			{
2165 //				RSB_INFO("CSR -> COO ?\n");
2166 				if(RSB_DO_FLAG_HAS(submatrix->flags,RSB_FLAG_WANT_BCSS_STORAGE))
2167 					RSB_DO_FLAG_DEL(submatrix->flags,RSB_FLAG_WANT_COO_STORAGE);
2168 				if((errval = rsb__do_set_init_storage_flags(submatrix,submatrix->flags))!=RSB_ERR_NO_ERROR)
2169 					goto err;
2170 			}
2171 			else
2172 			{
2173 //				RSB_INFO("COO !\n");
2174 				rsb_flags_t sflags = flags;
2175 				RSB_DO_FLAG_SUBST(sflags,RSB_FLAG_WANT_BCSS_STORAGE,RSB_FLAG_WANT_COO_STORAGE);
2176 				if((errval = rsb__do_set_init_storage_flags(submatrix,sflags))!=RSB_ERR_NO_ERROR)
2177 					goto err;
2178 			}
2179 			if(RSB_C2R_IF_VERBOSE)
2180 				RSB_INFO("freezing %d ",smi+1),
2181 				RSB_INFO_MATRIX_SUMMARY(submatrix),
2182 				RSB_INFO("\n");
2183 		}
2184 		// matrix is declared as 'closed'.
2185 		// sorting, in a way smi will point to the biggest open mtxAp, which will be picked up next
2186 		qsort(submatricesp+cmc,(size_t)(omc),sizeof(struct rsb_mtx_t*),& rsb_compar_rcsr_matrix_regarding_nnz);
2187 		// FIXME: a priority queue would do the job, here
2188 	}
2189 	mtxAp->cpt = cpt;
2190 err:
2191 	*cmcp = cmc;
2192 	RSB_DO_ERR_RETURN(errval)
2193 }
2194 #endif
2195 
rsb_do_coo2rec_shuffle(void * VA,rsb_coo_idx_t * IA,rsb_coo_idx_t * JA,rsb_coo_idx_t m,rsb_coo_idx_t k,rsb_nnz_idx_t nnz,rsb_type_t typecode,const struct rsb_mtx_partitioning_info_t * pinfop,rsb_flags_t flags,rsb_err_t * errvalp,struct rsb_mtx_t ** submatricesp,struct rsb_mtx_t * mtxAp,const rsb_nnz_idx_t * IB,rsb_coo_idx_t * WA,rsb_submatrix_idx_t cmc)2196 static rsb_err_t rsb_do_coo2rec_shuffle(void *VA, rsb_coo_idx_t * IA, rsb_coo_idx_t * JA, rsb_coo_idx_t m, rsb_coo_idx_t k, rsb_nnz_idx_t nnz, rsb_type_t typecode, const struct rsb_mtx_partitioning_info_t * pinfop, rsb_flags_t flags, rsb_err_t *errvalp, struct rsb_mtx_t ** submatricesp, struct rsb_mtx_t * mtxAp, const rsb_nnz_idx_t * IB, rsb_coo_idx_t * WA, rsb_submatrix_idx_t cmc)
2197 {
2198 	rsb_nnz_idx_t tdnnz = 0;
2199 	rsb_submatrix_idx_t smi = 0;/* max matrix count, done matrix count, submatrix index */
2200 	rsb_err_t errval = RSB_ERR_NO_ERROR;
2201 	size_t el_size = RSB_SIZEOF(typecode);
2202 #if RSB_WANT_VERBOSE_TIMINGS
2203 	rsb_time_t pmt = RSB_TIME_ZERO;
2204 #endif /* RSB_WANT_VERBOSE_TIMINGS */
2205 
2206 	if(!VA || cmc==1)
2207 	{
2208 		mtxAp->VA = VA;
2209 		goto no_va_cp;
2210 	}
2211 
2212 	// the following is a highly parallel phase
2213 	#pragma omp parallel for schedule(static,1) reduction(|:errval)  shared(tdnnz,submatricesp,IB) RSB_NTC
2214 	for(smi=0;smi<cmc;++smi)
2215 	{
2216 		struct rsb_mtx_t * submatrix = submatricesp[smi];
2217 		rsb_coo_idx_t*IL = submatrix->bindx;
2218 		rsb_coo_idx_t*IR = submatrix->bpntr;
2219 		rsb_nnz_idx_t dnnz = 0;
2220 		rsb_coo_idx_t i;
2221 //		if(RSB_C2R_IF_VERBOSE)
2222 //		RSB_INFO("%d -> %d\n",smi,omp_get_thread_num());
2223 		if(!RSB_DO_FLAG_HAS((submatrix->flags),RSB_FLAG_QUAD_PARTITIONING))
2224 		{
2225 			if(RSB_DO_ENOUGHNNZFORINDEXBASEDBUILD(submatrix) && !rsb__is_coo_matrix(submatrix) && IL && IR)
2226 			{
2227 				RSB_C2R_ASSERT(IL);
2228 				RSB_C2R_ASSERT(IR);
2229 				dnnz = rsb_do_copy_submatrix_coa(submatrix,VA,WA,IL,IR,el_size,0,0,submatrix->nr);
2230 #if 0
2231 				for(i=submatrix->roff;RSB_LIKELY(i<submatrix->roff+submatrix->nr);++i)
2232 				{
2233 					rsb_nnz_idx_t nnz1 = IR[i-submatrix->roff];
2234 					rsb_nnz_idx_t nnz0 = IL[i-submatrix->roff];
2235 					RSB_C2R_ASSERT(IL[i-submatrix->roff]>=IB[i]);
2236 					RSB_C2R_ASSERT(IL[i-submatrix->roff]<=IB[i+1]);
2237 					RSB_C2R_ASSERT(IL[i-submatrix->roff+1]>=IB[i+1]);
2238 					RSB_C2R_ASSERT(nnz0>=IL[i-submatrix->roff]);
2239 					RSB_C2R_ASSERT(nnz1<=IL[i-submatrix->roff+1]);
2240 				       	RSB_C2R_ASSERT(nnz1<=IB[i+1]);
2241 					if(IB[i]==IB[i+1])continue;
2242 					RSB_C2R_ASSERT(nnz1>=nnz0);
2243 					if(nnz1==nnz0)continue;
2244 					RSB_C2R_ASSERT(JA[nnz0+0]>=submatrix->coff);
2245 					RSB_C2R_ASSERT(JA[nnz1-1]< submatrix->coff+submatrix->nc);
2246 				}
2247 #endif
2248 			}
2249 			else
2250 			if(!RSB_DO_TOOFEWNNZFORCSR(submatrix->nnz,submatrix->nr))
2251 			{
2252 				//rsb_coo_idx_t*IL = submatrix->bindx;
2253 				for(i=submatrix->roff;RSB_LIKELY(i<submatrix->roff+submatrix->nr);++i)
2254 				{
2255 					//rsb_nnz_idx_t fel;
2256 					// offset of line i in the global line pointers array
2257 					rsb_nnz_idx_t nnz0 = IB[i];
2258 					// nnz1..nnz0 are the boundaries of line i
2259 					rsb_nnz_idx_t nnz1 = IB[i+1];
2260 					// check
2261 					RSB_C2R_ASSERT(nnz0>=IB[i]);
2262 					RSB_C2R_ASSERT(nnz1<=IB[i+1]);
2263 					// skip line if empty
2264 					if(nnz1-nnz0<1)continue;
2265 					// find first element of line i also in the submatrix
2266 					nnz0 += rsb__nnz_split_coo_bsearch(JA+nnz0,submatrix->coff,nnz1-nnz0);
2267 					// skip line if empty in the submatrix
2268 					if(nnz1-nnz0<1)continue;
2269 					// find the length of the subrow i in the submatrix
2270 					nnz1 = nnz0+rsb__nnz_split_coo_bsearch(JA+nnz0,submatrix->coff+submatrix->nc,nnz1-nnz0);
2271 					//check
2272 					RSB_C2R_ASSERT(JA[nnz0+0]>=submatrix->coff);
2273 					// skip line if empty in the submatrix
2274 					if(nnz1-nnz0<1)continue;
2275 					// nnz1 .. nnz0 contain nonempty subrow i in the submatrix
2276 //					RSB_INFO("i:%d, %d..%d -> %d\n",i,nnz0,nnz1-1,submatrix->nzoff+dnnz);
2277 					// checks
2278 //					RSB_C2R_ASSERT(IL[i-submatrix->roff]>=IB[i]);
2279 //					RSB_C2R_ASSERT(IL[i-submatrix->roff]<=IB[i+1]);
2280 //					RSB_C2R_ASSERT(IL[i-submatrix->roff+1]>=IB[i+1]);
2281 //					RSB_C2R_ASSERT(nnz0>=IL[i-submatrix->roff]);
2282 //					RSB_C2R_ASSERT(nnz1<=IL[i-submatrix->roff+1]);
2283 				       	RSB_C2R_ASSERT(nnz1<=IB[i+1]);
2284 					RSB_C2R_ASSERT(JA[nnz0+0]>=submatrix->coff);
2285 					RSB_C2R_ASSERT(JA[nnz1-1]< submatrix->coff+submatrix->nc);
2286 					// perform the copy
2287 					RSB_A_MEMCPY_SMALL(WA,VA,submatrix->nzoff+dnnz,nnz0,nnz1-nnz0,el_size);
2288 					//RSB_COA_MEMCPY(WA,JA,submatrix->nzoff+dnnz,nnz0,nnz1-nnz0);
2289 					// update the actual offset in the destination array
2290 					dnnz += nnz1-nnz0;
2291 				}
2292 			}
2293 			else
2294 			{
2295 				//rsb_coo_idx_t*IL = submatrix->bindx;
2296 				for(i=submatrix->roff;RSB_LIKELY(i<submatrix->roff+submatrix->nr);++i)
2297 				{
2298 					//rsb_nnz_idx_t fel;
2299 					// offset of line i in the global line pointers array
2300 					rsb_nnz_idx_t nnz0 = IB[i];
2301 					// nnz1..nnz0 are the boundaries of line i
2302 					rsb_nnz_idx_t nnz1 = IB[i+1];
2303 					// check
2304 					RSB_C2R_ASSERT(nnz0>=IB[i]);
2305 					RSB_C2R_ASSERT(nnz1<=IB[i+1]);
2306 					// skip line if empty
2307 					if(nnz1-nnz0<1)continue;
2308 					// find first element of line i also in the submatrix
2309 					nnz0 += rsb__nnz_split_coo_bsearch(JA+nnz0,submatrix->coff,nnz1-nnz0);
2310 					// skip line if empty in the submatrix
2311 					if(nnz1-nnz0<1)continue;
2312 					// find the length of the subrow i in the submatrix
2313 					nnz1 = nnz0+rsb__nnz_split_coo_bsearch(JA+nnz0,submatrix->coff+submatrix->nc,nnz1-nnz0);
2314 					//check
2315 					RSB_C2R_ASSERT(JA[nnz0+0]>=submatrix->coff);
2316 					// skip line if empty in the submatrix
2317 					if(nnz1-nnz0<1)continue;
2318 					// nnz1 .. nnz0 contain nonempty subrow i in the submatrix
2319 //					RSB_INFO("i:%d, %d..%d -> %d\n",i,nnz0,nnz1-1,submatrix->nzoff+dnnz);
2320 					// checks
2321 //					RSB_C2R_ASSERT(IL[i-submatrix->roff]>=IB[i]);
2322 //					RSB_C2R_ASSERT(IL[i-submatrix->roff]<=IB[i+1]);
2323 //					RSB_C2R_ASSERT(IL[i-submatrix->roff+1]>=IB[i+1]);
2324 //					RSB_C2R_ASSERT(nnz0>=IL[i-submatrix->roff]);
2325 //					RSB_C2R_ASSERT(nnz1<=IL[i-submatrix->roff+1]);
2326 				       	RSB_C2R_ASSERT(nnz1<=IB[i+1]);
2327 					RSB_C2R_ASSERT(JA[nnz0+0]>=submatrix->coff);
2328 					RSB_C2R_ASSERT(JA[nnz1-1]< submatrix->coff+submatrix->nc);
2329 					// perform the copy
2330 					RSB_A_MEMCPY_SMALL(WA,VA,submatrix->nzoff+dnnz,nnz0,nnz1-nnz0,el_size);
2331 					//RSB_COA_MEMCPY(WA,JA,submatrix->nzoff+dnnz,nnz0,nnz1-nnz0);
2332 					// update the actual offset in the destination array
2333 					dnnz += nnz1-nnz0;
2334 				}
2335 			}
2336 			if(dnnz!=submatrix->nnz)
2337 			{
2338 				RSB_ERROR("@%d,%d: found %d, should have found %d\n",
2339 						submatrix->roff, submatrix->coff, dnnz,submatrix->nnz);
2340 				RSB_DO_ERROR_CUMULATE(errval,RSB_ERR_INTERNAL_ERROR);
2341 			}
2342 			#pragma omp critical (rsb_coo2rsb_nzinc_crs)
2343 			{tdnnz += dnnz;}
2344 		}
2345 	}
2346 //gerr:
2347 	RSB_NULL_STATEMENT_FOR_COMPILER_HAPPINESS
2348 #if   !defined(__xlC__)
2349 	/* FIXME: xlc does not allow this, but we have experienced problems, without */
2350 	#pragma omp barrier
2351 #endif /* __xlC__ */
2352 	if(RSB_SOME_ERROR(errval))
2353 	{
2354 		RSB_PERR_GOTO(err,RSB_ERRM_NL);
2355 	}
2356 
2357 	if(tdnnz!=nnz)
2358 	{
2359 		RSB_ERROR("found %d, should have found %d\n", tdnnz,nnz);
2360 		errval = RSB_ERR_INTERNAL_ERROR;
2361 	       	goto err;
2362 	}
2363 
2364 #if RSB_WANT_VERBOSE_TIMINGS
2365 	pmt -= rsb_time();
2366 #endif /* RSB_WANT_VERBOSE_TIMINGS */
2367 	RSB_A_MEMCPY_parallel(VA,WA,0,0,nnz,el_size);
2368 #if RSB_WANT_VERBOSE_TIMINGS
2369 	pmt += rsb_time();
2370 #endif /* RSB_WANT_VERBOSE_TIMINGS */
2371 no_va_cp:
2372 
2373 	tdnnz = 0;
2374 	#pragma omp parallel for schedule(static,1) reduction(|:errval)  shared(tdnnz,submatricesp,IB) RSB_NTC
2375 	for(smi=0;smi<cmc;++smi)
2376 	{
2377 		struct rsb_mtx_t * submatrix = submatricesp[smi];
2378 		if(!RSB_DO_FLAG_HAS(submatrix->flags,RSB_FLAG_QUAD_PARTITIONING))
2379 		{
2380 			rsb_coo_idx_t oll,nll;
2381 			rsb_coo_idx_t i;
2382 			rsb_coo_idx_t*IR = submatrix->bpntr;
2383 			rsb_coo_idx_t*IL = submatrix->bindx;
2384 			rsb_nnz_idx_t dnnz = 0;
2385 
2386 			//if(!RSB_DO_TOOFEWNNZFORRCSR(submatrix->nnz,submatrix->nr))
2387 			if(RSB_DO_ENOUGHNNZFORINDEXBASEDBUILD(submatrix) && !rsb__is_coo_matrix(submatrix) && IL && IR)
2388 			{
2389 				if(RSB_C2R_IF_VERBOSE)
2390 				RSB_INFO("CSR:%d/%d:%d..%d\n",smi,cmc,submatrix->nzoff,submatrix->nzoff+submatrix->nnz);
2391 				submatrix->bpntr = IA+submatrix->nzoff;
2392 				RSB_C2R_ASSERT(IL); RSB_C2R_ASSERT(IR);
2393 				RSB_C2R_ASSERT(IR< IA+submatrix->nzoff+submatrix->nnz);
2394 				RSB_C2R_ASSERT(IL< IA+submatrix->nzoff+submatrix->nnz);
2395 				RSB_C2R_ASSERT(IR>=IA+submatrix->nzoff);
2396 				RSB_C2R_ASSERT(IL>=IA+submatrix->nzoff);
2397 				for(dnnz=0,i=0;RSB_LIKELY(i<submatrix->nr);dnnz += IR[i]-IL[i],++i)
2398 					RSB_COA_MEMCPY_SMALL(WA,JA,submatrix->nzoff+dnnz,IL[i],IR[i]-IL[i]);
2399 
2400 //				RSB_INFO("%d x %d (%d) @ %d, %d (rcsr)\n",submatrix->nr,submatrix->nc,submatrix->nnz,submatrix->roff,submatrix->coff);
2401 				if(dnnz!=submatrix->nnz)
2402 				{
2403 					RSB_ERROR("@%d,%d: found %d, should have found %d\n",
2404 							submatrix->roff, submatrix->coff, dnnz,submatrix->nnz);
2405 					RSB_DO_ERROR_CUMULATE(errval,RSB_ERR_INTERNAL_ERROR);
2406 				}
2407 				RSB_C2R_ASSERT(IL); RSB_C2R_ASSERT(IR);
2408 
2409 				oll = IR[0]-IL[0];
2410 				submatrix->bpntr[0] = 0;
2411 				for(i=1;RSB_LIKELY(i<submatrix->nr);++i)
2412 				{
2413 					nll = IR[i]-IL[i];
2414 					submatrix->bpntr[i] = submatrix->bpntr[i-1]+oll;
2415 					oll = nll;
2416 				}
2417 				submatrix->bpntr[submatrix->nr] = submatrix->bpntr[submatrix->nr-1]+oll;
2418 				if(submatrix->bpntr[submatrix->nr]!=submatrix->nnz)
2419 				{
2420 					RSB_ERROR("@%d,%d: found %d, should have found %d\n",
2421 					submatrix->roff,submatrix->coff,submatrix->bpntr[submatrix->nr],submatrix->nnz);
2422 					RSB_DO_ERROR_CUMULATE(errval,RSB_ERR_INTERNAL_ERROR);
2423 				}
2424 			}
2425 			else
2426 			if(!RSB_DO_TOOFEWNNZFORCSR(submatrix->nnz,submatrix->nr) && !rsb__is_coo_matrix(submatrix))
2427 			{
2428 				if(RSB_C2R_IF_VERBOSE)
2429 				RSB_INFO("CSR:%d/%d:%d..%d\n",smi,cmc,submatrix->nzoff,submatrix->nzoff+submatrix->nnz);
2430 				oll = 0;
2431 				submatrix->bpntr = IA+submatrix->nzoff;
2432 				submatrix->bpntr[0] = 0;
2433 //				RSB_INFO("%d x %d (%d) @ %d, %d (rcsr)\n",submatrix->nr,submatrix->nc,submatrix->nnz,submatrix->roff,submatrix->coff);
2434 				for(i=submatrix->roff;RSB_LIKELY(i<submatrix->roff+submatrix->nr);++i)
2435 				{
2436 					//rsb_nnz_idx_t fel;
2437 					// offset of line i in the global line pointers array
2438 					rsb_nnz_idx_t nnz0 = IB[i];
2439 					// nnz1..nnz0 are the boundaries of line i
2440 					rsb_nnz_idx_t nnz1 = IB[i+1];
2441 					// check
2442 					RSB_C2R_ASSERT(nnz0>=IB[i]);
2443 					RSB_C2R_ASSERT(nnz1<=IB[i+1]);
2444 					// skip line if empty
2445 					if(nnz1-nnz0<1)goto is_empty_subrow;
2446 					// find first element of line i also in the submatrix
2447 					nnz0 += rsb__nnz_split_coo_bsearch(JA+nnz0,submatrix->coff,nnz1-nnz0);
2448 					// skip line if empty in the submatrix
2449 					if(nnz1-nnz0<1)goto is_empty_subrow;
2450 					// find the length of the subrow i in the submatrix
2451 					nnz1 = nnz0+rsb__nnz_split_coo_bsearch(JA+nnz0,submatrix->coff+submatrix->nc,nnz1-nnz0);
2452 					//check
2453 					RSB_C2R_ASSERT(JA[nnz0+0]>=submatrix->coff);
2454 					// skip line if empty in the submatrix
2455 					if(nnz1-nnz0<1)goto is_empty_subrow;
2456 					// nnz1 .. nnz0 contain nonempty subrow i in the submatrix
2457 //					RSB_INFO("i:%d, %d..%d -> %d\n",i,nnz0,nnz1-1,submatrix->nzoff+dnnz);
2458 					// checks
2459 				       	RSB_C2R_ASSERT(nnz1<=IB[i+1]);
2460 					RSB_C2R_ASSERT(JA[nnz0+0]>=submatrix->coff);
2461 					RSB_C2R_ASSERT(JA[nnz1-1]< submatrix->coff+submatrix->nc);
2462 					// convert row indices
2463 					nll = nnz1-nnz0;
2464 					if(i>submatrix->roff)
2465 						submatrix->bpntr[i-submatrix->roff] = submatrix->bpntr[i-submatrix->roff-1]+oll;
2466 					oll = nll;
2467 					// perform the copy
2468 					RSB_COA_MEMCPY_SMALL(WA,JA,submatrix->nzoff+dnnz,nnz0,nnz1-nnz0);
2469 					// update the actual offset in the destination array
2470 					dnnz += nnz1-nnz0;
2471 					continue;
2472 is_empty_subrow:
2473 					// convert row indices
2474 					nll = 0;
2475 					if(RSB_LIKELY(i>submatrix->roff))
2476 						submatrix->bpntr[i-submatrix->roff] = submatrix->bpntr[i-submatrix->roff-1]+oll;
2477 					oll = nll;
2478 				}
2479 				submatrix->bpntr[submatrix->nr] = submatrix->bpntr[submatrix->nr-1]+oll;
2480 				if(dnnz!=submatrix->nnz || submatrix->bpntr[submatrix->nr]!=submatrix->nnz)
2481 				{
2482 					RSB_ERROR("@%d,%d: found %d, and %d; should have found %d\n",
2483 							submatrix->roff, submatrix->coff,
2484 							dnnz, submatrix->bpntr[submatrix->nr],submatrix->nnz);
2485 					RSB_DO_ERROR_CUMULATE(errval,RSB_ERR_INTERNAL_ERROR);
2486 				}
2487 			}
2488 			else
2489 			{
2490 				if(RSB_C2R_IF_VERBOSE)
2491 				RSB_INFO("COO:%d/%d:%d..%d\n",smi,cmc,submatrix->nzoff,submatrix->nzoff+submatrix->nnz);
2492 				oll = 0;
2493 				submatrix->bpntr = IA+submatrix->nzoff;
2494 				submatrix->bpntr[0] = 0;
2495 				for(i=submatrix->roff;RSB_LIKELY(i<submatrix->roff+submatrix->nr);++i)
2496 				{
2497 					//rsb_nnz_idx_t fel;
2498 					// offset of line i in the global line pointers array
2499 					rsb_nnz_idx_t nnz0 = IB[i];
2500 					// nnz1..nnz0 are the boundaries of line i
2501 					rsb_nnz_idx_t nnz1 = IB[i+1];
2502 					// check
2503 					RSB_C2R_ASSERT(nnz0>=IB[i]);
2504 					RSB_C2R_ASSERT(nnz1<=IB[i+1]);
2505 					// skip line if empty
2506 					if(nnz1-nnz0<1)continue;
2507 					// find first element of line i also in the submatrix
2508 					nnz0 += rsb__nnz_split_coo_bsearch(JA+nnz0,submatrix->coff,nnz1-nnz0);
2509 					// skip line if empty in the submatrix
2510 					if(nnz1-nnz0<1)continue;
2511 					// find the length of the subrow i in the submatrix
2512 					nnz1 = nnz0+rsb__nnz_split_coo_bsearch(JA+nnz0,submatrix->coff+submatrix->nc,nnz1-nnz0);
2513 					//check
2514 					RSB_C2R_ASSERT(JA[nnz0+0]>=submatrix->coff);
2515 					// skip line if empty in the submatrix
2516 					if(nnz1-nnz0<1)continue;
2517 					// nnz1 .. nnz0 contain nonempty subrow i in the submatrix
2518 //					RSB_INFO("i:%d, %d..%d -> %d\n",i,nnz0,nnz1-1,submatrix->nzoff+dnnz);
2519 					// checks
2520 				       	RSB_C2R_ASSERT(nnz1<=IB[i+1]);
2521 					RSB_C2R_ASSERT(JA[nnz0+0]>=submatrix->coff);
2522 					RSB_C2R_ASSERT(JA[nnz1-1]< submatrix->coff+submatrix->nc);
2523 					// convert row indices
2524 					// perform the copy
2525 					RSB_COA_MEMCPY_SMALL(WA,JA,submatrix->nzoff+dnnz,nnz0,nnz1-nnz0);
2526 					rsb__util_coo_array_set(IA+submatrix->nzoff+dnnz,nnz1-nnz0,i-submatrix->roff);
2527 					// update the actual offset in the destination array
2528 					dnnz += nnz1-nnz0;
2529 				}
2530 				if(dnnz!=submatrix->nnz )
2531 				{
2532 					RSB_ERROR("@%d,%d: found %d; should have found %d\n",
2533 							submatrix->roff, submatrix->coff, dnnz, submatrix->nnz);
2534 					RSB_DO_ERROR_CUMULATE(errval,RSB_ERR_INTERNAL_ERROR);
2535 				}
2536 			}
2537 			rsb__util_coo_array_sub(WA+submatrix->nzoff,dnnz,submatrix->coff);
2538 			#pragma omp critical (rsb_coo2rsb_nzinc_crs)
2539 			{tdnnz += dnnz;}
2540 		}
2541 	}
2542 	#pragma omp barrier
2543 #if RSB_WANT_VERBOSE_TIMINGS
2544 	pmt -= rsb_time();
2545 #endif /* RSB_WANT_VERBOSE_TIMINGS */
2546 #if 1
2547 	RSB_COA_MEMCPY_parallel(JA,WA,0,0,nnz);
2548 #else
2549 	RSB_COA_MEMCPY(JA,WA,0,0,nnz);
2550 #endif
2551 
2552 #if RSB_WANT_VERBOSE_TIMINGS
2553 	pmt += rsb_time();
2554 #endif /* RSB_WANT_VERBOSE_TIMINGS */
2555 
2556 	if(RSB_SOME_ERROR(errval))
2557 	{
2558 		RSB_PERR_GOTO(err,RSB_ERRM_NL);
2559 	}
2560 
2561 	if(tdnnz!=nnz)
2562 	{
2563 		RSB_ERROR("found %d, should have found %d\n", tdnnz,nnz);
2564 		errval = RSB_ERR_INTERNAL_ERROR;
2565 	       	goto err;
2566 	}
2567 err:
2568 	RSB_DO_ERR_RETURN(errval)
2569 }
2570 
rsb__allocate_recursive_sparse_matrix_from_row_major_coo(void * VA,rsb_coo_idx_t * IA,rsb_coo_idx_t * JA,rsb_coo_idx_t m,rsb_coo_idx_t k,rsb_nnz_idx_t nnz,rsb_type_t typecode,const struct rsb_mtx_partitioning_info_t * pinfop,rsb_flags_t flags,rsb_err_t * errvalp)2571 struct rsb_mtx_t * rsb__allocate_recursive_sparse_matrix_from_row_major_coo(void *VA, rsb_coo_idx_t * IA, rsb_coo_idx_t * JA, rsb_coo_idx_t m, rsb_coo_idx_t k, rsb_nnz_idx_t nnz, rsb_type_t typecode, const struct rsb_mtx_partitioning_info_t * pinfop, rsb_flags_t flags, rsb_err_t *errvalp)
2572 {
2573 	/**
2574 		\ingroup gr_experimental
2575 
2576 		Once finished, should assembly a complete R-CSR/R-CSC matrix in-place in the provided COO arrays.
2577 		Should use no more than ??? temporary memory.
2578 
2579 		TODO: get rid of pinfop.
2580 		TODO: interleave the matrix structs into the data arrays.
2581 		TODO: missing proper error handling.
2582 		TODO: guaranteed preallocation needed
2583 		TODO: may continue the line of rsb_allocate_new__ and plug it here.
2584 	 */
2585 	rsb_err_t errval = RSB_ERR_NO_ERROR;
2586 	struct rsb_mtx_t * submatrices = NULL;
2587 	struct rsb_mtx_t ** submatricesp = NULL;
2588 	struct rsb_mtx_t * mtxAp = NULL;
2589 	long fcs = rsb__get_first_level_c_size();
2590 	long lcs = rsb__get_lastlevel_c_size();
2591 	long lcspt = rsb__get_lastlevel_c_size_per_thread();
2592 	long cbs = rsb__get_cache_block_byte_size();
2593 #if RSB_WANT_SUBDIVISION_FIXES_20101213
2594 	long wcbs = cbs;
2595 #else /* RSB_WANT_SUBDIVISION_FIXES_20101213 */
2596 	long wcbs = lcspt;
2597 #endif /* RSB_WANT_SUBDIVISION_FIXES_20101213 */
2598 #if RSB_WANT_VERBOSE_TIMINGS
2599 	rsb_time_t pmt = RSB_TIME_ZERO;
2600 #endif /* RSB_WANT_VERBOSE_TIMINGS */
2601 	rsb_coo_idx_t * IB = NULL;
2602 	rsb_coo_idx_t * IT = NULL;
2603 	rsb_coo_idx_t * IX = NULL;
2604 	rsb_coo_idx_t * WA = NULL;
2605 	rsb_submatrix_idx_t tmc = 0, smi = 0; /* max matrix count, done matrix count, submatrix index */
2606 	rsb_submatrix_idx_t cmc = 0, omc = 0; /* closed matrices count, open matrices count */
2607 	rsb_submatrix_idx_t lm = 0;           /* leaf matrices */
2608 	rsb_time_t dt = RSB_TIME_ZERO,eit = RSB_TIME_ZERO,est = RSB_TIME_ZERO,ect = RSB_TIME_ZERO,tat = RSB_TIME_ZERO,sat = RSB_TIME_ZERO;
2609 	const rsb_thread_t wet = rsb_get_num_threads(); /* want executing threads */
2610 	size_t el_size = RSB_SIZEOF(typecode);
2611 	rsb_coo_idx_t roff = 0;
2612 	rsb_coo_idx_t coff = 0;
2613 
2614 	tat = - rsb_time();
2615 #if RSB_ALLOW_EMPTY_MATRICES
2616 	if(!RSB_HAVE_GOOD_PARMS_FOR_EMPTY(m,k,nnz,flags))
2617 #endif /* RSB_ALLOW_EMPTY_MATRICES */
2618 	if(!RSB_HAVE_GOOD_PARMS_FOR_IN_PLACE_RCSR(m,k,nnz,flags))
2619 	{
2620 		RSB_ERROR(RSB_ERRM_MDNFARTS);
2621 	       	errval = RSB_ERR_BADARGS;
2622 	       	goto err;
2623 	}
2624 
2625 	if( fcs > lcs || fcs<1 || cbs<1 ) /* we allow declaration of 1 level of cache only */
2626 	{
2627 		/* TODO : find a reasonable solution, and declare it in ./rsbench -I, which should give some diagnostic about this */
2628 		RSB_ERROR("innermost cache size:%d, outermost cache size:%d, cache block size %d\n",fcs,lcs,cbs);
2629 	       	errval = RSB_ERR_FAILED_MEMHIER_DETECTION;
2630 	       	goto err;
2631 	}
2632 
2633 	if(RSB_DO_FLAG_HAS(flags,RSB_FLAG_WANT_COLUMN_MAJOR_ORDER))
2634 	{
2635 		RSB_ERROR(RSB_ERRM_CMOINIY);
2636 	       	errval = RSB_ERR_UNIMPLEMENTED_YET;
2637 	       	goto err;
2638 	}
2639 
2640 #if RSB_WANT_DEBUG_VERBOSE_INTERFACE_NOTICE
2641 	if((rsb_global_session_handle.rsb_g_verbose_interface&2))
2642 			RSB_STDOUT("building a matrix with %d nnz, %d x %d\n",nnz,m,k);
2643 #endif /* RSB_WANT_DEBUG_VERBOSE_INTERFACE_NOTICE */
2644 
2645 	if(RSB_DO_FLAGS_EXTRACT_STORAGE(flags)==RSB_FLAG_NOFLAGS)
2646 	{
2647 		RSB_DO_FLAG_ADD(flags,RSB_FLAG_DEFAULT_STORAGE_FLAGS);
2648 	}
2649 
2650 	/* TODO: plug *here* the eventually to-come RSB_WANT_FASTER_EXPERIMENTAL_CONSTRUCTOR stuff */
2651 
2652 	tmc = RSB_SUBDIVISION_BUG_EXTRA+2*(((nnz+wcbs)*(RSB_SIZEOF(typecode)+2*sizeof(rsb_coo_idx_t)))/(wcbs)); /* TODO: clean this up */
2653 	tmc = RSB_MAX(1,(rsb_submatrix_idx_t)(rsb_global_session_handle.subdivision_multiplier*tmc));
2654 
2655 	if(RSB_DO_FLAG_HAS(flags,RSB_FLAG_RECURSIVE_MORE_LEAVES_THAN_THREADS))
2656 	if(wet>tmc)
2657 	{
2658 		tmc = wet;
2659 	}
2660 
2661 	submatrices = rsb__calloc(sizeof(struct rsb_mtx_t )*tmc);
2662 	submatricesp = rsb__calloc(sizeof(struct rsb_mtx_t*)*tmc);
2663 
2664 	if(!submatrices || !submatricesp)
2665 	{
2666 	       	errval = RSB_ERR_ENOMEM;
2667 		RSB_PERR_GOTO(err,RSB_ERRM_ES);
2668 	}
2669 
2670 	for(smi=0;smi<tmc;++smi)
2671 		submatricesp[smi] = submatrices + smi;
2672 
2673 	if(RSB_DO_FLAG_HAS(flags,RSB_FLAG_FORTRAN_INDICES_INTERFACE)) /* TODO: this is *slow*, speed this up */
2674 		rsb__util_coo_array_from_fortran_indices(IA,nnz,RSB_BOOL_TRUE),
2675 		rsb__util_coo_array_from_fortran_indices(JA,nnz,RSB_BOOL_TRUE),
2676 		RSB_DO_FLAG_DEL(flags,RSB_FLAG_FORTRAN_INDICES_INTERFACE);
2677 
2678 	dt = - rsb_time();
2679 	if((errval = rsb__do_cleanup_nnz(VA,IA,JA,nnz,roff,coff,m,k,&nnz,typecode,flags))!=RSB_ERR_NO_ERROR)
2680 		goto err;
2681 
2682 	ect = dt;
2683 	dt = - rsb_time();
2684 	ect -= dt;
2685 
2686 	if(!RSB_DO_FLAG_HAS(flags,RSB_FLAG_SORTED_INPUT))
2687 	if((errval = rsb__util_sort_row_major_inner(VA,IA,JA,nnz,m,k,typecode,flags))!=RSB_ERR_NO_ERROR)
2688 	{
2689 		RSB_ERROR(RSB_ERRM_ES);
2690 		goto err;
2691 	}
2692 	RSB_DO_FLAG_ADD(flags,RSB_FLAG_SORTED_INPUT); /* TODO: is this needed ? */
2693 
2694 	/* we need duplicates removal, and this can only take place after sorting */
2695 #if RSB_WANT_VERBOSE_TIMINGS
2696 	{rsb_nnz_idx_t dnnz = nnz - rsb_weed_out_duplicated(IA,JA,VA,nnz,typecode,flags);
2697 	RSB_INFO("duplicate removal: %zd - %zd = %zd\n",nnz,dnnz,nnz-dnnz);
2698 	nnz -= dnnz;}
2699 #else /* RSB_WANT_VERBOSE_TIMINGS */
2700 	nnz = rsb_weed_out_duplicates(IA,JA,VA,nnz,typecode,flags);
2701 #endif /* RSB_WANT_VERBOSE_TIMINGS */
2702 	dt += rsb_time();
2703 
2704 	est = dt;	/* with 'sorting' (est) we DO NOT intend also cleanup (in ect) */
2705 
2706 	/* work vectors allocation */
2707 /*	IL = rsb__malloc(sizeof(rsb_coo_idx_t)*(m+1)); */
2708 	IT = rsb__malloc(sizeof(rsb_coo_idx_t)*(m+1));
2709 	IX = rsb__malloc(sizeof(rsb_coo_idx_t)*2*(m+1));
2710 	IB = rsb__malloc(sizeof(rsb_coo_idx_t)*(m+1));
2711 	if(/*  !IL ||*/ !IT || !IX || !IB)
2712 	{
2713 		RSB_ERROR(RSB_ERRM_ES);
2714 	       	errval = RSB_ERR_ENOMEM; goto err;
2715 	}
2716 
2717 	/* declaring this first matrix (smi == 0) as 'open' */
2718 	smi = 0; omc = 1;
2719 	/* compile in the first mtxAp, linking into to the temporary split vector */
2720 	submatrices[smi].nzoff = 0;
2721 	submatrices[smi].roff = roff;
2722 	submatrices[smi].coff = coff;
2723 	submatrices[smi].bindx = IB;
2724 	submatrices[smi].bpntr = IB+1;
2725 	submatrices[smi].indptr = NULL;
2726 /*	RSB_DO_FLAG_ADD(flags,RSB_FLAG_QUAD_PARTITIONING); */
2727 	RSB_DO_FLAG_ADD(flags,RSB_FLAG_ASSEMBLED_IN_COO_ARRAYS);
2728 	if( (errval = rsb__set_init_flags_and_stuff(
2729 		&submatrices[smi],NULL,NULL,m,k,nnz,nnz,nnz,typecode,flags)
2730 				)!=RSB_ERR_NO_ERROR)
2731 	{
2732 		RSB_ERROR(RSB_ERRM_ES);
2733 		goto err;
2734 	}
2735 
2736 	if(nnz==0)
2737 	{
2738 		/* a special case. we copy the arrays addresses because they may be non-NULL and containing duplicate/diagonal/etc.. elements we have honoured to free, afterwards. */
2739 		++cmc; --omc;
2740 		mtxAp = &submatrices[0];
2741 		RSB_DO_FLAG_DEL(mtxAp->flags,RSB_FLAG_QUAD_PARTITIONING); /* necessary, too */
2742 		mtxAp->bpntr = IA;
2743 		mtxAp->bindx = JA;
2744 		mtxAp->indptr = NULL;
2745 		mtxAp->VA = VA;
2746 		goto arrays_done;
2747 	}
2748 	else
2749 		mtxAp = &submatrices[0];
2750 
2751 	sat = - rsb_time();
2752 	/* computing the first right-left pointer vectors */
2753 #if RSB_WANT_PARALLEL_SUBDIVISION
2754 	if((errval = rsb_do_compute_vertical_split_parallel(IA,JA,roff,coff,m,k,0,0,nnz,IB,NULL,NULL,NULL,NULL,NULL,NULL))!=RSB_ERR_NO_ERROR)
2755 #else /* RSB_WANT_PARALLEL_SUBDIVISION */
2756 	if((errval = rsb_do_compute_vertical_split_parallel(IA,JA,roff,coff,m,k,0,0,nnz,IB,NULL,NULL,NULL,NULL,NULL,NULL))!=RSB_ERR_NO_ERROR)
2757 	/*if((errval = rsb_do_compute_vertical_split(IA,JA,roff,coff,m,k,0,0,nnz,IB,NULL,NULL,NULL,NULL,NULL,NULL))!=RSB_ERR_NO_ERROR) */
2758 #endif /* RSB_WANT_PARALLEL_SUBDIVISION */
2759 	{
2760 		RSB_ERROR(RSB_ERRM_ES);
2761 		goto err;
2762 	}
2763 
2764 	if(RSB_C2R_IF_VERBOSE)
2765 		RSB_INFO("beginning (%d x %d) @ %p with flags 0x%x (coo:%d, csr:%d), storage: 0x%x, max %d submatrices\n",
2766 			submatrices[smi].nr, submatrices[smi].nc, (const void*)&submatrices[smi], submatrices[smi].flags,
2767 			RSB_DO_FLAG_HAS(submatrices[smi].flags,RSB_FLAG_WANT_COO_STORAGE),
2768 			RSB_DO_FLAG_HAS(submatrices[smi].flags,RSB_FLAG_WANT_BCSS_STORAGE),
2769 			submatrices[smi].matrix_storage,tmc
2770 			);
2771 
2772 /*	if(!RSB_WANT_MORE_PARALLELISM || (RSB_DO_FLAG_HAS(mtxAp->flags,RSB_FLAG_QUAD_PARTITIONING))) */ /* TODO */
2773 #if 1 /* the code is not yet ready for this */
2774 /* #if RSB_WANT_OMP_RECURSIVE_KERNELS */
2775 	errval = rsb_do_coo2rec_subdivide_parallel(VA,IA,JA,m,k,nnz,typecode,pinfop,flags,errvalp,submatricesp,mtxAp,IB,IX,IT,WA,cmc,omc,tmc,RSB_MAX(1,RSB_MIN(wet,nnz)),&cmc);
2776 #else
2777 	errval = rsb_do_coo2rec_subdivide(VA,IA,JA,m,k,nnz,typecode,pinfop,flags,errvalp,submatricesp,mtxAp,IB,IX,IT,WA,cmc,omc,tmc,wet,&cmc);
2778 #endif
2779 	RSB_CONDITIONAL_FREE(IX);
2780 	if(RSB_SOME_ERROR(errval))
2781 		goto err;
2782 
2783 	eit = - rsb_time();
2784 	sat -= eit;
2785 
2786 	/*
2787 	RSB_CONDITIONAL_FREE(IL);
2788 	RSB_CONDITIONAL_FREE(IT);
2789        	*/
2790 	/* WA will is needed for shuffle only (so, after IL,IM deallocation, in a way that total memory need is max(WA,IL)) */
2791 	WA = rsb__malloc(RSB_MAX(sizeof(rsb_coo_idx_t),el_size)*nnz);
2792 	if(!WA)
2793 	{
2794 		RSB_ERROR(RSB_ERRM_FAOTAFS);
2795 	       	errval = RSB_ERR_ENOMEM; goto err;
2796 	}
2797 
2798 #if 0
2799 	/* after symbolic partitioning is done, we are ready to shuffle all of the arrays using the temporary storage and add an intermediate node */
2800 	RSB_INFO("assembling leaf %d -> %d x %d, %d\n",smi,submatricesp[smi]->nr,submatricesp[smi]->nc,submatricesp[smi]->nnz);
2801 	RSB_DO_ERROR_CUMULATE(errval,rsb_do_shuffle_left_and_right_rows(VA,IA,JA,m,0,nnz,0,typecode,IL,IM,WA));
2802 	RSB_DO_ERROR_CUMULATE(errval,rsb_do_shuffle_left_and_right_rows(VA,IA,JA,(m+1)/2,0,IL[(m+1)/2],0,typecode,IL,IR,WA));
2803 	RSB_DO_ERROR_CUMULATE(errval,rsb_do_shuffle_left_and_right_rows(VA,IA,JA,m,(m+1)/2,nnz,IL[(m+1)/2],typecode,IL,IR,WA));
2804 	/* TODO : should use a temporary vector, here. */
2805 #endif
2806 
2807 	for(smi=0;smi<cmc;++smi)
2808 		if(rsb__is_terminal_recursive_matrix(submatricesp[smi]))
2809 			++lm;
2810 
2811 /*	qsort(submatricesp+(cmc-lm),(size_t)(lm),sizeof(struct rsb_mtx_t*),&rsb_compar_rcsr_matrix_leftmost_first); */
2812 	qsort(submatricesp,(size_t)(cmc),sizeof(struct rsb_mtx_t*),&rsb_compar_rcsr_matrix_leftmost_first);
2813 	/* TODO: a priority queue would do the job, here */
2814 	for(smi=0;smi<cmc-lm;++smi)
2815 		if(rsb__is_terminal_recursive_matrix(submatricesp[smi]))
2816 		{
2817 			errval = RSB_ERR_INTERNAL_ERROR;
2818 			RSB_PERR_GOTO(err,RSB_ERRM_ANLSMIT);
2819 		}
2820 	for(smi=cmc-lm;smi<cmc;++smi)
2821 		if(!rsb__is_terminal_recursive_matrix(submatricesp[smi]))
2822 		{
2823 			errval = RSB_ERR_INTERNAL_ERROR;
2824 			RSB_PERR_GOTO(err,RSB_ERRM_ALSMINT);
2825 		}
2826 
2827 	errval = rsb_do_coo2rec_shuffle(VA,IA,JA,m,k,nnz,typecode,pinfop,flags,errvalp,submatricesp,mtxAp,IB,WA,cmc);
2828 	if(RSB_SOME_ERROR(errval))
2829 	{
2830 		RSB_PERR_GOTO(err,RSB_ERRM_SEOWS);
2831 	}
2832 
2833 	rsb__do_set_in_place_submatrices_offsets(submatrices,cmc,VA,IA,JA,el_size);
2834 
2835 /*	RSB_INFO("VA:%p, IA:%p, JA:%p\n",VA,IA,JA); */
2836 
2837 	if(RSB_C2R_IF_VERBOSE)
2838 		RSB_INFO("IA? :%p / %p\n",(const void*)IA,
2839 			(const void*)(rsb__do_get_first_submatrix(mtxAp)->bpntr-
2840 			(rsb__do_get_first_submatrix(mtxAp)->nr+1))
2841 /*			rsb__do_get_first_submatrix(mtxAp)->roff-
2842   			((submatricesp[0])->nr+1) */
2843 			);
2844 
2845 	/* after shuffling, the last vectors conversion happens and we are done. */
2846 arrays_done:
2847 	eit += rsb_time();
2848 
2849 	/* first matrix is always root (even if a CSR one) */
2850 	RSB_DO_FLAG_DEL(submatrices[0].flags,RSB_FLAG_NON_ROOT_MATRIX);
2851 	#if RSB_EXPERIMENTAL_SHOULD_TRAVERSE_RECURSIVE_MATRICES_AS_BLOCKS
2852 	if(!(submatrices[0].flags & RSB_FLAG_NON_ROOT_MATRIX))
2853 	{
2854 		submatrices[0].all_leaf_matrices = NULL;
2855 		errval = rsb__get_array_of_leaf_matrices(&submatrices[0],&(submatrices[0].all_leaf_matrices),&submatrices[0].all_leaf_matrices_n);
2856 		if(RSB_SOME_ERROR(errval))
2857 			goto err;
2858 	}
2859 	else
2860 	{
2861 		/* this is a non root matrix */
2862 		submatrices[0].all_leaf_matrices = NULL;
2863 		submatrices[0].all_leaf_matrices_n = 0;
2864 	}
2865 	#endif /* RSB_EXPERIMENTAL_SHOULD_TRAVERSE_RECURSIVE_MATRICES_AS_BLOCKS */
2866 
2867 #if RSB_WANT_VERBOSE_TIMINGS
2868 	rsb_time_t hct = - rsb_time();
2869 #endif /* RSB_WANT_VERBOSE_TIMINGS */
2870 	if(
2871 		/* RSB_DO_FLAG_HAS(flags,RSB_FLAG_USE_HALFWORD_INDICES_COO)
2872 		       ||	RSB_DO_FLAG_HAS(flags,RSB_FLAG_USE_HALFWORD_INDICES_CSR)
2873 		       ||	RSB_DO_FLAG_HAS(flags,RSB_FLAG_WANT_COO_STORAGE)*/
2874 		       	RSB_DO_FLAG_HAS(flags,RSB_FLAG_USE_HALFWORD_INDICES)
2875 		)
2876 #if RSB_WANT_MORE_PARALLELISM
2877 		RSB_DO_ERROR_CUMULATE(errval,rsb_do_switch_fresh_recursive_matrix_to_halfword_storages_parallel(mtxAp));
2878 #else /* RSB_WANT_MORE_PARALLELISM */
2879 		RSB_DO_ERROR_CUMULATE(errval,rsb_do_switch_fresh_recursive_matrix_to_halfword_storages(mtxAp));
2880 #endif /* RSB_WANT_MORE_PARALLELISM */
2881 	else
2882 	{
2883 		if(RSB_C2R_IF_VERBOSE)
2884 			RSB_INFO("no  RSB_FLAG_USE_HALFWORD_INDICES flag\n");
2885 	}
2886 	RSB_DO_ERROR_CUMULATE(errval,rsb_do_compute_bounded_boxes(mtxAp));
2887 
2888 #if RSB_WANT_VERBOSE_TIMINGS
2889 	hct += rsb_time();
2890 #endif /* RSB_WANT_VERBOSE_TIMINGS */
2891 
2892 	if(RSB_SOME_ERROR(errval))
2893 	{
2894 		RSB_PERR_GOTO(err,RSB_ERRM_ES);
2895 	}
2896 	tat += rsb_time();
2897 
2898 	mtxAp->sat = sat;
2899 	mtxAp->ect = ect;
2900 	mtxAp->eit = eit;
2901 	mtxAp->est = est;
2902 	mtxAp->pet = RSB_TIME_ZERO;
2903 	mtxAp->rpt = RSB_TIME_ZERO;
2904 	mtxAp->tat = tat;
2905 
2906 	/* some statistics */
2907 #if RSB_WANT_VERBOSE_TIMINGS
2908 	RSB_INFO("analyzed arrays in %g s\n",sat);
2909 	RSB_INFO("cleaned-up arrays in %g s\n",ect);
2910 	RSB_INFO("sorted arrays in %g s\n",est);
2911 	RSB_INFO("computed partitions in %g s\n",mtxAp->cpt);
2912 	RSB_INFO("shuffled partitions in %g s\n",eit);
2913 	RSB_INFO(" (of which %g s for parallel memcpy and %g for halfword conversion)\n",pmt,hct);
2914 #endif /* RSB_WANT_VERBOSE_TIMINGS */
2915 	mtxAp->cpt = 0;/* cpt is contained in sat, so should not be counted here! */
2916 #if RSB_STORE_IDXSA
2917 	mtxAp->idxsa = rsb__get_index_storage_amount(mtxAp);
2918 #endif
2919 
2920 #if 0
2921   	RSB_INFO("got %d matrices (%d leaves)\n",cmc,lm);
2922 
2923   	if( !rsb__mtx_chk(mtxAp) )
2924   	{
2925   		errval = RSB_ERR_INTERNAL_ERROR;
2926   		RSB_PERR_GOTO(derr,RSB_ERRM_NL);
2927   	}
2928 
2929 	errval = rsb__do_switch_recursive_matrix_to_fullword_storage(mtxAp);
2930 
2931 	errval = rsb__do_switch_recursive_in_place_matrix_to_in_place_rcoo(mtxAp);
2932 ok:
2933 #endif
2934 	goto noerr;
2935 
2936 	/* TODO: missing proper error handling here ! --- destroy any allocated data for the matrix (e.g.: submatrices, .. ) */
2937 	if(mtxAp)
2938 		rsb__do_mtx_free(mtxAp);
2939 err:
2940 	mtxAp = NULL;
2941 noerr:
2942 	if(RSB_SOME_ERROR(errval))
2943 		rsb__do_perror(NULL,errval);
2944 	RSB_CONDITIONAL_FREE(IB);
2945 	RSB_CONDITIONAL_FREE(IT);
2946 	RSB_CONDITIONAL_FREE(IX);
2947 /*	RSB_CONDITIONAL_FREE(IL); */
2948 	RSB_CONDITIONAL_FREE(WA);
2949 	RSB_CONDITIONAL_FREE(submatricesp);
2950 	if(!mtxAp)RSB_CONDITIONAL_FREE(submatrices);
2951 	RSB_CONDITIONAL_ERRPSET(errvalp,errval);
2952 #if RSB_WANT_DEBUG_VERBOSE_INTERFACE_NOTICE
2953 	if(mtxAp && (rsb_global_session_handle.rsb_g_verbose_interface&2))
2954 			RSB_STDOUT_MATRIX_SUMMARY(mtxAp),RSB_STDOUT("\n");
2955 #endif /* RSB_WANT_DEBUG_VERBOSE_INTERFACE_NOTICE */
2956 	return mtxAp;
2957 }
2958 
rsb__init_coo_struct_from_rsb(const struct rsb_mtx_t * mtxAp,struct rsb_coo_matrix_t * coop)2959 rsb_err_t rsb__init_coo_struct_from_rsb(const struct rsb_mtx_t *mtxAp, struct rsb_coo_matrix_t *coop)
2960 {
2961 	/* FIXME: unfinished; shall replace by direct use of RSB_INIT_COO_FROM_MTX, or the opposite */
2962 	if(!mtxAp || !coop)
2963 	{
2964 		return RSB_ERR_BADARGS;
2965 	}
2966 
2967 	RSB_INIT_COO_FROM_MTX(coop,mtxAp);
2968 	return RSB_ERR_NO_ERROR;
2969 }
2970 
rsb__project_rsb_to_coo(struct rsb_mtx_t * mtxAp,struct rsb_coo_matrix_t * coop)2971 rsb_err_t rsb__project_rsb_to_coo(struct rsb_mtx_t *mtxAp, struct rsb_coo_matrix_t *coop)
2972 {
2973 	rsb_err_t errval = RSB_ERR_NO_ERROR;
2974 	if(!mtxAp || !coop)
2975 	{
2976 		errval = RSB_ERR_BADARGS;
2977 		RSB_PERR_GOTO(err,RSB_ERRM_ES);
2978 	}
2979 	errval = rsb__init_coo_struct_from_rsb(mtxAp,coop);
2980 	if(RSB_SOME_ERROR(errval))
2981 	{
2982 		RSB_PERR_GOTO(err,RSB_ERRM_ES);
2983        	}
2984 
2985 	RSB_BIND_COO_TO_MTX(coop,mtxAp);
2986 err:
2987 	return errval;
2988 }
2989 
2990 /* @endcond */
2991