1 /*
2
3 Copyright (C) 2008-2021 Michele Martone
4
5 This file is part of librsb.
6
7 librsb is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published
9 by the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
11
12 librsb is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15 License for more details.
16
17 You should have received a copy of the GNU Lesser General Public
18 License along with librsb; see the file COPYING.
19 If not, see <http://www.gnu.org/licenses/>.
20
21 */
22 /* @cond INNERDOC */
23 /*!
24 * @file
25 * @brief Recursive Sparse matrices assembling code.
26 * @author Michele Martone
27 * */
28 /*
29 * TODO: improve this code, because there is a number of unclean practices which could break a build.
30 * */
31 #include "rsb_common.h"
32
33 #ifndef RSB_C2R_ASSERT
34 //#define RSB_C2R_ASSERT(e) assert(e) // uncomment this to use asserts
35 #define RSB_C2R_ASSERT(e) // uncomment this to avoid asserts
36 #else /* RSB_C2R_ASSERT */
37 #undef RSB_C2R_ASSERT
38 #define RSB_C2R_ASSERT(e)
39 #endif /* RSB_C2R_ASSERT */
40
41 #define RSB_DO_ENOUGHNNZFORINDEXBASEDBUILD(M) (!RSB_DO_TOOFEWNNZFORRCSR((M)->nnz,(M)->nr))
42 #define RSB_C2R_IF_VERBOSE 0 /* activates output which is useful for debugging */
43 #define RSB_C2R_PARANOIA 0
44
45 #define RSB_MEMCPY_SMALL_GENERAL(ID,IS,DOFF,SOFF,NNZ,TYPE) \
46 { \
47 TYPE*dp = ((TYPE*)(ID))+(DOFF),*ld = dp+(NNZ); \
48 const register TYPE*sp = ((TYPE*)(IS))+(SOFF); \
49 for(;dp<ld;++sp,++dp)*dp = *sp; \
50 }
51
52 #define RSB_C2R_WANT_MAYBE_FASTER 0
53
54 #if RSB_C2R_WANT_MAYBE_FASTER
55 #define RSB_COA_MEMCPY_SMALL(ID,IS,DOFF,SOFF,NNZ) RSB_MEMCPY_SMALL_GENERAL(ID,IS,DOFF,SOFF,NNZ,rsb_coo_idx_t)
56 #define RSB_COA_MEMCPY_ROWSZ(ID,IS,DOFF,SOFF,NNZ) RSB_COA_MEMCPY_parallel(ID,IS,DOFF,SOFF,NNZ)
57 //#define RSB_A_MEMCPY_SMALL(ID,IS,DOFF,SOFF,NNZ,ES) RSB_MEMCPY_SMALL_GENERAL(ID,IS,DOFF,SOFF,NNZ,double) /* FIXME */
58 #define RSB_A_MEMCPY_SMALL(ID,IS,DOFF,SOFF,NNZ,ES) RSB_A_MEMCPY(ID,IS,DOFF,SOFF,NNZ,ES)
59 #else /* RSB_C2R_WANT_MAYBE_FASTER */
60 #define RSB_COA_MEMCPY_SMALL(ID,IS,DOFF,SOFF,NNZ) RSB_COA_MEMCPY(ID,IS,DOFF,SOFF,NNZ)
61 #define RSB_A_MEMCPY_SMALL(ID,IS,DOFF,SOFF,NNZ,ES) RSB_A_MEMCPY(ID,IS,DOFF,SOFF,NNZ,ES)
62 #endif /* RSB_C2R_WANT_MAYBE_FASTER */
63
64 #define RSB_TIC(T) (T) = -rsb_time()
65 #define RSB_TOC(T) (T) += rsb_time()
66 #define RSB_TOC_TIC(T,U) {rsb_time_t t = rsb_time();(T) += t;(U) = -t;}
67 #define RSB_TIC_TOC(U,T) {rsb_time_t t = rsb_time();(T) += t;(U) = -t;}
68
69 #define RSB_WANT_BINSEARCH_MIN_NZPR 8 /* FIXME */
70 #define RSB_WANT_VERBOSE_TIMINGS 0
71 #define RSB_WANT_VERBOSE_SUBDIVISION 0 /* */
72 #define RSB_WANT_VERBOSE_SUBDIVISION2 0 /* */
73 #define RSB_WANT_MORE_PARALLELISM 1 /* FIXME: EXPERIMENTAL, BY DEFAULT TURNED OFF (0) */
74 #define RSB_WANT_FIRST_VERSION 0 /* FIXME: EXPERIMENTAL, BY DEFAULT TURNED ON (1) */
75 #define RSB_WANT_LITTLE_IMPROVED 1 /* FIXME: EXPERIMENTAL, BY DEFAULT TURNED OFF (0) */
76 #if RSB_WANT_OMP_RECURSIVE_KERNELS
77 #define RSB_WANT_PARALLEL_SUBDIVISION 1 /* FIXME: EXPERIMENTAL, BY DEFAULT TURNED OFF (0) */
78 #else /* RSB_WANT_OMP_RECURSIVE_KERNELS */
79 #define RSB_WANT_PARALLEL_SUBDIVISION 0 /* FIXME: EXPERIMENTAL, BY DEFAULT TURNED OFF (0) */
80 #endif /* RSB_WANT_OMP_RECURSIVE_KERNELS */
81 #define RSB_WANT_QUADRANT_QUICK_DETECT 0/* FIXME: EXPERIMENTAL, BY DEFAULT TURNED OFF (0) */
82 #define RSB_WANT_SUBDIVISION_FIXES_20101120 1 /* FIXME: EXPERIMENTAL, BY DEFAULT TURNED OFF (0) */
83 #define RSB_WANT_SUBDIVISION_FIXES_20101213 0 /* FIXME: EXPERIMENTAL ) */
84 #define RSB_WANT_FIX_BUG_DISCOVERED_20121210 1 /* this bug prevents from HCSR usage */
85 #if RSB_WANT_VERBOSE_SUBDIVISION2
86 //#define RSB_MTXASM_INFO RSB_INFO /* NEW */
87 #define RSB_MTXASM_INFO printf /* NEW */
#else /* RSB_WANT_VERBOSE_SUBDIVISION2 */
#define RSB_MTXASM_INFO /* NEW */
#endif /* RSB_WANT_VERBOSE_SUBDIVISION2 */
91
92 #define RSB_SUBDIVISION_SKEW_MAX (RSB_FLOAT_ONE/2.0)
93 #define RSB_MAX_QUADRANTS_UNBALANCE (4-1)
94 #define RSB_SUBDIVISION_BUG_EXTRA (4) /* incorrect behaviour is encountered if setting this to 0, as it should (experienced on a 12 core machine). proper bugfix remains unknown to me. */
95
96 #define RSB_WANT_FASTER_EXPERIMENTAL_CONSTRUCTOR 0 /* FIXME: this is experimental code and shall be finished ! */
97 RSB_INTERNALS_COMMON_HEAD_DECLS
98
99 #if RSB_WANT_FASTER_EXPERIMENTAL_CONSTRUCTOR
100 #define RSB_POW2(P) (1<<(P))
101 //#define RSB_MUL2(P) ((P) *= 2)
102 #define RSB_MUL2(P) ((P)<<=1)
103 //#define RSB_HALF(V) (((V)+1)/2)
104 #define RSB_HALF(V) (((V)+1)>>1)
105 #define RSB_POW4(P) (RSB_POW2(P)*RSB_POW2(P))
/* Interleaves the bits of `o' (odd result positions) and `e' (even result
 * positions) into a single Morton/Z-order index: bit b of `e' lands at bit
 * 2b, bit b of `o' at bit 2b+1.
 * Uses the classic "binary magic numbers" bit-spreading technique; the mask
 * width is selected by sizeof(rsb_nnz_idx_t), which the compiler resolves at
 * compile time (dead branches are eliminated).
 * NOTE(review): each branch can only spread inputs of up to half the result
 * width (e.g. the 8-byte branch assumes o,e fit in 32 bits) — TODO confirm
 * that callers' quadrant coordinates respect this. */
static inline rsb_nnz_idx_t rsb_coo_index_bit_interleave(rsb_coo_idx_t o, rsb_coo_idx_t e)
{
	/* FIXME: this is DUPLICATE code !!! */
	rsb_nnz_idx_t i = 0, O = o, E = e;
	RSB_DEBUG_ASSERT(O>=0);
	RSB_DEBUG_ASSERT(E>=0);
	if (sizeof(rsb_nnz_idx_t)==1)
	{
		E = (E | (E << 2)) & 0x33;
		E = (E | (E << 1)) & 0x55;
		O = (O | (O << 2)) & 0x33;
		O = (O | (O << 1)) & 0x55;
	}
	else
	if (sizeof(rsb_nnz_idx_t)==2)
	{
		E = (E | (E << 4)) & 0x0F0F;
		E = (E | (E << 2)) & 0x3333;
		E = (E | (E << 1)) & 0x5555;
		O = (O | (O << 4)) & 0x0F0F;
		O = (O | (O << 2)) & 0x3333;
		O = (O | (O << 1)) & 0x5555;
	}
	else
	if (sizeof(rsb_nnz_idx_t)==4)
	{
		E = (E | (E << 8)) & 0x00FF00FF;
		E = (E | (E << 4)) & 0x0F0F0F0F;
		E = (E | (E << 2)) & 0x33333333;
		E = (E | (E << 1)) & 0x55555555;
		O = (O | (O << 8)) & 0x00FF00FF;
		O = (O | (O << 4)) & 0x0F0F0F0F;
		O = (O | (O << 2)) & 0x33333333;
		O = (O | (O << 1)) & 0x55555555;
	}
	else
	if (sizeof(rsb_nnz_idx_t)==8)
	{
		E = (E | (E <<16)) & 0x0000FFFF0000FFFF;
		E = (E | (E << 8)) & 0x00FF00FF00FF00FF;
		E = (E | (E << 4)) & 0x0F0F0F0F0F0F0F0F;
		E = (E | (E << 2)) & 0x3333333333333333;
		E = (E | (E << 1)) & 0x5555555555555555;
		O = (O | (O <<16)) & 0x0000FFFF0000FFFF;
		O = (O | (O << 8)) & 0x00FF00FF00FF00FF;
		O = (O | (O << 4)) & 0x0F0F0F0F0F0F0F0F;
		O = (O | (O << 2)) & 0x3333333333333333;
		O = (O | (O << 1)) & 0x5555555555555555;
	}
	else
	{
		/* unsupported index width */
		RSB_ERROR(RSB_ERRM_FYRYNS);
		/* FIXME : fatal! */
	}

	/* merge the two spread words; `e' on even bits, `o' on odd bits */
	i = (E | (O << 1));
	RSB_DEBUG_ASSERT((i & ~-1)>=0); /* NOTE(review): ~-1 == 0, so this is vacuous — verify intent */
	return i;
}
165
rsb_assign_subm__(rsb_coo_idx_t * RSB_RESTRICT IA,rsb_coo_idx_t * RSB_RESTRICT JA,rsb_coo_idx_t m,rsb_coo_idx_t k,rsb_nnz_idx_t nnz)166 rsb_err_t rsb_assign_subm__(rsb_coo_idx_t * RSB_RESTRICT IA, rsb_coo_idx_t * RSB_RESTRICT JA, rsb_coo_idx_t m, rsb_coo_idx_t k, rsb_nnz_idx_t nnz)
167 {
168 rsb_err_t errval = RSB_ERR_NO_ERROR;
169 const rsb_coo_idx_t nlev = 4;
170 /* FIXME: irm, jrm fit together in a single byte! (1byte/nnz!) */
171 rsb_coo_idx_t nlevi,nlevj;
172 rsb_coo_idx_t mink = RSB_POW2(nlev),maxk = mink;
173 rsb_nnz_idx_t scnt[16*16], nzi;
174 rsb_coo_idx_t idiv[ 16];
175 rsb_coo_idx_t jdiv[ 16];
176 rsb_nnz_idx_t nzoff = 0;
177 //rsb_coo_idx_t * zIA, * zJA;
178 rsb_byte_t * zIA = NULL, * zJA = NULL;
179 rsb_coo_idx_t * oIA = NULL, * oJA = NULL;
180
181 RSB_BZERO_P(&scnt);
182 oJA = rsb__malloc(nnz*sizeof(rsb_coo_idx_t));
183 oIA = rsb__malloc(nnz*sizeof(rsb_coo_idx_t));
184 zJA = rsb__malloc(nnz*sizeof(rsb_coo_idx_t));
185 zIA = rsb__malloc(nnz*sizeof(rsb_coo_idx_t));
186 if(!oJA || !oIA) goto err;
187 if(!zJA || !zIA) goto err;
188 for(nzi=0;RSB_LIKELY(nzi<nnz);++nzi)
189 {
190 const rsb_coo_idx_t i = IA[nzi],j = JA[nzi];
191 rsb_coo_idx_t irm = 0,jrm = 0;
192 rsb_coo_idx_t sm = m,sk = k;
193 rsb_coo_idx_t om = 0,ok = 0;
194 rsb_int sidx;
195 for(nlevi=0;nlevi<nlev;++nlevi)
196 {
197 rsb_coo_idx_t hm = RSB_HALF(sm),hk = RSB_HALF(sk);
198 RSB_MUL2(irm);RSB_MUL2(jrm);
199 if(i>=hm+om){irm += 1;sm-=hm;om += hm;}else{sm = hm;}
200 if(j>=hk+ok){jrm += 1;sk-=hk;ok += hk;}else{sk = hk;}
201 }
202 zIA[nzi] = irm;
203 zJA[nzi] = jrm;
204 //sidx = 16*irm+jrm;
205 sidx = rsb_coo_index_bit_interleave(irm,jrm);
206 //scnt[sidx]++;
207 //printf("hm:%d sm:%d\n",hm,sm); printf("hk:%d sk:%d\n",hk,sk);
208 //printf("%d %d -> %d %d (%d) at %d %d sized %d %d\n",i,j,irm,jrm,sidx,om,ok,sm,sk);
209 }
210 if(0)
211 for(nlevi=0;nlevi<nlev;++nlevi)
212 for(nlevj=0;nlevj<nlev;++nlevj)
213 {
214 printf("%d %d : %d\n",nlevi,nlevj,scnt[16*nlevi+nlevj]);
215 }
216 for(nlevi=1;nlevi<nlev*nlev;++nlevi)
217 scnt[nlev*nlev-nlevi] = scnt[nlev*nlev-nlevi-1];
218 scnt[0] = 0;
219 for(nlevi=1;nlevi<nlev*nlev;++nlevi)
220 scnt[nlevi] += scnt[nlevi-1]; /* FIXME: shall use rsb__do_prefix_sum_coo_idx_t */
221 for(nzi=0;RSB_LIKELY(nzi<nnz);++nzi)
222 {
223 rsb_int sidx;
224 rsb_coo_idx_t irm = 0,jrm = 0;
225 irm = zIA[nzi];
226 jrm = zJA[nzi];
227 sidx = rsb_coo_index_bit_interleave(irm,jrm);
228 //sidx = 0;
229 oIA[ scnt[sidx] ] = IA[nzi];
230 oJA[ scnt[sidx]++] = JA[nzi];
231 }
232 rsb__memcpy(IA,oIA,sizeof(rsb_coo_idx_t)*nnz);
233 rsb__memcpy(JA,oJA,sizeof(rsb_coo_idx_t)*nnz);
234 //for(nzi=0;RSB_LIKELY(nzi<nnz);++nzi)
235 //printf("please ignore this value: %d\n",scnt[0]);
236 err:
237 RSB_CONDITIONAL_FREE(oJA);
238 RSB_CONDITIONAL_FREE(oIA);
239 RSB_CONDITIONAL_FREE(zJA);
240 RSB_CONDITIONAL_FREE(zIA);
241 return errval;
242 }
243
/* EXPERIMENTAL benchmark driver: Z-orders the index arrays in cache-sized
 * chunks, prints a throughput report, and then TERMINATES THE PROCESS via
 * exit(0). Only compiled when RSB_WANT_FASTER_EXPERIMENTAL_CONSTRUCTOR is
 * set (it is 0 by default), so it never runs in production builds. */
static void rsb_allocate_new__(void *RSB_RESTRICT VA, rsb_coo_idx_t * RSB_RESTRICT IA, rsb_coo_idx_t * RSB_RESTRICT JA, rsb_coo_idx_t m, rsb_coo_idx_t k, rsb_nnz_idx_t nnz, rsb_type_t typecode, const struct rsb_mtx_partitioning_info_t * pinfop, rsb_flags_t flags, rsb_err_t *errvalp)
{
	rsb_time_t dt;
	long cs = rsb__get_first_level_c_size(); /* first-level cache size, in bytes */
	//long cs = rsb__get_lastlevel_c_size();
	/* chunk size: as many nonzeroes as keep 4 index words per nnz inside the cache */
	rsb_nnz_idx_t bnz = RSB_MIN(cs/(4*sizeof(rsb_coo_idx_t)),nnz),fnz;
	//rsb_nnz_idx_t bnz = nnz,fnz;
	if(!getenv("RSB_CB"))bnz = nnz; /* unless RSB_CB is set, process everything in one chunk */
	RSB_TIC(dt);
	for(fnz=0;fnz<nnz;fnz+=bnz)
	{
		rsb_assign_subm__(IA+fnz,JA+fnz,m,k,RSB_MIN(bnz,nnz-fnz));
	}
	RSB_TOC(dt);
	printf("%d cache blocks\n",(nnz+bnz-1)/bnz);
	printf("EXPERIMENTAL: processed indices at %lf Mnnz/s in %lf s\n",(RSB_FPINV(dt)*nnz)/RSB_MILLION_F,dt);
	exit(0);
}
262 #endif /* RSB_WANT_FASTER_EXPERIMENTAL_CONSTRUCTOR */
263
void rsb__do_set_in_place_submatrices_offsets(struct rsb_mtx_t *RSB_RESTRICT submatrices, rsb_submatrix_idx_t cmc, rsb_char_t *RSB_RESTRICT VA, rsb_coo_idx_t *RSB_RESTRICT IA, rsb_coo_idx_t *RSB_RESTRICT JA, size_t el_size)
{
	/**
	 	\ingroup gr_internals
		Points each terminal (non quad-partitioned) submatrix at its own
		slice of the shared IA/JA/VA arrays, using its nonzeroes offset.
	 	\note: if nnz==0 and diagonal implicit, this could be dangerous
	*/
	rsb_submatrix_idx_t idx;

	for(idx=0;idx<cmc;++idx)
	{
		struct rsb_mtx_t *const mtxp = &submatrices[idx];

		if(RSB_DO_FLAG_HAS(mtxp->flags,RSB_FLAG_QUAD_PARTITIONING))
			continue; /* inner nodes own no nonzeroes arrays of their own */
		mtxp->bpntr = IA + mtxp->nzoff;
		mtxp->bindx = JA + mtxp->nzoff;
		mtxp->VA = ((rsb_char_t*)VA) + el_size * mtxp->nzoff;
	}
}
282
rsb__do_switch_recursive_matrix_to_fullword_storage(struct rsb_mtx_t * mtxAp)283 rsb_err_t rsb__do_switch_recursive_matrix_to_fullword_storage(struct rsb_mtx_t * mtxAp)
284 {
285 /**
286 \ingroup gr_internals
287 TODO: move somewhere else
288 FIXME: may be UNFINISHED
289 */
290 rsb_err_t errval = RSB_ERR_NO_ERROR;
291 if(!mtxAp)
292 {
293 RSB_ERROR(RSB_ERRM_E_MTXAP);
294 return RSB_ERR_BADARGS;
295 }
296
297 if(rsb__is_recursive_matrix(mtxAp->flags))
298 {
299 rsb_submatrix_idx_t i,j;
300 struct rsb_mtx_t * submatrix;
301 RSB_SUBMATRIX_FOREACH(mtxAp,submatrix,i,j)
302 if(submatrix)
303 RSB_DO_ERROR_CUMULATE(errval,rsb__do_switch_recursive_matrix_to_fullword_storage(submatrix));
304 }
305 else
306 {
307 // if(rsb__do_is_candidate_for_halfword_coo(mtxAp))
308 // RSB_DO_ERROR_CUMULATE(errval,rsb__do_switch_to_fullword_coo(mtxAp));
309 // else
310 // if(rsb__do_is_candidate_for_halfword_csr(mtxAp))
311 // RSB_DO_ERROR_CUMULATE(errval,rsb__do_switch_to_fullword_csr(mtxAp));
312 // else
313 // if(!rsb__is_root_matrix(mtxAp) || rsb__is_terminal_recursive_matrix(mtxAp)) /* root recursive or root nonrec. */
314 // RSB_DO_FLAG_DEL(mtxAp->flags,RSB_FLAG_MUTUALLY_EXCLUSIVE_SWITCHES);
315 // else
316 // ;/* for root matrices, we keep the flags, because some of the leaves MAY have it */
317 if( mtxAp->matrix_storage == RSB_MATRIX_STORAGE_BCOR )
318 {
319 if(RSB_DO_FLAG_HAS(mtxAp->flags,RSB_FLAG_USE_HALFWORD_INDICES))
320 {
321 rsb__do_switch_array_to_fullword_coo((rsb_half_idx_t*)(mtxAp->bpntr),mtxAp->nnz,0);
322 rsb__do_switch_array_to_fullword_coo((rsb_half_idx_t*)(mtxAp->bindx),mtxAp->nnz,0);
323 RSB_DO_FLAG_DEL(mtxAp->flags,RSB_FLAG_USE_HALFWORD_INDICES);
324 }
325 }
326 else
327 if( mtxAp->matrix_storage == RSB_MATRIX_STORAGE_BCSR )
328 {
329 if(RSB_DO_FLAG_HAS(mtxAp->flags,RSB_FLAG_USE_HALFWORD_INDICES))
330 {
331 rsb__do_switch_array_to_fullword_coo((rsb_half_idx_t*)(mtxAp->bindx),mtxAp->nnz,0);
332 RSB_DO_FLAG_DEL(mtxAp->flags,RSB_FLAG_USE_HALFWORD_INDICES);
333 }
334 }
335 else
336 errval = RSB_ERR_BADARGS;
337 }
338
339 RSB_DO_ERR_RETURN(errval)
340 }
341
static rsb_err_t rsb_do_switch_fresh_terminal_matrix_to_halfword_storages(struct rsb_mtx_t * mtxAp)
{
	/**
		\ingroup gr_unfinished
		Converts a freshly assembled terminal (leaf) matrix to halfword
		(16-bit) index storage where it qualifies: a COO leaf may become
		HCOO, otherwise a CSR leaf may become HCSR; in every other case the
		halfword-indices flag is cleared. Rejects NULL and non-leaf input.
		TODO: move somewhere else
	*/
	rsb_err_t errval = RSB_ERR_NO_ERROR;
	if(!mtxAp)
	{
		RSB_ERROR(RSB_ERRM_E_MTXAP);
		return RSB_ERR_BADARGS;
	}

	if(rsb__is_recursive_matrix(mtxAp->flags))
	{
		/* only terminal matrices may be switched here */
		RSB_ERROR(RSB_ERRM_ES);
		return RSB_ERR_BADARGS;
	}
	else
	{
		/* (disabled) verbose dump of the leaf's halfword candidacy */
		if(RSB_C2R_IF_VERBOSE && 0)
			RSB_INFO_MATRIX_SUMMARY(mtxAp),
			RSB_INFO(" -switch.."),
			RSB_INFO("HCOO?(%d)..",rsb__do_is_candidate_for_halfword_coo(mtxAp)),
			RSB_INFO("HCSR?(%d)",rsb__do_is_candidate_for_halfword_csr(mtxAp)),
			RSB_INFO("\n");
		if(RSB_DO_FLAG_HAS(mtxAp->flags,RSB_FLAG_WANT_COO_STORAGE))
		{
			/* COO leaf: switch to halfword COO when the indices fit */
			if(rsb__do_is_candidate_for_halfword_coo(mtxAp))
			{
				if(RSB_C2R_IF_VERBOSE)
					RSB_INFO("to halfword COO:"),RSB_INFO_MATRIX_SUMMARY(mtxAp),RSB_INFO("\n");
				RSB_DO_ERROR_CUMULATE(errval,rsb__do_switch_to_halfword_coo(mtxAp));
			}
			else
				RSB_DO_FLAG_DEL(mtxAp->flags,RSB_FLAG_USE_HALFWORD_INDICES);
		}
		else
//		if(rsb_do_is_candidate_for_fullword_coo(mtxAp))
//		{
//			RSB_DO_ERROR_CUMULATE(errval,rsb__do_switch_recursive_in_place_matrix_to_in_place_rcoo(mtxAp,RSB_BOOL_FALSE));
//			RSB_DO_ERROR_CUMULATE(errval,rsb__do_switch_to_fullword_coo(mtxAp));// FIXME: wrong naming :(
//		}
//		else
#if RSB_WANT_FIX_BUG_DISCOVERED_20121210
		if(!RSB_DO_FLAG_HAS(mtxAp->flags,RSB_FLAG_WANT_COO_STORAGE))
#else /* RSB_WANT_FIX_BUG_DISCOVERED_20121210 */
		if( RSB_DO_FLAG_HAS(mtxAp->flags,RSB_FLAG_WANT_COO_STORAGE))
#endif /* RSB_WANT_FIX_BUG_DISCOVERED_20121210 */
		{
			/* NOTE(review): rsb__do_is_candidate_for_halfword_csr looks like a
			 * boolean predicate (used bare above), yet here it is wrapped in
			 * RSB_SOME_ERROR — confirm this is intended and not inverted. */
			if(RSB_SOME_ERROR(rsb__do_is_candidate_for_halfword_csr(mtxAp)))
			{
				if(RSB_C2R_IF_VERBOSE)
					RSB_INFO("to halfword CSR:"),RSB_INFO_MATRIX_SUMMARY(mtxAp),RSB_INFO("\n");
				RSB_DO_ERROR_CUMULATE(errval,rsb__do_switch_to_halfword_csr(mtxAp));
			}
			else
				RSB_DO_FLAG_DEL(mtxAp->flags,RSB_FLAG_USE_HALFWORD_INDICES);
		}
		else
		//if(!rsb__is_root_matrix(mtxAp) || rsb__is_terminal_recursive_matrix(mtxAp)) /* root recursive or root nonrec. */
		//	RSB_DO_FLAG_DEL(mtxAp->flags,RSB_FLAG_MUTUALLY_EXCLUSIVE_SWITCHES);
		//else
			RSB_DO_FLAG_DEL(mtxAp->flags,RSB_FLAG_USE_HALFWORD_INDICES);
			;/* for root matrices, we keep the flags, because some of the leaves MAY have it */
	}

	RSB_DO_ERR_RETURN(errval)
}
411
412 #if !RSB_WANT_MORE_PARALLELISM
rsb_do_switch_fresh_recursive_matrix_to_halfword_storages(struct rsb_mtx_t * mtxAp)413 static rsb_err_t rsb_do_switch_fresh_recursive_matrix_to_halfword_storages(struct rsb_mtx_t * mtxAp)
414 {
415 /**
416 \ingroup gr_unfinished
417 TODO: move somewhere else
418 */
419 rsb_err_t errval = RSB_ERR_NO_ERROR;
420 if(!mtxAp)
421 {
422 RSB_ERROR(RSB_ERRM_E_MTXAP);
423 return RSB_ERR_BADARGS;
424 }
425
426 if(rsb__is_recursive_matrix(mtxAp->flags))
427 {
428 rsb_submatrix_idx_t i,j;
429 const struct rsb_mtx_t * submatrix;
430 RSB_SUBMATRIX_FOREACH(mtxAp,submatrix,i,j)
431 if(submatrix)
432 RSB_DO_ERROR_CUMULATE(errval,rsb_do_switch_fresh_recursive_matrix_to_halfword_storages(submatrix));
433 }
434 else
435 errval = rsb_do_switch_fresh_terminal_matrix_to_halfword_storages(mtxAp);
436
437 RSB_DO_ERR_RETURN(errval)
438 }
439 #endif /* RSB_WANT_MORE_PARALLELISM */
440
rsb__check_bounds(struct rsb_mtx_t * mtxAp)441 rsb_err_t rsb__check_bounds(struct rsb_mtx_t * mtxAp)
442 {
443 /* FIXME: need checks on Mdim, mdim, ... */
444 rsb_err_t errval = RSB_ERR_NO_ERROR;
445
446 if(
447 mtxAp->broff<0 || mtxAp->bcoff<0 ||
448 /* mtxAp->broff<mtxAp->roff+broff || mtxAp->bcoff<mtxAp->coff+bcoff || */
449 mtxAp->bm>mtxAp->nr ||
450 mtxAp->bk>mtxAp->nc
451 )
452 {
453 RSB_ERROR(RSB_PRINTF_MATRIX_BOUNDS_SUMMARY_ARGS(mtxAp)); RSB_ERROR("\n");
454
455 RSB_ASSERT(! ( mtxAp->broff<0) );
456 RSB_ASSERT(! ( mtxAp->bcoff<0) );
457 /* RSB_ASSERT(! ( mtxAp->broff<mtxAp->roff+broff) );
458 RSB_ASSERT(! ( mtxAp->bcoff<mtxAp->coff+bcoff) ); */
459 RSB_ASSERT(! ( mtxAp->bm>mtxAp->nr) );
460 RSB_ASSERT(! ( mtxAp->bk>mtxAp->nc) );
461
462 errval = RSB_ERR_INTERNAL_ERROR;
463 RSB_ERROR(RSB_ERRM_BCE);
464 RSB_ERROR(RSB_ERRM_BM),RSB_ERROR_MATRIX_SUMMARY(mtxAp),RSB_ERROR(RSB_ERRM_NL);
465 return errval;
466 }
467 return errval;
468 }
469
rsb_err_t rsb__compute_bounded_box(struct rsb_mtx_t * mtxAp)
{
	/*
	 * Computes the tight bounding box of the nonzeroes of a terminal matrix
	 * (fields broff, bcoff, bm, bk), dispatching on storage format (COO/CSR)
	 * and index width (halfword/fullword).
	 * Set have to be: nr, nc.
	 * Will compute: ...
	 *
	 * TODO: make sure it does not depend on RSB_FLAG_QUAD_PARTITIONING.
	 * */
	rsb_err_t errval = RSB_ERR_NO_ERROR;
	rsb_coo_idx_t broff = RSB_INVALID_COO_IDX_VAL,bcoff = RSB_INVALID_COO_IDX_VAL,bm = RSB_INVALID_COO_IDX_VAL,bk = RSB_INVALID_COO_IDX_VAL;

	if(rsb__is_coo_matrix(mtxAp))
	{
		//rsb_nnz_idx_t nnz0 = 0,nnz1 = mtxAp->nnz;
		if(RSB_DO_FLAG_HAS(mtxAp->flags,RSB_FLAG_USE_HALFWORD_INDICES))
		{
			/* halfword COO: scan both index arrays for their extrema */
			RSB_DECLARE_CONST_HALFCOO_ARRAYS_FROM_MATRIX(IA,JA,mtxAp)
			rsb_half_idx_t li, ui;
			rsb_half_idx_t lj, uj;
			// FIXME: could optimize, since IA is sorted
			rsb__util_find_extremal_half_index_val(IA,mtxAp->nnz,0,mtxAp->nr,&li,&ui);
			rsb__util_find_extremal_half_index_val(JA,mtxAp->nnz,0,mtxAp->nc,&lj,&uj);
			/* box height/width are one past the largest index; offsets are the smallest */
			bk = 1;bk += uj; bm = 1;bm += ui; broff = li; bcoff = lj;
		}
		else
		{
			/* fullword COO: same as above with fullword scans */
			RSB_DECLARE_CONST_FULLCOO_ARRAYS_FROM_MATRIX(IA,JA,mtxAp)
			rsb_coo_idx_t li, ui;
			rsb_coo_idx_t lj, uj;
			// FIXME: could optimize, since IA is sorted
			rsb__util_find_extremal_full_index_val(IA,mtxAp->nnz,0,mtxAp->nr,&li,&ui);
			rsb__util_find_extremal_full_index_val(JA,mtxAp->nnz,0,mtxAp->nc,&lj,&uj);
			bk = 1;bk += uj; bm = 1;bm += ui; broff = li; bcoff = lj;
		}
	}
	else
	if(rsb__is_csr_matrix(mtxAp))
	{
		if(RSB_DO_FLAG_HAS(mtxAp->flags,(RSB_FLAG_USE_HALFWORD_INDICES)))
		{
			/* halfword CSR: columns from JA extrema; row bounds via binary
			 * search for the first/last non-empty row in the pointers PA */
			RSB_DECLARE_CONST_HALFCSR_ARRAYS_FROM_MATRIX(PA,JA,mtxAp)
			rsb_half_idx_t lj, uj;
			rsb_coo_idx_t li, ui;
			/* NOTE(review): JA holds column indices but the search bound is
			 * mtxAp->nr (rows), not nc — looks suspicious; confirm. */
			rsb__util_find_extremal_half_index_val(JA,mtxAp->nnz,0,mtxAp->nr,&lj,&uj);
			ui = rsb__nnz_split_nnz_bsearch(PA,mtxAp->nnz,mtxAp->nr+1);
			li = rsb__nnz_split_nnz_bsearch(PA,1,mtxAp->nr+1)-1;
			bk = 1;bk += uj; bm = ui; broff = li; bcoff = lj;
		}
		else
		{
			/* fullword CSR: as above with fullword scans */
			RSB_DECLARE_CONST_FULLCSR_ARRAYS_FROM_MATRIX(PA,JA,mtxAp)
			rsb_coo_idx_t lj, uj;
			rsb_coo_idx_t li, ui;
			/* NOTE(review): same nr-vs-nc doubt as in the halfword branch */
			rsb__util_find_extremal_full_index_val(JA,mtxAp->nnz,0,mtxAp->nr,&lj,&uj);
			ui = rsb__nnz_split_nnz_bsearch(PA,mtxAp->nnz,mtxAp->nr+1);
			li = rsb__nnz_split_nnz_bsearch(PA,1,mtxAp->nr+1)-1;
			bk = 1;bk += uj; bm = ui; broff = li; bcoff = lj;
		}
	}
	else
		RSB_ERROR(RSB_ERRMSG_BADFORMAT);

	/* box offsets are stored globally (matrix offsets added in) */
	mtxAp->broff = mtxAp->roff+broff;
	mtxAp->bcoff = mtxAp->coff+bcoff;
	mtxAp->bm = bm;
	mtxAp->bk = bk;

	errval = rsb__check_bounds(mtxAp);
#if 0
	RSB_INFO("bounding box of "),RSB_INFO_MATRIX_SUMMARY(mtxAp),
	RSB_INFO(": %.2f%% x %.2f %%\n",(100.0f*(float)(bm-broff))/mtxAp->nr,(100.0f*(float)(bk-bcoff))/mtxAp->nc);
	RSB_INFO(": %d,%d %d,%d\n",mtxAp->roff+broff,mtxAp->coff+bcoff,bm,bk);
	RSB_INFO(": %d,%d %d,%d\n",mtxAp->roff+broff,mtxAp->coff+bcoff,bm,bk);
#endif
	return errval;
}
546
rsb_do_compute_bounded_boxes(struct rsb_mtx_t * mtxAp)547 static rsb_err_t rsb_do_compute_bounded_boxes(struct rsb_mtx_t * mtxAp)
548 {
549 /**
550 \ingroup gr_unfinished
551 */
552 rsb_err_t errval = RSB_ERR_NO_ERROR;
553 rsb_submatrix_idx_t smi = 0;
554 rsb_bool_t want_really = 0;
555 #if RSB_WANT_BOUNDED_BOXES
556 want_really = (rsb_global_session_handle.want_bounded_box!=0);
557 #else /* RSB_WANT_BOUNDED_BOXES */
558 mtxAp->broff = roff;
559 mtxAp->bcoff = coff;
560 mtxAp->bm = m;
561 mtxAp->bk = k;
562 goto err;
563 #endif /* RSB_WANT_BOUNDED_BOXES */
564
565 if(!mtxAp)
566 {
567 RSB_ERROR(RSB_ERRM_E_MTXAP);
568 return RSB_ERR_BADARGS;
569 }
570
571 if(mtxAp->nnz==0)
572 goto err;
573
574 if(want_really)
575 {
576 if(rsb__is_terminal_recursive_matrix(mtxAp)) // fix for serial 20101206
577 {
578 RSB_DO_ERROR_CUMULATE(errval,rsb__compute_bounded_box(mtxAp));
579 goto err;
580 }
581 #pragma omp parallel for schedule(static,1) reduction(|:errval) shared(mtxAp) RSB_NTC
582 for(smi=0;smi<mtxAp->all_leaf_matrices_n;++smi)
583 {
584 struct rsb_mtx_t * submatrix = mtxAp->all_leaf_matrices[smi].mtxlp;
585 RSB_DO_ERROR_CUMULATE(errval,rsb__compute_bounded_box(submatrix));
586 }
587 #pragma omp barrier
588 }
589 else
590 {
591 #pragma omp parallel for schedule(static,1) reduction(|:errval) shared(mtxAp) RSB_NTC
592 for(smi=0;smi<mtxAp->all_leaf_matrices_n;++smi)
593 {
594 struct rsb_mtx_t * submatrix = mtxAp->all_leaf_matrices[smi].mtxlp;
595 submatrix->bm = submatrix->nr;
596 submatrix->bk = submatrix->nc;
597 submatrix->broff = submatrix->roff;
598 submatrix->bcoff = submatrix->coff;
599 }
600 }
601 err:
602 RSB_DO_ERR_RETURN(errval)
603 }
604
rsb_do_switch_fresh_recursive_matrix_to_halfword_storages_parallel(struct rsb_mtx_t * mtxAp)605 static rsb_err_t rsb_do_switch_fresh_recursive_matrix_to_halfword_storages_parallel(struct rsb_mtx_t * mtxAp)
606 {
607 /**
608 \ingroup gr_unfinished
609 TODO: move somewhere else
610 */
611 rsb_err_t errval = RSB_ERR_NO_ERROR;
612 rsb_submatrix_idx_t smi = 0;
613 if(!mtxAp)
614 {
615 errval = RSB_ERR_BADARGS;
616 RSB_PERR_GOTO(err,RSB_ERRM_E_MTXAP);
617 }
618
619 /* FIXME: 20100809 it seems that 'switching' a 0-nnz matrix overwrites something which should not be overwritten */
620 if(mtxAp->nnz==0)
621 goto err;
622
623 if(rsb__is_terminal_recursive_matrix(mtxAp)) // fix for serial 20101206
624 {
625 RSB_DO_ERROR_CUMULATE(errval,rsb_do_switch_fresh_terminal_matrix_to_halfword_storages(mtxAp));
626 goto err;
627 }
628 #pragma omp parallel for schedule(static,1) reduction(|:errval) shared(mtxAp) RSB_NTC
629 for(smi=0;smi<mtxAp->all_leaf_matrices_n;++smi)
630 {
631 struct rsb_mtx_t * submatrix = mtxAp->all_leaf_matrices[smi].mtxlp;
632 RSB_DO_ERROR_CUMULATE(errval,rsb_do_switch_fresh_terminal_matrix_to_halfword_storages(submatrix));
633 }
634 #pragma omp barrier
635 err:
636 RSB_DO_ERR_RETURN(errval)
637 }
638
639 #if 0
/* DISABLED (#if 0): in-place partition of a row-major element array into
 * "left" and "right" halves per row, using IL (row starts), IM (per-row
 * split points) and a scratch buffer WA; sz is the element size in bytes.
 * Left subrows are compacted toward the front, right subrows toward the
 * back, buffering displaced data in WA from both ends. */
static rsb_err_t rsb_do_shuffle_left_and_right_rows_inner(rsb_coo_idx_t * RSB_RESTRICT IA, rsb_coo_idx_t m, rsb_coo_idx_t m0, rsb_nnz_idx_t nnz, rsb_nnz_idx_t nnz0, rsb_coo_idx_t * RSB_RESTRICT IL, rsb_coo_idx_t * IM, rsb_coo_idx_t * RSB_RESTRICT WA, size_t sz)
{
	/**
		\ingroup gr_unfinished
		FIXME: UNFINISHED, EXPERIMENTAL
	*/
	rsb_err_t errval = RSB_ERR_NO_ERROR;
	rsb_coo_idx_t iu = m0,id = m-1; /* cursors: iu sweeps down from row m0, id up from the last row */
	rsb_coo_idx_t wl = nnz,wr = 0,ns = 0,nu = 0,ws = 0,nd = nnz; /* buffer/array fill counters */

	if( sz<1 || !IA || !IM || !IL || !WA || RSB_INVALID_NNZ_INDEX(nnz) || RSB_INVALID_COO_INDEX(m) )
	{
		errval = RSB_ERR_BADARGS;
		RSB_PERR_GOTO(err,RSB_ERRM_E_MTXAP);
	}
	if(RSB_UNLIKELY(IL[m]!=nnz))
	{
		errval = RSB_ERR_INTERNAL_ERROR;
		RSB_PERR_GOTO(err,RSB_ERRM_ES);
	}
	if(iu>=id)
	{
		errval = RSB_ERR_INTERNAL_ERROR;
		RSB_PERR_GOTO(err,RSB_ERRM_ES);
	}
	nu = IL[iu];

	while(RSB_LIKELY(iu<=id))
	{
		/* compute the left subrow length */
		ns = IM[iu]-IL[iu];
		/* shift left the left subrow */
		RSB_A_MEMMOVE(IA,IA,nu,IL[iu],ns,sz);
		/* update the counter of left subrows elements in IA */
		nu += ns;
		/* compute the right subrow length */
		ws = (IL[iu+1]-IM[iu]);
		/* buffer the right subrow */
		RSB_A_MEMCPY(WA,IA,wr,IM[iu],ws,sz);
		/* update the (complementary) counter of right subrows elements in the buffer */
		wr += ws;

		if(RSB_UNLIKELY(iu>=id))
		{
			/* skip row, as id was already done */
			++id;
			goto done;
		}
		/* compute the right subrow length */
		ns = IL[id+1]-IM[id];
		/* update the (complementary) counter of right subrows elements in IA */
		nd -= ns;
		/* shift right the right subrow */
		RSB_A_MEMMOVE(IA,IA,nd,IM[id],ns,sz);
		/* compute the left subrow length */
		ws = IM[id]-IL[id];
		/* update the counter of right subrows elements in the buffer */
		wl -= ws;
		/* buffer the left subrow */
		RSB_A_MEMCPY(WA,IA,wl,IL[id],ws,sz);

		++iu,--id;
	}
	/* IA has definitive elements, from left at 0..nu-1 and from right at (nnz-nd)..nnz-1 */
	{
		//rsb_nnz_idx_t
	}
	/* WA has definitive elements, from right at 0..wr-1 and from left at wl..nnz-1 */
	/* it should be : nnz == nu + nnz-wl+nd-wr */
	if(nu+((nnz)-wl)!=nd-wr)
	{
		errval = RSB_ERR_INTERNAL_ERROR;
		RSB_PERR_GOTO(err,RSB_ERRM_ES);
	}
done:
	/* compute the number of left submatrix elements in the buffer */
	ns = (nnz)-wl;
	/* copy the partial left submatrix from the buffer to the array */
	RSB_A_MEMMOVE(IA,WA,nu,wl,ns,sz);
	/* update the counter of left subrows elements in IA */
	nu += ns;
	/* compute the number of right submatrix elements in the buffer */
	ns = wr;
	/* copy the partial right submatrix from the buffer to the array */
	RSB_A_MEMMOVE(IA,WA,nu,0,ns,sz);
	/* update the counter to all subrows elements in IA (those already present, too) */
	nd -= ns;

	/* minimal coherence check */
err:
	if(RSB_UNLIKELY(nu!=nd))
	{
		RSB_ERROR("nnz=%d != nu+nd = %d; nu=%d, wl=%d, wr=%d, nd=%d\n",nnz,nu+nd,nu,wl,wr,nd);
	//	RSB_ERROR("nnz=%d != nu+nnz-wl+nd = %d; nu=%d, wl=%d, wr=%d, nd=%d\n",nnz,nu+nnz+wl+nd,nu,wl,wr,nd);
		errval = RSB_ERR_INTERNAL_ERROR;
	}
	/* the buffer is empty now, and the arrays are left-right partitioned */
	RSB_DO_ERR_RETURN(errval)
}
739 #endif
740
741 #if 0
/* DISABLED (#if 0): applies the left/right row shuffle to all three
 * coordinate arrays (IA, JA with index-sized elements; VA with
 * typecode-sized elements), sharing the IL/IM split and scratch WA. */
static rsb_err_t rsb_do_shuffle_left_and_right_rows(void * RSB_RESTRICT VA, rsb_coo_idx_t * RSB_RESTRICT IA, rsb_coo_idx_t * RSB_RESTRICT JA, rsb_coo_idx_t m, rsb_coo_idx_t m0, rsb_nnz_idx_t nnz, rsb_nnz_idx_t nnz0, rsb_type_t typecode, rsb_coo_idx_t * RSB_RESTRICT IL, rsb_coo_idx_t * IM, rsb_coo_idx_t * RSB_RESTRICT WA)
{
	/**
		\ingroup gr_unfinished
		FIXME: UNFINISHED, EXPERIMENTAL
	*/
	rsb_err_t errval = RSB_ERR_NO_ERROR;
	const size_t sz = sizeof(rsb_coo_idx_t);
	if(!IL || !IM || !WA)
	{
		errval = RSB_ERR_BADARGS;
		RSB_PERR_GOTO(err,RSB_ERRM_E_MTXAP);
	}
	RSB_DO_ERROR_CUMULATE(errval,rsb_do_shuffle_left_and_right_rows_inner(IA,m,m0,nnz,nnz0,IL,IM,WA,sz));
	RSB_DO_ERROR_CUMULATE(errval,rsb_do_shuffle_left_and_right_rows_inner(JA,m,m0,nnz,nnz0,IL,IM,WA,sz));
	/* values use the numerical type's own element size */
	RSB_DO_ERROR_CUMULATE(errval,rsb_do_shuffle_left_and_right_rows_inner(VA,m,m0,nnz,nnz0,IL,IM,WA,RSB_SIZEOF(typecode)));
err:
	RSB_DO_ERR_RETURN(errval)
}
761 #endif
762
/* Counts — without moving data — how many nonzeroes of the submatrix
 * [roff..roff+m) x [coff..coff+k) fall in each of its four quadrants,
 * split at row roff+hm and column coff+hk.
 * IB is the global row-pointers array (IB[i]..IB[i+1] delimits row i);
 * within each row, JA must be sorted so binary search applies.
 * Results: *ulp upper-left, *urp upper-right, *llp lower-left, *lrp
 * lower-right nonzero counts. */
static rsb_err_t rsb_do_compute_vertical_split_search_only(
		const rsb_coo_idx_t * RSB_RESTRICT IA, const rsb_coo_idx_t * RSB_RESTRICT JA,
		rsb_coo_idx_t roff, rsb_coo_idx_t coff, rsb_coo_idx_t m, rsb_coo_idx_t k,
		rsb_coo_idx_t hm, rsb_coo_idx_t hk, rsb_nnz_idx_t nnz,
		const rsb_coo_idx_t * IB, rsb_nnz_idx_t *ulp, rsb_nnz_idx_t *urp, rsb_nnz_idx_t *llp, rsb_nnz_idx_t *lrp)
{
	/**
		\ingroup gr_unfinished
	*/
	rsb_nnz_idx_t ul = 0,ur = 0,ll = 0,lr = 0;
	rsb_err_t errval = RSB_ERR_NO_ERROR;
	rsb_nnz_idx_t dnnz = 0/*,wdnnz = 0,rnnz = 0,hrnnz = 0*/;
	register rsb_coo_idx_t i;
	//rsb_nnz_idx_t nnz0 = 0;
	//rsb_coo_idx_t xroff = 0;


	/* NOTE: the `|| 1' makes the row-scan branch unconditional; the
	 * nnz-driven branch below is unfinished and disabled */
	if(nnz>m || 1)
	//if(nnz>m)
	{
		/* scan each row of the submatrix, binary-searching JA for the
		 * submatrix's column window and its vertical split point */
		for(i = roff;RSB_LIKELY(i<roff+m);++i)
		{
			// offset of line i in the global line pointers array
			rsb_nnz_idx_t nnz0 = IB[i];
			// nnz1..nnz0 are the boundaries of line i
			rsb_nnz_idx_t nnz1 = IB[i+1];
			rsb_nnz_idx_t nnz2 = 0;
			// check
			RSB_C2R_ASSERT(nnz0>=IB[i]);
			RSB_C2R_ASSERT(nnz1<=IB[i+1]);
			// skip line if empty
			if(nnz1-nnz0<1)continue;
			// find first element of line i also in the submatrix
			nnz0 += rsb__nnz_split_coo_bsearch(JA+nnz0,coff,nnz1-nnz0);
			// skip line if empty in the submatrix
			if(nnz1-nnz0<1)continue;
			// find the length of the subrow i in the submatrix
			nnz1 = nnz0+rsb__nnz_split_coo_bsearch(JA+nnz0,coff+k,nnz1-nnz0);
			//check
			RSB_C2R_ASSERT(JA[nnz0+0]>=coff);
			// skip line if empty in the submatrix
			if(nnz1-nnz0<1)continue;
			// nnz2: first element at or past the vertical split column
			nnz2 = nnz0+rsb__nnz_split_coo_bsearch(JA+nnz0,coff+hk,nnz1-nnz0);
			RSB_C2R_ASSERT(nnz1<=IB[i+1]);
			RSB_C2R_ASSERT(JA[nnz0+0]>=coff);
			RSB_C2R_ASSERT(JA[nnz1-1]< coff+k);
			dnnz += nnz1-nnz0;
			/* accumulate into upper or lower quadrant pair by row */
			if(i<roff+hm)
				ul += nnz2-nnz0,
				ur += nnz1-nnz2;
			else
				ll += nnz2-nnz0,
				lr += nnz1-nnz2;
		}
	}
	else
	{
		// FIXME: UNFINISHED
		/* (dead code) nnz-driven scan, intended for very sparse rows */
		rsb_nnz_idx_t nnz0,nnz1,n;
		//RSB_INFO("almost empty matrix !\n");
		for(n=0;n<nnz;++n)
		{
			rsb_nnz_idx_t nnz2 = 0;
			i = IA[n];
			nnz0 = IB[i];
			nnz1 = IB[i+1];
			// ...
#if 1
			// skip line if empty
			if(nnz1-nnz0<1)continue;
			// find first element of line i also in the submatrix
			nnz0 += rsb__nnz_split_coo_bsearch(JA+nnz0,coff,nnz1-nnz0);
			// skip line if empty in the submatrix
			if(nnz1-nnz0<1)continue;
			// find the length of the subrow i in the submatrix
			nnz1 = nnz0+rsb__nnz_split_coo_bsearch(JA+nnz0,coff+k,nnz1-nnz0);
			//check
			RSB_C2R_ASSERT(JA[nnz0+0]>=coff);
			// skip line if empty in the submatrix
			if(nnz1-nnz0<1)continue;
			nnz2 = nnz0+rsb__nnz_split_coo_bsearch(JA+nnz0,coff+hk,nnz1-nnz0);
			RSB_C2R_ASSERT(nnz1<=IB[i+1]);
			RSB_C2R_ASSERT(JA[nnz0+0]>=coff);
			RSB_C2R_ASSERT(JA[nnz1-1]< coff+k);
			dnnz += nnz1-nnz0;
			if(i<roff+hm)
				ul += nnz2-nnz0,
				ur += nnz1-nnz2;
			else
				ll += nnz2-nnz0,
				lr += nnz1-nnz2;
#else
			if(nnz1-nnz0<1)continue;
			if(i<roff+hm)
			{
				for(;n<nnz1;++n)
					if(JA[n]>=coff+hk)
						++ur;
					else
						++ul;
			}
			else
			{
				for(;n<nnz1;++n)
					if(JA[n]>=coff+hk)
						++lr;
					else
						++ll;
			}
#endif
		}
	}
//done:
	*llp = ll;
	*lrp = lr;
	*ulp = ul;
	*urp = ur;
//err:
	RSB_DO_ERR_RETURN(errval)
}
885
886 #if !RSB_WANT_PARALLEL_SUBDIVISION
rsb_do_compute_vertical_split(const rsb_coo_idx_t * RSB_RESTRICT IA,const rsb_coo_idx_t * RSB_RESTRICT JA,rsb_coo_idx_t roff,rsb_coo_idx_t coff,rsb_coo_idx_t m,rsb_coo_idx_t k,rsb_coo_idx_t hm,rsb_coo_idx_t hk,rsb_nnz_idx_t nnz,rsb_coo_idx_t * IL,rsb_coo_idx_t * RSB_RESTRICT IM,rsb_coo_idx_t * IR,rsb_nnz_idx_t * ulp,rsb_nnz_idx_t * urp,rsb_nnz_idx_t * llp,rsb_nnz_idx_t * lrp)887 static rsb_err_t rsb_do_compute_vertical_split(const rsb_coo_idx_t * RSB_RESTRICT IA, const rsb_coo_idx_t * RSB_RESTRICT JA, rsb_coo_idx_t roff, rsb_coo_idx_t coff, rsb_coo_idx_t m, rsb_coo_idx_t k, rsb_coo_idx_t hm, rsb_coo_idx_t hk, rsb_nnz_idx_t nnz, rsb_coo_idx_t * IL, rsb_coo_idx_t * RSB_RESTRICT IM, rsb_coo_idx_t * IR, rsb_nnz_idx_t *ulp, rsb_nnz_idx_t *urp, rsb_nnz_idx_t *llp, rsb_nnz_idx_t *lrp)
888 {
889 /**
890 \ingroup gr_unfinished
891 FIXME: UNFINISHED, EXPERIMENTAL
892
893 Computes two arrays: IM, IL.
894 IM[i], contains the index of the first element >= hk on line i,
895 IL[i], contains the index of the first element on line i.
896 Notes:
897 IM[i]==IL[i+1] if no element >=hk exists
898 IM[i]==IL[i] if no IL[i] >=hk
899 IL[i]==IL[i+1] if line i is empty
900 IL[0]==0
901 IL[m]==nnz
902 IM is valid on the 0..nr-1 range.
903 IL is valid on the 0..nr range.
904
905 TODO: blocking support
906 */
907 rsb_nnz_idx_t ul = 0,ur = 0,ll = 0,lr = 0;
908 rsb_err_t errval = RSB_ERR_NO_ERROR;
909 rsb_nnz_idx_t dnnz = 0,wdnnz = 0,rnnz = 0,hrnnz = 0;
910 register rsb_coo_idx_t i;
911 rsb_nnz_idx_t nnz0 = 0;
912
913 hk += coff;
914
915 if(IR==NULL && IM==NULL && IL==NULL)
916 {
917 /* FIXME: write me, for cases where we want subdivision on barely-COO matrices */
918 }
919 else
920 if(IR==NULL && IM==NULL)
921 {
922 // root matrix; should compute IL
923 IL[0] = 0;
924 // if(nnz>100*m)// TODO
925 if(0)// TODO: determine which case is faster !
926 {
927 /* fill the row pointers array */
928 // #pragma omp parallel for reduction(+:dnnz) RSB_NTC
929 // for(i=0;i<m;++i)
930 for(i=0;RSB_LIKELY(i<m);++i)
931 {
932 // delimit the current row
933 rnnz = rsb__nnz_split_coo_bsearch(IA+dnnz,i+1,nnz-dnnz);
934 /* i==m-1 || IA[dnnz+rnnz] > i */
935 dnnz += rnnz;
936 IL[i+1] = dnnz;
937 RSB_C2R_ASSERT(rnnz>=0);
938 }
939 }
940 else
941 {
942 /* for full matrices, this is faster */
943 rsb_nnz_idx_t n = 0;
944 #if 1
945 nnz0 = 0;
946 #if RSB_WANT_QUADRANT_QUICK_DETECT
947 /* FIXME: UNFINISHED */
948 if(IB[roff]==IB[roff+hm])
949 {
950 RSB_INFO("upper submatrix empty\n");
951 // should do something sharp
952 }
953 else
954 if(IB[roff+hm]==IB[roff+m])
955 {
956 RSB_INFO("lower submatrix empty\n");
957 // should do something sharp
958 }
959 #endif /* RSB_WANT_QUADRANT_QUICK_DETECT */
960 for(i=0;RSB_LIKELY(i<m);++i)
961 {
962 rnnz = 0;
963 for(;RSB_LIKELY(n<nnz && IA[n]==i);++n)
964 ++rnnz;
965 IL[i+1] = nnz0+rnnz;
966 nnz0 += rnnz;
967 }
968 #else
969 for(i=0;RSB_LIKELY(i<m);++i)
970 IL[i] = 0;
971 for(n=0;RSB_LIKELY(n<nnz);++n)
972 RSB_C2R_ASSERT(IA[n]>=0 && IA[n]<m);
973 for(n=0;RSB_LIKELY(n<nnz);++n)
974 IL[IA[n]+1]++;
975 for(i=0;RSB_LIKELY(i<m);++i)
976 IL[i+1] += IL[i];
977 #endif
978 }
979 RSB_C2R_ASSERT(IL[m]==nnz);
980 goto err;
981 }
982 else
983 if(IR==NULL)
984 {
985 RSB_C2R_ASSERT(0);
986 RSB_ASSERT(ulp);
987 RSB_ASSERT(llp);
988 RSB_ASSERT(urp);
989 RSB_ASSERT(lrp);
990 // root matrix; should compute IL
991 IL[0] = 0;
992 /* fill the row pointers array */
993 for(i=0;RSB_LIKELY(i<m);++i)
994 {
995 // delimit the current row
996 rnnz = rsb__nnz_split_coo_bsearch(IA+dnnz,i+1,nnz-dnnz);
997 /* i==m-1 || IA[dnnz+rnnz] > i */
998
999 IL[i+1] = dnnz+rnnz;
1000
1001 if(RSB_LIKELY(dnnz+rnnz<=nnz))
1002 {
1003 // the current row is non empty
1004 hrnnz = rsb__nnz_split_coo_bsearch(JA+dnnz,hk,rnnz);
1005 if(RSB_LIKELY(hrnnz<rnnz))
1006 IM[i] = dnnz+hrnnz;
1007 else
1008 // all the elements are in the left submatrix
1009 IM[i] = dnnz+ rnnz;
1010 }
1011 else
1012 // last row
1013 hrnnz = rnnz,
1014 IM[i] = nnz;
1015
1016 if(RSB_UNLIKELY(IM[i]<IL[i]))
1017 {
1018 errval = RSB_ERR_INTERNAL_ERROR;
1019 RSB_PERR_GOTO(err,RSB_ERRM_ES);
1020 }
1021
1022 // TODO: split in two cycles: 0..hm-1, hm..nr-1
1023 if(i<hm)
1024 ul += IM[i ]-IL[i],
1025 ur += IL[i+1]-IM[i];
1026 else
1027 ll += IM[i ]-IL[i],
1028 lr += IL[i+1]-IM[i];
1029 //RSB_INFO("%d @ %d~%d (%d/%d)\n",i,dnnz,dnnz+rnnz-1,hrnnz,rnnz);
1030 dnnz += rnnz;
1031 }
1032 IM[m] = IL[m];
1033 }
1034 else
1035 {
1036 // compute middle pointers array, using the left and right ones
1037 RSB_ASSERT(ulp);
1038 RSB_ASSERT(llp);
1039 RSB_ASSERT(urp);
1040 RSB_ASSERT(lrp);
1041 nnz0 = IL[0];
1042 /* fill the middle row pointers array */
1043 for(i=0;RSB_LIKELY(i<m);++i)
1044 {
1045 // delimit the current row
1046 rsb_nnz_idx_t il = IL[i],ir = IR[i],im;
1047 rnnz = ir-il;
1048
1049 RSB_C2R_ASSERT(ir>=il);
1050 if(ir<il)
1051 {
1052 errval = RSB_ERR_INTERNAL_ERROR;
1053 RSB_PERR_GOTO(err,RSB_ERRM_ES);
1054 }
1055 if(ir==il)
1056 {
1057 // empty row
1058 IM[i] = IR[i];
1059 continue;
1060 }
1061 /* i==m-1 || IA[dnnz+rnnz] > i */
1062
1063 // the current row is non empty
1064 RSB_C2R_ASSERT(JA[il+0]>=coff );
1065 RSB_C2R_ASSERT(JA[ir-1]< coff+k);
1066
1067 hrnnz = rsb__nnz_split_coo_bsearch(JA+il,hk,rnnz);
1068 im = il+hrnnz;
1069
1070 IM[i] = im;
1071
1072 #if RSB_C2R_PARANOIA
1073 if(IM[i]>IR[i])
1074 {
1075 errval = RSB_ERR_INTERNAL_ERROR;
1076 RSB_PERR_GOTO(err,"i=%d, %d > %d!\n",i,IM[i],IR[i]);
1077 }
1078
1079 if(IM[i]<IL[i])
1080 {
1081 errval = RSB_ERR_INTERNAL_ERROR;
1082 RSB_PERR_GOTO(err,"i=%d, %d < %d!\n",i,IM[i],IL[i]);
1083 }
1084
1085 #endif /* RSB_C2R_PARANOIA */
1086 // TODO: split in two cycles: 0..hm-1, hm..nr-1
1087 if(i<hm)
1088 ul += im-il,
1089 ur += ir-im;
1090 else
1091 ll += im-il,
1092 lr += ir-im;
1093 //RSB_INFO("%d @ %d~%d (%d/%d)\n",i,dnnz,dnnz+rnnz-1,hrnnz,rnnz);
1094 dnnz += rnnz;
1095 }
1096 IM[m] = IL[m];
1097 }
1098
1099 if(RSB_C2R_PARANOIA)
1100 {
1101 rsb_coo_idx_t i;
1102 rsb_nnz_idx_t lnz = 0,rnz = 0,tnz = 0;
1103 if(IR==NULL)
1104 IR = IL+1;
1105
1106 /* if(IL[m]!=nnz0+nnz)
1107 {
1108 RSB_DO_ERROR_CUMULATE(errval,RSB_ERR_INTERNAL_ERROR);
1109 RSB_PERR_GOTO(err,RSB_ERRM_ES);
1110 }*/
1111
1112 for(i=0;RSB_LIKELY(i<m);++i)
1113 {
1114 lnz += IM[i]-IL[i];
1115 rnz += IR[i]-IM[i];
1116 tnz += IR[i]-IL[i];
1117
1118 if(RSB_UNLIKELY(IM[i]<IL[i] || IL[i]>IL[i+1]))
1119 {
1120 RSB_DO_ERROR_CUMULATE(errval,RSB_ERR_INTERNAL_ERROR);
1121 RSB_PERR_GOTO(err,RSB_ERRM_ES);
1122 }
1123 }
1124 if(ul+ll!=lnz || ur+lr != rnz)
1125 {
1126 RSB_DO_ERROR_CUMULATE(errval,RSB_ERR_INTERNAL_ERROR);
1127 RSB_PERR_GOTO(err,RSB_ERRM_ES);
1128 }
1129 if(tnz!=nnz || (rnz+lnz)!=nnz)
1130 {
1131 RSB_DO_ERROR_CUMULATE(errval,RSB_ERR_INTERNAL_ERROR);
1132 RSB_PERR_GOTO(err,"tnz:%d, nnz:%d, rnz:%d, lnz:%d\n",tnz,nnz,rnz,lnz);
1133 }
1134 }
1135 *llp = ll;
1136 *lrp = lr;
1137 *ulp = ul;
1138 *urp = ur;
1139 err:
1140 RSB_DO_ERR_RETURN(errval)
1141 }
1142 #endif /* RSB_WANT_PARALLEL_SUBDIVISION */
1143
rsb_do_compute_vertical_split_parallel(const rsb_coo_idx_t * RSB_RESTRICT IA,const rsb_coo_idx_t * RSB_RESTRICT JA,rsb_coo_idx_t roff,rsb_coo_idx_t coff,rsb_coo_idx_t m,rsb_coo_idx_t k,rsb_coo_idx_t hm,rsb_coo_idx_t hk,rsb_nnz_idx_t nnz,rsb_coo_idx_t * IL,rsb_coo_idx_t * RSB_RESTRICT IM,rsb_coo_idx_t * IR,rsb_nnz_idx_t * ulp,rsb_nnz_idx_t * urp,rsb_nnz_idx_t * llp,rsb_nnz_idx_t * lrp)1144 static rsb_err_t rsb_do_compute_vertical_split_parallel(const rsb_coo_idx_t * RSB_RESTRICT IA, const rsb_coo_idx_t * RSB_RESTRICT JA, rsb_coo_idx_t roff, rsb_coo_idx_t coff, rsb_coo_idx_t m, rsb_coo_idx_t k, rsb_coo_idx_t hm, rsb_coo_idx_t hk, rsb_nnz_idx_t nnz, rsb_coo_idx_t * IL, rsb_coo_idx_t * RSB_RESTRICT IM, rsb_coo_idx_t * IR, rsb_nnz_idx_t *ulp, rsb_nnz_idx_t *urp, rsb_nnz_idx_t *llp, rsb_nnz_idx_t *lrp)
1145 {
1146
1147 /**
1148 Binary search for the boundaries of each row.
1149 Assign threads to rows intervals.
1150 Perform the row pointers vector fill calling rsb_do_compute_vertical_split.
1151 */
1152 #if 0
1153 return rsb_do_compute_vertical_split(IA,JA,roff,coff,m,k,hm,hk,nnz,IL,IM,IR,ulp,urp,llp,lrp);
1154 #else
1155 rsb_err_t errval = RSB_ERR_NO_ERROR;
1156 /* const rsb_thread_t wet = rsb_get_num_threads(); */
1157
1158 if(m<1)
1159 return RSB_ERR_NO_ERROR;/* TODO: limit case */
1160 IL[0] = 0;
1161 if(m==1)
1162 goto after;
1163
1164 #pragma omp parallel RSB_NTC
1165 {
1166 #if RSB_WANT_OMP_RECURSIVE_KERNELS
1167 const rsb_thread_t tn = /*wet*/ omp_get_num_threads(), tnn = RSB_MIN(tn,m);
1168 const rsb_thread_t th_id = omp_get_thread_num();
1169 #else /* RSB_WANT_OMP_RECURSIVE_KERNELS */
1170 const rsb_thread_t tn = 1, tnn = RSB_MIN(tn,m);
1171 const rsb_thread_t th_id = 0;
1172 #endif /* RSB_WANT_OMP_RECURSIVE_KERNELS */
1173 const rsb_coo_idx_t mm = ((m+tnn-1)/tnn),m0 = mm*th_id,m1 = RSB_MIN(m0+mm,m);
1174 rsb_coo_idx_t i;
1175 rsb_nnz_idx_t nnz0 = 0,nnz1 = nnz;
1176 rsb_nnz_idx_t n,rnnz,fnnz,lnnz;
1177 if(th_id>=m)
1178 goto nowork;
1179 /* binary search for the boundaries of each row */
1180 nnz0 = rsb__nnz_split_coo_bsearch(IA+nnz0,m0,nnz1-nnz0);
1181 nnz1 = nnz0+rsb__nnz_split_coo_bsearch(IA+nnz0,m1,nnz1-nnz0);
1182 /* assign threads to rows intervals */
1183 if(nnz0>=nnz1)
1184 {
1185 //for(i=m0;RSB_LIKELY(i<m1);++i)
1186 // IL[i+1] = nnz0;
1187 RSB_XCOO_VSET(IL,nnz0,m0+1,m1+1);
1188 goto nowork;
1189 }
1190 //RSB_INFO("thread %d rows %d..%d nnz %d..%d\n",th_id,m0,m1,nnz0,nnz1);
1191 /* perform the row pointers vector fill calling rsb_do_compute_vertical_split */
1192 //RSB_DO_ERROR_CUMULATE(errval,rsb_do_compute_vertical_split(IA+nnz0,JA+nnz0,roff+m0,coff,m1-m0,k,hm,hk,nnz1-nnz0,IL+m0,NULL,NULL,ulp,urp,llp,lrp));
1193 fnnz = nnz0;
1194 n = nnz0;
1195 for(i=m0;RSB_LIKELY(i<m1);++i)
1196 {
1197 if((nnz1-nnz0)/(m1-m0)<RSB_WANT_BINSEARCH_MIN_NZPR)
1198 {
1199 rnnz = 0;
1200 for(;RSB_LIKELY(n<nnz1 && IA[n]==i);++n)
1201 ++rnnz;
1202 IL[i+1] = nnz0+rnnz;
1203 nnz0 += rnnz;
1204 }
1205 else
1206 {
1207 /* TODO : should use a smarter strategy than this one */
1208 lnnz = fnnz+rsb__nnz_split_coo_bsearch(IA+fnnz,i+1,nnz1-fnnz);
1209 //RSB_INFO("%d : %d\n",i,lnnz);
1210 IL[i+1] = lnnz;
1211 fnnz = lnnz;
1212 }
1213 }
1214 nowork:
1215 RSB_NULL_STATEMENT_FOR_COMPILER_HAPPINESS
1216 #pragma omp barrier
1217 RSB_NULL_STATEMENT_FOR_COMPILER_HAPPINESS
1218 }
1219 after:
1220 IL[m] = nnz;
1221 //int i; RSB_INFO(":::"); for(i=0;RSB_LIKELY(i<m+1);++i) RSB_INFO("%d ",IL[i]); RSB_INFO("\n");
1222 //RSB_INFO(":::"); for(i=0;RSB_LIKELY(i<nnz);++i) RSB_INFO("%d ",IA[i]); RSB_INFO("\n");
1223 //err:
1224 RSB_DO_ERR_RETURN(errval)
1225 #endif
1226 }
1227
#if 0
/* Disabled: sets up partially-RCSR index arrays (bindx) of a submatrix
 * for later subdivision; kept for reference. */
static rsb_err_t rsb_do_fill_partially_rcsr_arrays_for_later(struct rsb_mtx_t * mtxAp,
	       	const rsb_coo_idx_t * IL, const rsb_coo_idx_t * IR,
		rsb_coo_idx_t * IA, rsb_coo_idx_t * JA,
//		const rsb_coo_idx_t * IA, const rsb_coo_idx_t * JA,
	       	rsb_nnz_idx_t nzoff, rsb_coo_idx_t m, rsb_coo_idx_t roff )
{
	/**
		\ingroup gr_unfinished
	*/
	mtxAp->nzoff = nzoff;
	mtxAp->bindx = IA+nzoff;
	mtxAp->bpntr = NULL;
#if RSB_WANT_FIRST_VERSION
	RSB_COA_MEMMOVE(mtxAp->bindx,IL,0,roff,m+1);
#endif /* RSB_WANT_FIRST_VERSION */
#if RSB_WANT_FIRST_VERSION
	{
		/* sanity checks on the row intervals of the submatrix */
		rsb_nnz_idx_t i;
		for(i=mtxAp->roff;RSB_LIKELY(i<mtxAp->roff+mtxAp->nr);++i)
		{
			rsb_nnz_idx_t nnz1 = IR[i];
			rsb_nnz_idx_t nnz0 = IL[i];
//			RSB_C2R_ASSERT(IL[i-mtxAp->roff]>=IB[i]);
//			RSB_C2R_ASSERT(IL[i-mtxAp->roff]<=IB[i+1]);
//			RSB_C2R_ASSERT(IL[i-mtxAp->roff+1]>=IB[i+1]);
			/* NOTE(review): the next two asserts are tautological
			 * (nnz0==IL[i], nnz1==IR[i]); likely meant IB bounds. */
			RSB_C2R_ASSERT(nnz0>=IL[i]);
			RSB_C2R_ASSERT(nnz1<=IR[i]);
			if(nnz1==nnz0)continue;
//			RSB_C2R_ASSERT(nnz1<=IB[i+1]);
			RSB_C2R_ASSERT(JA[nnz0+0]>=mtxAp->coff);
			RSB_C2R_ASSERT(JA[nnz1-1]< mtxAp->coff+mtxAp->nc);
		}
	}
#endif /* RSB_WANT_FIRST_VERSION */

	return RSB_ERR_NO_ERROR;
}
#endif
1269
#if 0
/* Disabled: sets up full RCSR index arrays (bindx/bpntr) of a submatrix
 * for later subdivision; kept for reference. */
static rsb_err_t rsb_do_fill_rcsr_arrays_for_later(struct rsb_mtx_t * mtxAp,
	       	const rsb_coo_idx_t * IL, const rsb_coo_idx_t * IR,
//		const rsb_coo_idx_t * IA, const rsb_coo_idx_t * JA,
		rsb_coo_idx_t * IA, rsb_coo_idx_t * JA,
	       	rsb_nnz_idx_t nzoff, rsb_coo_idx_t m, rsb_coo_idx_t roff )
{
	/**
		\ingroup gr_unfinished
	*/
	if(!IR)
		IR = IL+1; /* adjacent row pointers act as right delimiters */
	mtxAp->nzoff = nzoff;
	mtxAp->bindx = IA+nzoff;
	mtxAp->bpntr = IA+nzoff+m+1;
#if RSB_C2R_WANT_MAYBE_FASTER
	RSB_COA_MEMCPY_ROWSZ(mtxAp->bindx,IL,0,roff,m+1);
	RSB_COA_MEMCPY_ROWSZ(mtxAp->bpntr,IR,0,roff,m+1);
//	RSB_COA_MEMCPY(mtxAp->bindx,IL,0,roff,m+1);
//	RSB_COA_MEMCPY(mtxAp->bpntr,IR,0,roff,m+1);
//	RSB_COA_MEMCPY_parallel(mtxAp->bindx,IL,0,roff,m+1);
//	RSB_COA_MEMCPY_parallel(mtxAp->bpntr,IR,0,roff,m+1);
#else /* RSB_C2R_WANT_MAYBE_FASTER */
	/* are we sure we need MEMMOVE here ? FIXME */
	RSB_COA_MEMMOVE(mtxAp->bindx,IL,0,roff,m+1);
	RSB_COA_MEMMOVE(mtxAp->bpntr,IR,0,roff,m+1);
#endif /* RSB_C2R_WANT_MAYBE_FASTER */

#if RSB_C2R_PARANOIA
	{
		rsb_nnz_idx_t i;
		for(i=0;i<mtxAp->nr;++i)
		//for(i=mtxAp->roff;i<mtxAp->roff+mtxAp->nr;++i)
		{
			rsb_nnz_idx_t nnz1 = IR[i];
			rsb_nnz_idx_t nnz0 = IL[i];
//			RSB_C2R_ASSERT(IL[i-mtxAp->roff]>=IB[i]);
//			RSB_C2R_ASSERT(IL[i-mtxAp->roff]<=IB[i+1]);
//			RSB_C2R_ASSERT(IL[i-mtxAp->roff+1]>=IB[i+1]);
			/* NOTE(review): tautological asserts (nnz0==IL[i], nnz1==IR[i]) */
			RSB_C2R_ASSERT(nnz0>=IL[i]);
			RSB_C2R_ASSERT(nnz1<=IR[i]);
			if(nnz1==nnz0)continue;
//			RSB_C2R_ASSERT(nnz1<=IB[i+1]);
			RSB_C2R_ASSERT(JA[nnz0+0]>=mtxAp->coff);
			RSB_C2R_ASSERT(JA[nnz1-1]< mtxAp->coff+mtxAp->nc);
		}
	}
#endif /* RSB_C2R_PARANOIA */
	return RSB_ERR_NO_ERROR;
}
#endif
1321
rsb_do_fill_early_leaf_matrix(struct rsb_mtx_t * mtxAp,struct rsb_mtx_t * submatrix,const rsb_coo_idx_t * IL,const rsb_coo_idx_t * IR,rsb_coo_idx_t * IA,rsb_coo_idx_t * JA,const rsb_coo_idx_t * VA,rsb_nnz_idx_t snzoff,rsb_nnz_idx_t nnz,rsb_coo_idx_t m,rsb_coo_idx_t k,rsb_coo_idx_t roff,rsb_coo_idx_t coff,rsb_type_t typecode,rsb_flags_t flags)1322 static rsb_err_t rsb_do_fill_early_leaf_matrix( struct rsb_mtx_t * mtxAp, struct rsb_mtx_t * submatrix,
1323 const rsb_coo_idx_t * IL, const rsb_coo_idx_t * IR,
1324 rsb_coo_idx_t * IA, rsb_coo_idx_t * JA, const rsb_coo_idx_t * VA,
1325 //const rsb_coo_idx_t * IA, const rsb_coo_idx_t * JA, const rsb_coo_idx_t * VA,
1326 rsb_nnz_idx_t snzoff, rsb_nnz_idx_t nnz, rsb_coo_idx_t m, rsb_coo_idx_t k,
1327 rsb_coo_idx_t roff, rsb_coo_idx_t coff, rsb_type_t typecode, rsb_flags_t flags )
1328 {
1329 /**
1330 \ingroup gr_unfinished
1331 */
1332 rsb_err_t errval = RSB_ERR_NO_ERROR;
1333
1334 // if(!IR)
1335 // IR = IL+1;
1336
1337 #if 0
1338 /* 20131206 nowadays IR and IL are always NULL */
1339 if(!RSB_DO_TOOFEWNNZFORRCSR(nnz,m) && IR && IL)
1340 {
1341 // the matrix could be split further: we fill it with info to continue, if necessary
1342 RSB_DO_ERROR_CUMULATE(errval,rsb_do_fill_rcsr_arrays_for_later(submatrix,IL,IR,IA,JA,snzoff,m,roff));
1343 }
1344 else
1345 if(!RSB_DO_TOOFEWNNZFORCSR(nnz,m) && IR && IL)
1346 {
1347 RSB_DO_ERROR_CUMULATE(errval,rsb_do_fill_partially_rcsr_arrays_for_later(submatrix,IL,IR,IA,JA,snzoff,m,roff));
1348 //RSB_ERROR("nnz=%d ,m=%d ! what shall we do ?\n",nnz,m);
1349 RSB_DO_FLAG_DEL(submatrix->flags,RSB_FLAG_WANT_BCSS_STORAGE);
1350 }
1351 else
1352 #endif
1353 {
1354 if(RSB_C2R_IF_VERBOSE)
1355 RSB_INFO("building a very sparse recursive matrix\n");
1356
1357 /* no hope for CSR : however, full/half word COO will fit */
1358 submatrix->nzoff = snzoff;
1359 submatrix->bindx = NULL;
1360 submatrix->bpntr = NULL;
1361 RSB_DO_FLAG_DEL(submatrix->flags,RSB_FLAG_WANT_BCSS_STORAGE);
1362 //RSB_ERROR("nnz=%d ,m=%d ! what shall we do ?\n",nnz,m);
1363 }
1364 mtxAp->sm[roff?(coff?3:2):(coff?1:0)] = submatrix;
1365 submatrix->roff = roff+mtxAp->roff;
1366 submatrix->coff = coff+mtxAp->coff;
1367 RSB_DO_ERROR_CUMULATE(errval,rsb__set_init_flags_and_stuff(submatrix,NULL,NULL,m,k,nnz,nnz,nnz,typecode,flags));
1368 //err:
1369 RSB_DO_ERR_RETURN(errval)
1370 }
1371
rsb_compar_rcsr_matrix_leftmost_first(const void * ap,const void * bp)1372 int rsb_compar_rcsr_matrix_leftmost_first(const void * ap, const void * bp)
1373 {
1374 /**
1375 \ingroup gr_internals
1376 A compare function to be used with qsort.
1377 TODO:RENAME: rsb_compar_rcsr_matrix_leftmost_first -> ?
1378 Orders first by column, then by row.
1379 */
1380 struct rsb_mtx_t *a = *(struct rsb_mtx_t **)ap;
1381 struct rsb_mtx_t *b = *(struct rsb_mtx_t **)bp;
1382 int ss = 1; /* should swap results ? */
1383 rsb_bool_t at;
1384 rsb_bool_t bt;
1385
1386 at=!RSB_DO_FLAG_HAS(a->flags,RSB_FLAG_QUAD_PARTITIONING);
1387 bt=!RSB_DO_FLAG_HAS(b->flags,RSB_FLAG_QUAD_PARTITIONING);
1388
1389 if(at && !bt)
1390 return 1;
1391 if(!at && bt)
1392 return -1;
1393
1394 if(a->coff < b->coff)
1395 {
1396 RSB_SWAP(struct rsb_mtx_t *,a,b);
1397 ss = -1;/* should swap results ! */
1398 }
1399
1400 return (a->coff==b->coff)?(a->roff>b->roff?1:(a->roff<b->roff?-1:0)):1*ss;
1401 }
1402
#if 0
/* 20121001 unfinished code: commented */
static struct rsb_mtx_t * rsb_do_find_ffmltart(struct rsb_mtx_t ** submatricesp, rsb_submatrix_idx_t smn, struct rsb_mtx_t * submatrix, rsb_coo_idx_t off)
{
	/**
		\ingroup gr_unfinished
		Scans the smn submatrices for the first one covering row `off`,
		first to the right of `submatrix`, then to its left one line below.
		Returns NULL when none matches.
	*/
	rsb_submatrix_idx_t smi = 0;
	rsb_coo_idx_t coff = submatrix->coff;
	rsb_coo_idx_t roff = submatrix->roff; /* NOTE(review): unused below */
	rsb_coo_idx_t m = submatrix->nr;      /* NOTE(review): unused below */
	rsb_coo_idx_t k = submatrix->nc;
	/* leftmost from right */
	for(smi=0;smi<smn;++smi)
		if(submatricesp[smi]->coff>=coff+k &&
			submatricesp[smi]->roff<=off+0 && submatricesp[smi]->roff+submatricesp[smi]->nr>off)
			return submatricesp[smi];
	/* leftmost from left, the line after */
	for(smi=0;smi<smn;++smi)
		if(submatricesp[smi]->coff<coff &&
			submatricesp[smi]->roff<=off+1 && submatricesp[smi]->roff+submatricesp[smi]->nr>off)
			return submatricesp[smi];
	return NULL;
}
#endif
1428
rsb__should_recursively_partition_matrix(rsb_coo_idx_t mB,rsb_coo_idx_t kB,rsb_coo_idx_t m,rsb_coo_idx_t k,rsb_nnz_idx_t element_count,rsb_nnz_idx_t block_count,rsb_nnz_idx_t nnz,rsb_blk_idx_t Mdim,rsb_blk_idx_t mdim,rsb_coo_idx_t roff,rsb_coo_idx_t coff,rsb_flags_t flags,size_t el_size,rsb_thread_t wet)1429 static rsb_bool_t rsb__should_recursively_partition_matrix(
1430 rsb_coo_idx_t mB, rsb_coo_idx_t kB,
1431 rsb_coo_idx_t m, rsb_coo_idx_t k,
1432 rsb_nnz_idx_t element_count,
1433 rsb_nnz_idx_t block_count,
1434 rsb_nnz_idx_t nnz,
1435 rsb_blk_idx_t Mdim,
1436 rsb_blk_idx_t mdim,
1437 rsb_coo_idx_t roff,
1438 rsb_coo_idx_t coff,
1439 rsb_flags_t flags,
1440 size_t el_size,
1441 rsb_thread_t wet
1442 )
1443 {
1444 #if (RSB_EXPERIMENTAL_QUAD_DIVISION_POLICY == RSB_EXPERIMENTAL_QUAD_DIVISION_POLICY_NAIVE)
1445 /*
1446 \ingroup gr_internals
1447 NEW: UNFINISHED
1448 TODO : ERROR HANDLING, DOCS
1449 TODO : should partition on high nnz per row count
1450 although in this case if the matrix is highly
1451 compact (almost banded) it won't help.
1452 Therefore, some sparseness statistics in the matrix constructor
1453 would be nice.
1454 */
1455 //long cs = rsb__get_lastlevel_c_size();
1456 //long cs = rsb__get_lastlevel_c_size_per_thread();
1457 long cs=(rsb__get_lastlevel_c_size()/(wet>0?wet:rsb_get_num_threads()));
1458 rsb_bool_t sp = RSB_BOOL_FALSE; /* should partition */
1459 rsb_fillin_t efillin=1.0; /* FIXME */
1460 size_t smab=0; /* spmv memory accessed bytes */
1461 //cs/=20000;
1462 //cs/=20;
1463 //cs/=4;
1464 /* FIXME */
1465 if(nnz<RSB_RECURSION_MIN_NNZ || m<RSB_RECURSION_MIN_DIM || k<RSB_RECURSION_MIN_DIM || !RSB_DO_FLAG_HAS(flags,RSB_FLAG_QUAD_PARTITIONING))
1466 {
1467 sp = RSB_BOOL_FALSE;
1468 goto done;
1469 }
1470
1471 if(kB<1)kB=1;
1472 if(mB<1)mB=1;
1473
1474 #if 0
1475 if(flags & RSB_FLAG_RECURSIVE_DOUBLE_DETECTED_CACHE)
1476 cs*=2;
1477
1478 if(flags & RSB_FLAG_RECURSIVE_HALF_DETECTED_CACHE)
1479 cs/=2;
1480 #endif
1481
1482 if( (flags & RSB_FLAG_RECURSIVE_SUBDIVIDE_MORE_ON_DIAG) && roff == coff )
1483 cs/=2;
1484 /* this will imply a more fine grained subdivision on the diagonal
1485 * FIXME : we could use a factor different than 2 !
1486 * */
1487
1488
1489 /* subdivide at least until matrix indices can be compressed */
1490 if((flags & RSB_FLAG_USE_HALFWORD_INDICES_CSR) && m>1 && k>1 &&
1491 // nnz>1
1492 // nnz>(cs/4)
1493 nnz*el_size>2*cs
1494 && !rsb__do_is_candidate_size_for_halfword_csr(m,k,nnz,flags))
1495 return RSB_BOOL_TRUE;
1496 if((flags & RSB_FLAG_USE_HALFWORD_INDICES_COO) && m>1 && k>1 &&
1497 // nnz>1
1498 // nnz>(cs/4)
1499 nnz*el_size>2*cs
1500 && !rsb__do_is_candidate_size_for_halfword_coo(m,k,flags))
1501 return RSB_BOOL_TRUE;
1502
1503 if(cs>0)
1504 {
1505 smab = rsb_spmv_memory_accessed_bytes_(mB,kB,m,k,efillin*nnz,((efillin*nnz)/mB)/kB,m/mB,el_size);
1506
1507 if( 2*smab > 3*3*cs ) /* FIXME : overflow possible */
1508 sp=1;
1509 else
1510 if(
1511 /* FIXME! */
1512 (((Mdim+mdim+m+k)*sizeof(rsb_coo_idx_t))
1513 /(nnz*el_size)) > 8*cs
1514 )
1515 sp = RSB_BOOL_TRUE;
1516 else
1517 sp = RSB_BOOL_FALSE;
1518 }
1519 else
1520 {
1521 /* no cache info (FIXME: there should be no section like this one) */
1522 if(
1523 Mdim<8 || mdim<8 || m < 500
1524 || k < 500 || nnz < 200*100)
1525 sp = RSB_BOOL_FALSE;
1526 else
1527 sp = RSB_BOOL_TRUE;
1528 }
1529 #ifdef RSB_EXPERIMENTAL_ROWS_SUBDIVIDE_TO_CORES_NUM
1530 /* STILL UNIMPLEMENTED */
1531 #endif /* RSB_EXPERIMENTAL_ROWS_SUBDIVIDE_TO_CORES_NUM */
1532 #if RSB_EXPERIMENTAL_NO_SUBDIVIDE_ON_MIN_NNZ_PER_ROW_OR_COLUMN
1533 if(1)
1534 {
1535 rsb_nnz_idx_t nnzpr;
1536 if( (flags&RSB_FLAG_WANT_COLUMN_MAJOR_ORDER) != 0 )
1537 nnzpr=nnz/k;
1538 else
1539 nnzpr=nnz/m;
1540
1541 if( nnzpr < RSB_CONST_MIN_NNZ_PER_ROW_OR_COLUMN_PER_SUBMATRIX )
1542 sp = RSB_BOOL_FALSE;
1543 }
1544 #endif /* RSB_EXPERIMENTAL_NO_SUBDIVIDE_ON_MIN_NNZ_PER_ROW_OR_COLUMN */
1545 done:
1546 return sp;
1547 #else /* (RSB_EXPERIMENTAL_QUAD_DIVISION_POLICY == RSB_EXPERIMENTAL_QUAD_DIVISION_POLICY_NAIVE) */
1548 #error "should use a RSB_EXPERIMENTAL_QUAD_DIVISION_POLICY_NAIVE partitioning policy!"
1549 return RSB_BOOL_FALSE;
1550 #endif /* (RSB_EXPERIMENTAL_QUAD_DIVISION_POLICY == RSB_EXPERIMENTAL_QUAD_DIVISION_POLICY_NAIVE) */
1551 }
1552
rsb_do_copy_submatrix_coa(struct rsb_mtx_t * submatrix,void * VA,void * WA,rsb_coo_idx_t * IL,rsb_coo_idx_t * IR,size_t el_size,rsb_coo_idx_t n0,rsb_coo_idx_t i0,rsb_coo_idx_t m0)1553 static rsb_nnz_idx_t rsb_do_copy_submatrix_coa(struct rsb_mtx_t * submatrix, void * VA, void * WA, rsb_coo_idx_t * IL, rsb_coo_idx_t * IR, size_t el_size, rsb_coo_idx_t n0, rsb_coo_idx_t i0, rsb_coo_idx_t m0)
1554 {
1555 rsb_nnz_idx_t i,n;
1556 RSB_C2R_ASSERT(submatrix);
1557 RSB_C2R_ASSERT(VA);
1558 RSB_C2R_ASSERT(WA);
1559 RSB_C2R_ASSERT(IL);
1560 RSB_C2R_ASSERT(IR);
1561 RSB_C2R_ASSERT(el_size>0);
1562 RSB_C2R_ASSERT(n0>=0);
1563 RSB_C2R_ASSERT(i0<m0);
1564 submatrix->VA=((char*)VA)+el_size*submatrix->nzoff;
1565 for(n=n0,i=i0;RSB_LIKELY(i<m0);n+=IR[i]-IL[i],++i)
1566 {
1567 RSB_C2R_ASSERT(n>=0);
1568 RSB_A_MEMCPY_SMALL(WA,VA,submatrix->nzoff+n,IL[i],IR[i]-IL[i],el_size);
1569 }
1570 return n;
1571 }
1572
1573
static rsb_submatrix_idx_t rsb_do_pick_largest_open_matrix(struct rsb_mtx_t ** submatricesp, rsb_submatrix_idx_t smc)
{
	/*
		Linear scan of the smc open submatrices for the one with the
		largest nonzero count; returns its index, or
		RSB_SUBM_IDX_MARKER if none qualifies.
		FIXME: NEW, UNFINISHED
		need a lock, too
		TODO:need a priority queue, here
	*/
	rsb_submatrix_idx_t smi = 0,msmi = RSB_SUBM_IDX_MARKER;
	rsb_nnz_idx_t maxnz = 0;
	if(RSB_WANT_VERBOSE_SUBDIVISION)
		if(smc==0)
			RSB_INFO("warning: no largest open matrix among 0 matrices\n");
	for(smi=0;smi<smc;++smi)
	{
		//RSB_INFO("looking %d : %d\n",smi,submatricesp[smi]->nnz);
		/* FIXME: ">=" here is used to cope with diagonal implicit matrices (which could have nnz==0), too */
		if(submatricesp[smi])
			if(submatricesp[smi]->nnz>=maxnz)
			{
				maxnz = submatricesp[smi]->nnz;
				msmi = smi;
			}
	}
	return msmi;
}
1599
rsb_do_coo2rec_subdivide_parallel(void * VA,rsb_coo_idx_t * IA,rsb_coo_idx_t * JA,rsb_coo_idx_t m,rsb_coo_idx_t k,rsb_nnz_idx_t nnz,rsb_type_t typecode,const struct rsb_mtx_partitioning_info_t * pinfop,rsb_flags_t flags,rsb_err_t * errvalp,struct rsb_mtx_t ** submatricesp,struct rsb_mtx_t * mtxAp,const rsb_nnz_idx_t * IB,const rsb_nnz_idx_t * IX,rsb_coo_idx_t * IT,rsb_coo_idx_t * WA,rsb_submatrix_idx_t cmc,rsb_submatrix_idx_t omc,rsb_submatrix_idx_t tmc,rsb_thread_t wet,rsb_submatrix_idx_t * cmcp)1600 static rsb_err_t rsb_do_coo2rec_subdivide_parallel(void *VA, rsb_coo_idx_t * IA, rsb_coo_idx_t * JA, rsb_coo_idx_t m, rsb_coo_idx_t k, rsb_nnz_idx_t nnz, rsb_type_t typecode, const struct rsb_mtx_partitioning_info_t * pinfop, rsb_flags_t flags, rsb_err_t *errvalp, struct rsb_mtx_t ** submatricesp, struct rsb_mtx_t * mtxAp, const rsb_nnz_idx_t * IB, const rsb_nnz_idx_t * IX, rsb_coo_idx_t * IT, rsb_coo_idx_t * WA, rsb_submatrix_idx_t cmc, rsb_submatrix_idx_t omc, rsb_submatrix_idx_t tmc, rsb_thread_t wet, rsb_submatrix_idx_t *cmcp)
1601 {
1602 /*
1603 TODO: clean this up.
1604 Note that rsb__set_num_threads is outlawed here.
1605 */
1606 rsb_err_t errval = RSB_ERR_NO_ERROR;
1607 size_t el_size = RSB_SIZEOF(typecode);
1608 const rsb_nnz_idx_t ttlnz = nnz; /* total nnz */
1609 rsb_nnz_idx_t maxnz = nnz; /* max encountered nnz for a leaf */
1610 rsb_submatrix_idx_t stmc = RSB_MIN(tmc,wet); /* submatrices total count */
1611 rsb_submatrix_idx_t lmc = 1; /* leaf matrix count */
1612 rsb_time_t cpt = RSB_TIME_ZERO,dt = RSB_TIME_ZERO; /* cpt overwrites mtxAp->cpt */
1613 rsb_thread_t tn = rsb_get_num_threads(); /* threads number */
1614 rsb_thread_t mtn = RSB_MAX(1,(rsb_thread_t)(rsb_global_session_handle.subdivision_multiplier*tn));
1615 rsb_thread_t tnn = 1; /* threads number */
1616 rsb_float_t skew = ((rsb_float_t)(maxnz))/(nnz/wet); /* if more than one, will limit scaling */
1617 long cbs = rsb__get_cache_block_byte_size();
1618
1619 if(RSB_WANT_VERBOSE_SUBDIVISION)
1620 RSB_INFO("serial substage subdivision of "),RSB_INFO_MATRIX_SUMMARY(mtxAp),RSB_INFO("\n");
1621 again:
1622 #pragma omp parallel reduction(|:errval) shared(submatricesp) num_threads(tnn)
1623 {
1624 rsb_submatrix_idx_t smi = 0;
1625 struct rsb_mtx_t * submatrix = NULL;
1626 #if RSB_WANT_OMP_RECURSIVE_KERNELS
1627 rsb_thread_t th_id = omp_get_thread_num();
1628 #else /* RSB_WANT_OMP_RECURSIVE_KERNELS */
1629 rsb_thread_t th_id = 0;
1630 #endif /* RSB_WANT_OMP_RECURSIVE_KERNELS */
1631 #if RSB_WANT_VERBOSE_SUBDIVISION2
1632 if(th_id==0){
1633 RSB_MTXASM_INFO("entering %d threads in subdivision phase\n",tnn);
1634 RSB_MTXASM_INFO("entering %d threads in subdivision phase\n",tnn);}
1635 #endif /* RSB_WANT_VERBOSE_SUBDIVISION2 */
1636 iagain:
1637 #pragma omp critical (rsb_coo2rsbsub_crs)
1638 {
1639 smi = rsb_do_pick_largest_open_matrix(submatricesp+cmc,omc);
1640 if(smi != RSB_SUBM_IDX_MARKER)
1641 {
1642 smi += cmc;
1643 submatrix = submatricesp[smi];
1644 maxnz = submatrix->nnz;
1645 /* RSB_ASSERT(nnz>=wet); */
1646 skew = ((rsb_float_t)(maxnz))/((rsb_float_t)(nnz/wet));
1647 RSB_ASSERT(!isinf(skew));
1648 omc--;
1649 if(smi!=cmc)
1650 {
1651 #if 0
1652 assert(submatricesp[smi]);
1653 assert(submatricesp[cmc]);
1654 #endif
1655 RSB_SWAP(struct rsb_mtx_t *,submatricesp[smi],submatricesp[cmc]);
1656 }
1657 ++cmc;
1658 }
1659 else
1660 {
1661 submatrix = NULL;
1662 }
1663 if(RSB_WANT_VERBOSE_SUBDIVISION)
1664 {
1665 if(submatrix)
1666 RSB_INFO("subdividing "),RSB_INFO_MATRIX_SUMMARY(submatrix),RSB_INFO(" (open:%d,closed:%d) for thread %d\n",omc,cmc,th_id);
1667 else
1668 RSB_INFO("no available submatrix (open:%d,closed:%d) for thread %d/%d\n",omc,cmc,th_id,tnn);
1669 }
1670 }
1671 if((smi)!=RSB_SUBM_IDX_MARKER)
1672 {
1673 #if 0
1674 for(smi=0;RSB_LIKELY(omc>0);smi=cmc+omc-1)
1675 while(submatrix)
1676 if(omc>0 && ((submatrix=submatricesp[smi])!=NULL))
1677 #endif
1678 {
1679 rsb_coo_idx_t k = submatrix->nc;
1680 rsb_coo_idx_t m = submatrix->nr;
1681 rsb_coo_idx_t hk = RSB_MIDDLE(k);
1682 rsb_coo_idx_t hm = RSB_MIDDLE(m);
1683 rsb_nnz_idx_t ul = 0,ur = 0,ll = 0,lr = 0;
1684 rsb_nnz_idx_t nnz = submatrix->nnz;
1685 rsb_coo_idx_t roff = submatrix->roff;
1686 rsb_coo_idx_t coff = submatrix->coff;
1687 rsb_flags_t flags = submatrix->flags;
1688 rsb_nnz_idx_t nzoff = submatrix->nzoff;
1689 rsb_bool_t sqs = RSB_BOOL_FALSE; /* should quad subdivide */
1690 rsb_submatrix_idx_t smc = 0; /* submatrices count */
1691
1692 if(RSB_C2R_IF_VERBOSE)
1693 RSB_INFO("cmc:%d omc:%d smi:%d tmc=%d stmc=%d th_id=%d\n",cmc,omc,smi,tmc,stmc,th_id);
1694
1695 /* too few nonzeros for recursion (TODO: may change in the future) */
1696 if(RSB_DO_TOOFEWNNZFORRCSR(nnz,m))
1697 #if RSB_WANT_SUBDIVISION_FIXES_20101120
1698 if(!RSB_DO_FLAG_HAS(flags,RSB_FLAG_WANT_COO_STORAGE))
1699 #endif /* RSB_WANT_SUBDIVISION_FIXES_20101120 */
1700 {
1701 if(RSB_C2R_IF_VERBOSE)
1702 RSB_INFO("matrix too sparse for RCSR: rejoining\n");
1703 sqs = RSB_BOOL_FALSE;
1704 goto nosqstest;
1705 }
1706
1707 /* decide if the matrix is worth subdividing further (soft) */
1708 sqs = rsb__should_recursively_partition_matrix(0,0,m,k,0,0,nnz,m,k,roff,coff,flags,el_size,mtn);
1709 #if RSB_WANT_SUBDIVISION_FIXES_20101120
1710 if(nnz<RSB_RECURSION_MIN_NNZ || m<RSB_RECURSION_MIN_DIM || k<RSB_RECURSION_MIN_DIM || !RSB_DO_FLAG_HAS(flags,RSB_FLAG_QUAD_PARTITIONING))
1711 {
1712 sqs = RSB_BOOL_FALSE; /* a hard condition */
1713 goto nosqstest;
1714 }
1715 else
1716 if(cmc+omc<tmc)
1717 if(skew>RSB_SUBDIVISION_SKEW_MAX)
1718 sqs = RSB_BOOL_TRUE; /* a soft condition */
1719 #endif /* RSB_WANT_SUBDIVISION_FIXES_20101120 */
1720
1721 if(!sqs)
1722 if(RSB_DO_FLAG_HAS(flags,RSB_FLAG_RECURSIVE_MORE_LEAVES_THAN_THREADS))
1723 if(wet>lmc)
1724 sqs = RSB_BOOL_TRUE;
1725
1726 if(sqs)
1727 {
1728 rsb_bool_t awfcsr = RSB_BOOL_FALSE; /* all of the matrices will fit csr ? */
1729 #if RSB_WANT_SUBDIVISION_FIXES_20101120
1730 rsb_nnz_idx_t mqnnz = RSB_MAX(RSB_MAX(ul,ur),RSB_MAX(lr,ll));
1731 #endif /* RSB_WANT_SUBDIVISION_FIXES_20101120 */
1732
1733 /* compute the split vector */
1734 dt = - rsb_time();
1735 if((errval = rsb_do_compute_vertical_split_search_only(IA,JA,roff,coff,m,k,hm,hk,nnz,IB,&ul,&ur,&ll,&lr))!=RSB_ERR_NO_ERROR)
1736 ;/* goto err; */
1737 dt += rsb_time();
1738 cpt += dt;
1739 RSB_C2R_ASSERT(IR);
1740 awfcsr = ( (ul>0 && RSB_DO_TOOFEWNNZFORCSR(ul,hm)) || (ur>0 && RSB_DO_TOOFEWNNZFORCSR(ur,hm)) || (lr>0 && RSB_DO_TOOFEWNNZFORCSR(lr,m-hm)) || (ll>0 && RSB_DO_TOOFEWNNZFORCSR(ll,m-hm)))?RSB_BOOL_TRUE:RSB_BOOL_FALSE;
1741
1742 if(awfcsr) /* FIXME: misleading naming ! */
1743 {
1744 /* if some leaf won't fit in CSR, we don't split anymore */
1745 if(RSB_DO_FLAG_HAS(flags,RSB_FLAG_WANT_COO_STORAGE))
1746 sqs = RSB_BOOL_TRUE;
1747 else
1748 sqs = RSB_BOOL_FALSE;
1749 if(RSB_C2R_IF_VERBOSE)
1750 RSB_INFO("no space for conversion of some leaf: rejoining ? %d\n",!sqs);
1751 }
1752
1753 #if RSB_WANT_SUBDIVISION_FIXES_20101120
1754 /* an alternative would be to place this test in the branch above*/
1755 if( mqnnz>RSB_MAX_QUADRANTS_UNBALANCE*(nnz-mqnnz) &&
1756 el_size*nnz<cbs &&
1757 nnz < (ttlnz/wet) )
1758 sqs = RSB_BOOL_FALSE;
1759 #endif /* RSB_WANT_SUBDIVISION_FIXES_20101120 */
1760
1761 /* how many submatrices out of four ? */
1762 smc = (ul?1:0)+(ur?1:0)+(ll?1:0)+(lr?1:0);
1763 if(cmc+omc+smc>tmc)
1764 {
1765 if(RSB_C2R_IF_VERBOSE)
1766 RSB_INFO("too many submatrices (%d+%d>%d: rejoining\n",cmc+omc,smc,tmc);
1767 sqs = RSB_BOOL_FALSE;
1768 goto nosqstest;
1769 }
1770
1771 #if !RSB_WANT_SUBDIVISION_FIXES_20101120
1772 #ifdef RSB_FLAG_EXPERIMENTAL_NO_MICRO_LEAVES
1773 if(RSB_DO_FLAG_HAS(flags,RSB_FLAG_EXPERIMENTAL_NO_MICRO_LEAVES))
1774 if(wet<lmc-1)
1775 {
1776 if(RSB_C2R_IF_VERBOSE)
1777 RSB_INFO("RSB_FLAG_EXPERIMENTAL_NO_MICRO_LEAVES: rejoining\n");
1778 sqs = RSB_BOOL_FALSE;
1779 }
1780 #endif /* RSB_FLAG_EXPERIMENTAL_NO_MICRO_LEAVES */
1781 #endif /* RSB_WANT_SUBDIVISION_FIXES_20101120 */
1782
1783 if(RSB_C2R_IF_VERBOSE)
1784 RSB_ERROR("splitting %d/%d -> %d/%d %d/%d %d/%d %d/%d sqs? %d\n",nnz,m,ul,hm,ur,hm,ll,m-hm,lr,m-hm,sqs);
1785 if(ul+ur+ll+lr != nnz)
1786 {
1787 if(RSB_C2R_IF_VERBOSE)
1788 RSB_ERROR("%d ?= %d + %d + %d + %d = %d\n",nnz,ul,ur,ll,lr,ul+ur+ll+lr);
1789 RSB_DO_ERROR_CUMULATE(errval,RSB_ERR_INTERNAL_ERROR);
1790 }
1791 }
1792 nosqstest:
1793 if(sqs)
1794 {
1795 /* should quad-subdivide. let's take care of indices. */
1796 rsb_nnz_idx_t snzoff = nzoff;
1797 rsb_submatrix_idx_t smci = 0;
1798 rsb_submatrix_idx_t smco = 0;
1799 struct rsb_mtx_t*isms[4] = {NULL,NULL,NULL,NULL};
1800
1801 /*
1802 the index arrays are copied/linked into the quadrants
1803 some quadrants may seem ready for recursion, but they not result as such later on.
1804 they will be made leaf later on, if necessary.
1805 ...
1806 */
1807 RSB_C2R_ASSERT(ur>=0 && ul>=0 && lr>=0 && ll>=0);
1808
1809 #pragma omp critical (rsb_coo2rsbsub_crs)
1810 {
1811 if(cmc+omc+smc+RSB_SUBDIVISION_BUG_EXTRA>tmc)
1812 {
1813 if(RSB_C2R_IF_VERBOSE)
1814 RSB_INFO("too many submatrices (%d+%d>%d): rejoining\n",cmc+omc,smc,tmc);
1815 sqs = RSB_BOOL_FALSE;
1816 }
1817 else
1818 {
1819 lmc += smc;
1820 lmc -= 1;
1821 smco = cmc+omc;
1822 snzoff = nzoff;
1823 if(ul){ isms[0] = submatricesp[smco+smci];submatricesp[smco+smci] = NULL;smci++;}
1824 if(ur){ isms[1] = submatricesp[smco+smci];submatricesp[smco+smci] = NULL;smci++;}
1825 if(ll){ isms[2] = submatricesp[smco+smci];submatricesp[smco+smci] = NULL;smci++;}
1826 if(lr){ isms[3] = submatricesp[smco+smci];submatricesp[smco+smci] = NULL;smci++;}
1827 smci = 0;
1828 omc += smc;
1829 }
1830 if(sqs)
1831 {
1832 if(ul)
1833 RSB_DO_ERROR_CUMULATE(errval,rsb_do_fill_early_leaf_matrix(submatrix,isms[0],NULL,NULL,IA,JA,VA,snzoff,ul,hm,hk,0,0,typecode,flags)), snzoff += ul,++smci;
1834 if(ur)
1835 RSB_DO_ERROR_CUMULATE(errval,rsb_do_fill_early_leaf_matrix(submatrix,isms[1],NULL,NULL,IA,JA,VA,snzoff,ur,hm,k-hk,0,hk,typecode,flags)), snzoff += ur,++smci;
1836 if(ll)
1837 RSB_DO_ERROR_CUMULATE(errval,rsb_do_fill_early_leaf_matrix(submatrix,isms[2],NULL,NULL,IA,JA,VA,snzoff,ll,m-hm,hk,hm,0,typecode,flags)), snzoff += ll,++smci;
1838 if(lr)
1839 RSB_DO_ERROR_CUMULATE(errval,rsb_do_fill_early_leaf_matrix(submatrix,isms[3],NULL,NULL,IA,JA,VA,snzoff,lr,m-hm,k-hk,hm,hk,typecode,flags)), snzoff += lr,++smci;
1840 RSB_NULL_STATEMENT_FOR_COMPILER_HAPPINESS
1841 }
1842
1843 if(sqs)
1844 {
1845 smci = 0;
1846 if(ul){ submatricesp[smco+smci] = isms[0];smci++;}
1847 if(ur){ submatricesp[smco+smci] = isms[1];smci++;}
1848 if(ll){ submatricesp[smco+smci] = isms[2];smci++;}
1849 if(lr){ submatricesp[smco+smci] = isms[3];smci++;}
1850 }
1851
1852 if(sqs)
1853 {
1854 if(snzoff-nzoff!=nnz)
1855 {
1856 /* is this a partition ? */
1857 RSB_ERROR("%d - %d != %d ?= %d + %d + %d + %d = %d\n",snzoff,nzoff,nnz,ul,ur,ll,lr,ul+ur+ll+lr);
1858 RSB_DO_ERROR_CUMULATE(errval,RSB_ERR_INTERNAL_ERROR);
1859 }
1860 if(RSB_SOME_ERROR(errval))
1861 {
1862 RSB_ERROR(RSB_ERRM_ES); /* goto err; */
1863 }
1864 RSB_DO_FLAG_ADD(submatrix->flags,RSB_FLAG_QUAD_PARTITIONING);
1865 RSB_DO_FLAG_DEL(submatrix->flags,RSB_FLAG_NON_ROOT_MATRIX);
1866 submatrix->bindx = NULL; submatrix->bpntr = NULL; submatrix->indptr = NULL;
1867 }
1868 }
1869 }
1870 if(!sqs)
1871 {
1872 RSB_DO_FLAG_SUBST(submatrix->flags,RSB_FLAG_QUAD_PARTITIONING,RSB_FLAG_NON_ROOT_MATRIX);
1873 /* selecting a format and declaring as leaf */
1874 if(!RSB_DO_TOOFEWNNZFORCSR(nnz,m) /*&& IR && IL*/)
1875 {
1876 /* RSB_INFO("CSR -> COO ?\n"); */
1877 if(RSB_DO_FLAG_HAS(submatrix->flags,RSB_FLAG_WANT_BCSS_STORAGE))
1878 RSB_DO_FLAG_DEL(submatrix->flags,RSB_FLAG_WANT_COO_STORAGE);
1879 if((errval = rsb__do_set_init_storage_flags(submatrix,submatrix->flags))!=RSB_ERR_NO_ERROR)
1880 ;/* goto err; */
1881 }
1882 else
1883 {
1884 /* RSB_INFO("COO !\n"); */
1885 rsb_flags_t sflags = flags;
1886 RSB_DO_FLAG_SUBST(sflags,RSB_FLAG_WANT_BCSS_STORAGE,RSB_FLAG_WANT_COO_STORAGE);
1887 if((errval = rsb__do_set_init_storage_flags(submatrix,sflags))!=RSB_ERR_NO_ERROR)
1888 ;/* goto err; */
1889 }
1890 if(RSB_C2R_IF_VERBOSE)
1891 RSB_INFO("freezing %d ",smi+1),
1892 RSB_INFO_MATRIX_SUMMARY(submatrix),
1893 RSB_INFO("\n");
1894 }
1895 /* matrix is declared as 'closed'.
1896 sorting, in a way smi will point to the biggest open mtxAp, which will be picked up next */
1897 #if 0
1898 qsort(submatricesp+cmc,(size_t)(omc),sizeof(struct rsb_mtx_t*),& rsb_compar_rcsr_matrix_regarding_nnz);
1899 RSB_SWAP(struct rsb_mtx_t *,submatricesp[smi],submatricesp[cmc-1]);
1900 #endif
1901 }
1902 /*smi = cmc+omc-1; */
1903 }
1904 if(omc>0 && cmc+omc<stmc)
1905 goto iagain;
1906 #if RSB_WANT_VERBOSE_SUBDIVISION2
1907 if(th_id==0)
1908 {
1909 RSB_MTXASM_INFO("thread %d:terminating subdivision",th_id);
1910 if(omc==0)
1911 {RSB_MTXASM_INFO(", no more open matrices");}
1912 RSB_MTXASM_INFO("(closed %d= %d nodes + %d leaves, out of %d available)",cmc,cmc-lmc,lmc,tmc);
1913 RSB_MTXASM_INFO(",(maxnz=%d,skew=%g)",maxnz,skew);
1914 if(cmc+omc>=stmc)
1915 {RSB_MTXASM_INFO(", no room left for submatrices");}
1916 RSB_MTXASM_INFO(".\n");
1917 }
1918 #endif
1919 } /* parallel */
1920
1921 if(RSB_SOME_ERROR(errval))
1922 goto err;
1923
1924 #pragma omp barrier
1925 if(stmc!=tmc)
1926 {
1927 stmc = tmc;
1928 if(RSB_WANT_VERBOSE_SUBDIVISION)
1929 RSB_INFO("parallel substage subdivision of "),RSB_INFO_MATRIX_SUMMARY(mtxAp),RSB_INFO("\n");
1930 tnn = tn;
1931 goto again;
1932 }
1933 else
1934 {
1935
1936 if(RSB_WANT_VERBOSE_SUBDIVISION)
1937 RSB_INFO("parallel substage subdivision of "),RSB_INFO_MATRIX_SUMMARY(mtxAp),RSB_INFO(" not required\n");
1938 }
1939 {
1940 if(RSB_WANT_VERBOSE_SUBDIVISION)
1941 RSB_INFO("subdivision of "),RSB_INFO_MATRIX_SUMMARY(mtxAp),RSB_INFO("complete \n");
1942 }
1943 mtxAp->cpt = cpt;
1944
1945 *cmcp = cmc;
1946 err:
1947 RSB_DO_ERR_RETURN(errval)
1948 }
1949
#if 0
/*
 * NOTE(review): disabled (dead) serial predecessor of the OpenMP-parallel
 * subdivision routine used above; kept under "#if 0" for reference only and
 * never compiled. It repeatedly picks the largest open submatrix, decides
 * whether to quad-subdivide it (splitting its nonzeroes in four quadrants)
 * or to freeze it as a CSR/COO leaf, until no open submatrices remain.
 */
static rsb_err_t rsb_do_coo2rec_subdivide(void *VA, rsb_coo_idx_t * IA, rsb_coo_idx_t * JA, rsb_coo_idx_t m, rsb_coo_idx_t k, rsb_nnz_idx_t nnz, rsb_type_t typecode, const struct rsb_mtx_partitioning_info_t * pinfop, rsb_flags_t flags, rsb_err_t *errvalp, struct rsb_mtx_t ** submatricesp, struct rsb_mtx_t * mtxAp, const rsb_nnz_idx_t * IB, const rsb_nnz_idx_t * IX, rsb_coo_idx_t * IT, rsb_coo_idx_t * WA, rsb_submatrix_idx_t cmc, rsb_submatrix_idx_t omc, rsb_submatrix_idx_t tmc, rsb_thread_t wet, rsb_submatrix_idx_t *cmcp)
{
	rsb_nnz_idx_t tdnnz = 0;
	rsb_submatrix_idx_t smi = 0;/* max matrix count, done matrix count, submatrix index */
	rsb_err_t errval = RSB_ERR_NO_ERROR;
	size_t el_size = RSB_SIZEOF(typecode);
	rsb_time_t cpt = RSB_TIME_ZERO,dt = RSB_TIME_ZERO;

	/* loop while open submatrices remain; smi points at the biggest open one
	   (the qsort at the bottom keeps submatricesp ordered by nnz) */
	for(smi=0;RSB_LIKELY(omc>0);smi=cmc+omc-1)
	{
		struct rsb_mtx_t * submatrix = submatricesp[smi];
		rsb_coo_idx_t k = submatrix->nc;
		rsb_coo_idx_t m = submatrix->nr;
		rsb_coo_idx_t hk = (k+1)/2;
		rsb_coo_idx_t hm = (m+1)/2;
		rsb_nnz_idx_t ul = 0,ur = 0,ll = 0,lr = 0;
		rsb_nnz_idx_t nnz = submatrix->nnz;
		rsb_coo_idx_t roff = submatrix->roff;
		rsb_coo_idx_t coff = submatrix->coff;
		rsb_coo_idx_t*IL = submatrix->bindx; // IL will be hosted here
		rsb_coo_idx_t*IM = IT; // IM will be hosted in a temporary vector
		rsb_coo_idx_t*IR = submatrix->bpntr; // IR will be hosted here
		rsb_flags_t flags = submatrix->flags;
		rsb_nnz_idx_t nzoff = submatrix->nzoff;
		rsb_bool_t sqs = RSB_BOOL_FALSE; // should quad subdivide
		rsb_submatrix_idx_t smc = 0; /* submatrices count */

//		RSB_INFO("picked up %d/%d -> %d x %d, %d nnz, @ %d %d \n",smi+1,tmc,m,k,nnz,roff,coff);

		if(!IL && !RSB_DO_FLAG_HAS(flags,RSB_FLAG_WANT_COO_STORAGE) )
		{
			/* if there is no line pointer, we make this submatrix leaf */
			sqs = RSB_BOOL_FALSE;
			if(RSB_C2R_IF_VERBOSE)
				RSB_INFO("no split, as no line pointer found\n");
			goto nosqstest;
		}

		if(/*!IL || */!IM)
		{
			/* if this happens, this is an error */
			RSB_ERROR("IL:%p, IM:%p\n",IL,IM);
			errval = RSB_ERR_INTERNAL_ERROR;
			goto err;
		}

		/* too few nonzeros for recursion (TODO: may change in the future) */
		if(RSB_DO_TOOFEWNNZFORRCSR(nnz,m)
		/*
		 * Uncommenting the following allows subdivision for very spare matrices
		 * However, this feature is unfinished and bugful (segfault risk)
		 * */
		/* && !RSB_DO_FLAG_HAS(flags,RSB_FLAG_WANT_COO_STORAGE) */
		)
		{
			if(RSB_C2R_IF_VERBOSE)
				RSB_INFO("matrix too sparse for RCSR: rejoining\n");

			sqs = RSB_BOOL_FALSE;
			goto nosqstest;
		}

		/* decide if the matrix is worth subdividing further */
		sqs = rsb__should_recursively_partition_matrix(0,0,m,k,0,0,nnz,m,k,roff,coff,flags,el_size,0);

		/* if we want subdivision */

		if(!sqs)
			if(RSB_DO_FLAG_HAS(flags,RSB_FLAG_RECURSIVE_MORE_LEAVES_THAN_THREADS))
				if(wet>cmc+1+smc)/* /FIXME : this may not terminate! */
					sqs = RSB_BOOL_TRUE;

		if(sqs)
		{
			rsb_bool_t awfcsr = RSB_BOOL_FALSE; /* all of the matrices will fit csr ? */

			// compute the split vector
			dt = - rsb_time();
			if((!RSB_DO_TOOFEWNNZFORCSR(nnz,m)) && IR && IL)
			{if((errval = rsb_do_compute_vertical_split(IA,JA,roff,coff,m,k,hm,hk,nnz,IL,IM,IR,&ul,&ur,&ll,&lr))!=RSB_ERR_NO_ERROR) goto err;}
			else
			{
				if(RSB_C2R_IF_VERBOSE)
					RSB_INFO("using the sparse splitter\n");
				if((errval = rsb_do_compute_vertical_split_search_only(IA,JA,roff,coff,m,k,hm,hk,nnz,IB,&ul,&ur,&ll,&lr))!=RSB_ERR_NO_ERROR) goto err;
			}
			dt += rsb_time();
			cpt += dt;
			RSB_C2R_ASSERT(IR);
			awfcsr = ( (ul>0 && RSB_DO_TOOFEWNNZFORCSR(ul,hm)) || (ur>0 && RSB_DO_TOOFEWNNZFORCSR(ur,hm)) || (lr>0 && RSB_DO_TOOFEWNNZFORCSR(lr,m-hm)) || (ll>0 && RSB_DO_TOOFEWNNZFORCSR(ll,m-hm)))?RSB_BOOL_TRUE:RSB_BOOL_FALSE;

			// after computing the split vector, we can still resign from subdividing
			// especially if some submatrix is deemed too small and the overall submatrices count is enough
			// TODO: if(rsb__should_rejoin_small_leaf(...
			// ...

			/* is there room for these additional submatrices ? */
//			if( (ul>0 && RSB_DO_TOOFEWNNZFORRCSR(ul,hm)) || (ur>0 && RSB_DO_TOOFEWNNZFORRCSR(ur,hm)) || (lr>0 && RSB_DO_TOOFEWNNZFORRCSR(lr,m-hm)) || (ll>0 && RSB_DO_TOOFEWNNZFORRCSR(ll,m-hm)))
			if(awfcsr)
			{
				/* if some leaf won't fit in CSR, we don't split anymore */
				if(RSB_DO_FLAG_HAS(flags,RSB_FLAG_WANT_COO_STORAGE))
					sqs = RSB_BOOL_TRUE;
				else
					sqs = RSB_BOOL_FALSE;

				if(RSB_C2R_IF_VERBOSE)
					RSB_INFO("no space for conversion of some leaf: rejoining ? %d\n",!sqs);
			}

			/* how many submatrices out of four ? */
			smc = (ul?1:0)+(ur?1:0)+(ll?1:0)+(lr?1:0);
			if(cmc+omc+smc>tmc)
			{
				if(RSB_C2R_IF_VERBOSE)
					RSB_INFO("too many submatrices: rejoining\n");
				sqs = RSB_BOOL_FALSE;
			}

#ifdef RSB_FLAG_EXPERIMENTAL_NO_MICRO_LEAVES
			/*
				if we want to avoid micro leaves, we could stop here
				FIXME: we need a better criteria (for proper load balancing!)
			*/
			if(RSB_DO_FLAG_HAS(flags,RSB_FLAG_EXPERIMENTAL_NO_MICRO_LEAVES))
				if(wet<cmc)
				{
					if(RSB_C2R_IF_VERBOSE)
						RSB_INFO("RSB_FLAG_EXPERIMENTAL_NO_MICRO_LEAVES: rejoining\n");
					sqs = RSB_BOOL_FALSE;
				}
#endif /* RSB_FLAG_EXPERIMENTAL_NO_MICRO_LEAVES */

			if(RSB_C2R_IF_VERBOSE)
				RSB_INFO("splitting %d/%d -> %d/%d %d/%d %d/%d %d/%d sqs? %d\n",nnz,m,ul,hm,ur,hm,ll,m-hm,lr,m-hm,sqs);
		}

nosqstest:
		/* the picked submatrix is closed in any case: swap it into the closed prefix */
		omc--;
		if(smi!=cmc)
			RSB_SWAP(struct rsb_mtx_t *,submatricesp[smi],submatricesp[cmc]);
		++cmc;
		if(sqs)
		{
			/* should quad-subdivide. let's take care of indices. */
			rsb_nnz_idx_t snzoff = nzoff;

			// the index arrays are copied/linked into the quadrants
			// some quadrants may seem ready for recursion, but they not result as such later on.
			// they will be made leaf later on, if necessary.
			// ...
			RSB_C2R_ASSERT(ur>=0 && ul>=0 && lr>=0 && ll>=0);

#if RSB_C2R_WANT_MAYBE_FASTER
			if(IL)
				RSB_COA_MEMCPY_ROWSZ(IX,IL,0  ,0,m+1),
				IL = IX;
			if(IR)
				RSB_COA_MEMCPY_ROWSZ(IX,IR,m+1,0,m+1),
				IR = IX+m+1;
#else /* RSB_C2R_WANT_MAYBE_FASTER */
			if(IL)
				RSB_COA_MEMMOVE(IX,IL,0  ,0,m+1),
				IL = IX;
			if(IR)
				RSB_COA_MEMMOVE(IX,IR,m+1,0,m+1),
				IR = IX+m+1;
#endif /* RSB_C2R_WANT_MAYBE_FASTER */

			/* open one child per nonempty quadrant; snzoff partitions [nzoff,nzoff+nnz) */
			if(ul)
				RSB_DO_ERROR_CUMULATE(errval,rsb_do_fill_early_leaf_matrix(submatrix,submatricesp[cmc+omc],IL,IM,IA,JA,VA,snzoff,ul,hm,hk,0,0,typecode,flags), ++omc, snzoff += ul);
			if(ur)
				RSB_DO_ERROR_CUMULATE(errval,rsb_do_fill_early_leaf_matrix(submatrix,submatricesp[cmc+omc],IM,IR,IA,JA,VA,snzoff,ur,hm,k-hk,0,hk,typecode,flags), ++omc, snzoff += ur);
			if(ll)
				RSB_DO_ERROR_CUMULATE(errval,rsb_do_fill_early_leaf_matrix(submatrix,submatricesp[cmc+omc],IL,IM,IA,JA,VA,snzoff,ll,m-hm,hk,hm,0,typecode,flags), ++omc, snzoff += ll);
			if(lr)
				RSB_DO_ERROR_CUMULATE(errval,rsb_do_fill_early_leaf_matrix(submatrix,submatricesp[cmc+omc],IM,IR,IA,JA,VA,snzoff,lr,m-hm,k-hk,hm,hk,typecode,flags), ++omc, snzoff += lr);

			if(snzoff-nzoff!=nnz)
			{
				/* is this a partition ? */
				RSB_ERROR("%d - %d != %d ?= %d + %d + %d + %d = %d\n",snzoff,nzoff,nnz,ul,ur,ll,lr,ul+ur+ll+lr);
				errval = RSB_ERR_INTERNAL_ERROR; goto err;
			}
			if(RSB_SOME_ERROR(errval))
			{
				RSB_ERROR(RSB_ERRM_ES); goto err;
			}
			RSB_DO_FLAG_ADD(submatrix->flags,RSB_FLAG_QUAD_PARTITIONING);
			RSB_DO_FLAG_DEL(submatrix->flags,RSB_FLAG_NON_ROOT_MATRIX);
			submatrix->bindx = NULL; submatrix->bpntr = NULL; submatrix->indptr = NULL;
		}
		else
		{
			RSB_DO_FLAG_SUBST(submatrix->flags,RSB_FLAG_QUAD_PARTITIONING,RSB_FLAG_NON_ROOT_MATRIX);
			// we should decide a format, and proceed declaring it as leaf
			if(!RSB_DO_TOOFEWNNZFORRCSR(nnz,m) && IR && IL)
			{
				if(RSB_DO_FLAG_HAS(submatrix->flags,RSB_FLAG_WANT_BCSS_STORAGE))
					RSB_DO_FLAG_DEL(submatrix->flags,RSB_FLAG_WANT_COO_STORAGE);
#if RSB_WANT_LITTLE_IMPROVED
				if(submatrix==mtxAp)/* root only */
				/* FIXME: TODO: IR is NOT needed AT ALL! */
				rsb_do_fill_rcsr_arrays_for_later(submatrix,IL,IR,IA,JA,nzoff,m,0);
#else /* RSB_WANT_LITTLE_IMPROVED */
				rsb_do_fill_rcsr_arrays_for_later(submatrix,IL,IR,IA,JA,nzoff,m,0);
#endif /* RSB_WANT_LITTLE_IMPROVED */
				if((errval = rsb__do_set_init_storage_flags(submatrix,submatrix->flags))!=RSB_ERR_NO_ERROR)
					goto err;
				submatrix->VA = VA; // FIXME: we will place pointers to partially swapped VA, here.
			}
			else
			if(!RSB_DO_TOOFEWNNZFORCSR(nnz,m) /*&& IR && IL*/)
			{
				// RSB_INFO("CSR -> COO ?\n");
				if(RSB_DO_FLAG_HAS(submatrix->flags,RSB_FLAG_WANT_BCSS_STORAGE))
					RSB_DO_FLAG_DEL(submatrix->flags,RSB_FLAG_WANT_COO_STORAGE);
				if((errval = rsb__do_set_init_storage_flags(submatrix,submatrix->flags))!=RSB_ERR_NO_ERROR)
					goto err;
			}
			else
			{
				// RSB_INFO("COO !\n");
				rsb_flags_t sflags = flags;
				RSB_DO_FLAG_SUBST(sflags,RSB_FLAG_WANT_BCSS_STORAGE,RSB_FLAG_WANT_COO_STORAGE);
				if((errval = rsb__do_set_init_storage_flags(submatrix,sflags))!=RSB_ERR_NO_ERROR)
					goto err;
			}
			if(RSB_C2R_IF_VERBOSE)
				RSB_INFO("freezing %d ",smi+1),
				RSB_INFO_MATRIX_SUMMARY(submatrix),
				RSB_INFO("\n");
		}
		// matrix is declared as 'closed'.
		// sorting, in a way smi will point to the biggest open mtxAp, which will be picked up next
		qsort(submatricesp+cmc,(size_t)(omc),sizeof(struct rsb_mtx_t*),& rsb_compar_rcsr_matrix_regarding_nnz);
		// FIXME: a priority queue would do the job, here
	}
	mtxAp->cpt = cpt;
err:
	*cmcp = cmc;
	RSB_DO_ERR_RETURN(errval)
}
#endif
2195
rsb_do_coo2rec_shuffle(void * VA,rsb_coo_idx_t * IA,rsb_coo_idx_t * JA,rsb_coo_idx_t m,rsb_coo_idx_t k,rsb_nnz_idx_t nnz,rsb_type_t typecode,const struct rsb_mtx_partitioning_info_t * pinfop,rsb_flags_t flags,rsb_err_t * errvalp,struct rsb_mtx_t ** submatricesp,struct rsb_mtx_t * mtxAp,const rsb_nnz_idx_t * IB,rsb_coo_idx_t * WA,rsb_submatrix_idx_t cmc)2196 static rsb_err_t rsb_do_coo2rec_shuffle(void *VA, rsb_coo_idx_t * IA, rsb_coo_idx_t * JA, rsb_coo_idx_t m, rsb_coo_idx_t k, rsb_nnz_idx_t nnz, rsb_type_t typecode, const struct rsb_mtx_partitioning_info_t * pinfop, rsb_flags_t flags, rsb_err_t *errvalp, struct rsb_mtx_t ** submatricesp, struct rsb_mtx_t * mtxAp, const rsb_nnz_idx_t * IB, rsb_coo_idx_t * WA, rsb_submatrix_idx_t cmc)
2197 {
2198 rsb_nnz_idx_t tdnnz = 0;
2199 rsb_submatrix_idx_t smi = 0;/* max matrix count, done matrix count, submatrix index */
2200 rsb_err_t errval = RSB_ERR_NO_ERROR;
2201 size_t el_size = RSB_SIZEOF(typecode);
2202 #if RSB_WANT_VERBOSE_TIMINGS
2203 rsb_time_t pmt = RSB_TIME_ZERO;
2204 #endif /* RSB_WANT_VERBOSE_TIMINGS */
2205
2206 if(!VA || cmc==1)
2207 {
2208 mtxAp->VA = VA;
2209 goto no_va_cp;
2210 }
2211
2212 // the following is a highly parallel phase
2213 #pragma omp parallel for schedule(static,1) reduction(|:errval) shared(tdnnz,submatricesp,IB) RSB_NTC
2214 for(smi=0;smi<cmc;++smi)
2215 {
2216 struct rsb_mtx_t * submatrix = submatricesp[smi];
2217 rsb_coo_idx_t*IL = submatrix->bindx;
2218 rsb_coo_idx_t*IR = submatrix->bpntr;
2219 rsb_nnz_idx_t dnnz = 0;
2220 rsb_coo_idx_t i;
2221 // if(RSB_C2R_IF_VERBOSE)
2222 // RSB_INFO("%d -> %d\n",smi,omp_get_thread_num());
2223 if(!RSB_DO_FLAG_HAS((submatrix->flags),RSB_FLAG_QUAD_PARTITIONING))
2224 {
2225 if(RSB_DO_ENOUGHNNZFORINDEXBASEDBUILD(submatrix) && !rsb__is_coo_matrix(submatrix) && IL && IR)
2226 {
2227 RSB_C2R_ASSERT(IL);
2228 RSB_C2R_ASSERT(IR);
2229 dnnz = rsb_do_copy_submatrix_coa(submatrix,VA,WA,IL,IR,el_size,0,0,submatrix->nr);
2230 #if 0
2231 for(i=submatrix->roff;RSB_LIKELY(i<submatrix->roff+submatrix->nr);++i)
2232 {
2233 rsb_nnz_idx_t nnz1 = IR[i-submatrix->roff];
2234 rsb_nnz_idx_t nnz0 = IL[i-submatrix->roff];
2235 RSB_C2R_ASSERT(IL[i-submatrix->roff]>=IB[i]);
2236 RSB_C2R_ASSERT(IL[i-submatrix->roff]<=IB[i+1]);
2237 RSB_C2R_ASSERT(IL[i-submatrix->roff+1]>=IB[i+1]);
2238 RSB_C2R_ASSERT(nnz0>=IL[i-submatrix->roff]);
2239 RSB_C2R_ASSERT(nnz1<=IL[i-submatrix->roff+1]);
2240 RSB_C2R_ASSERT(nnz1<=IB[i+1]);
2241 if(IB[i]==IB[i+1])continue;
2242 RSB_C2R_ASSERT(nnz1>=nnz0);
2243 if(nnz1==nnz0)continue;
2244 RSB_C2R_ASSERT(JA[nnz0+0]>=submatrix->coff);
2245 RSB_C2R_ASSERT(JA[nnz1-1]< submatrix->coff+submatrix->nc);
2246 }
2247 #endif
2248 }
2249 else
2250 if(!RSB_DO_TOOFEWNNZFORCSR(submatrix->nnz,submatrix->nr))
2251 {
2252 //rsb_coo_idx_t*IL = submatrix->bindx;
2253 for(i=submatrix->roff;RSB_LIKELY(i<submatrix->roff+submatrix->nr);++i)
2254 {
2255 //rsb_nnz_idx_t fel;
2256 // offset of line i in the global line pointers array
2257 rsb_nnz_idx_t nnz0 = IB[i];
2258 // nnz1..nnz0 are the boundaries of line i
2259 rsb_nnz_idx_t nnz1 = IB[i+1];
2260 // check
2261 RSB_C2R_ASSERT(nnz0>=IB[i]);
2262 RSB_C2R_ASSERT(nnz1<=IB[i+1]);
2263 // skip line if empty
2264 if(nnz1-nnz0<1)continue;
2265 // find first element of line i also in the submatrix
2266 nnz0 += rsb__nnz_split_coo_bsearch(JA+nnz0,submatrix->coff,nnz1-nnz0);
2267 // skip line if empty in the submatrix
2268 if(nnz1-nnz0<1)continue;
2269 // find the length of the subrow i in the submatrix
2270 nnz1 = nnz0+rsb__nnz_split_coo_bsearch(JA+nnz0,submatrix->coff+submatrix->nc,nnz1-nnz0);
2271 //check
2272 RSB_C2R_ASSERT(JA[nnz0+0]>=submatrix->coff);
2273 // skip line if empty in the submatrix
2274 if(nnz1-nnz0<1)continue;
2275 // nnz1 .. nnz0 contain nonempty subrow i in the submatrix
2276 // RSB_INFO("i:%d, %d..%d -> %d\n",i,nnz0,nnz1-1,submatrix->nzoff+dnnz);
2277 // checks
2278 // RSB_C2R_ASSERT(IL[i-submatrix->roff]>=IB[i]);
2279 // RSB_C2R_ASSERT(IL[i-submatrix->roff]<=IB[i+1]);
2280 // RSB_C2R_ASSERT(IL[i-submatrix->roff+1]>=IB[i+1]);
2281 // RSB_C2R_ASSERT(nnz0>=IL[i-submatrix->roff]);
2282 // RSB_C2R_ASSERT(nnz1<=IL[i-submatrix->roff+1]);
2283 RSB_C2R_ASSERT(nnz1<=IB[i+1]);
2284 RSB_C2R_ASSERT(JA[nnz0+0]>=submatrix->coff);
2285 RSB_C2R_ASSERT(JA[nnz1-1]< submatrix->coff+submatrix->nc);
2286 // perform the copy
2287 RSB_A_MEMCPY_SMALL(WA,VA,submatrix->nzoff+dnnz,nnz0,nnz1-nnz0,el_size);
2288 //RSB_COA_MEMCPY(WA,JA,submatrix->nzoff+dnnz,nnz0,nnz1-nnz0);
2289 // update the actual offset in the destination array
2290 dnnz += nnz1-nnz0;
2291 }
2292 }
2293 else
2294 {
2295 //rsb_coo_idx_t*IL = submatrix->bindx;
2296 for(i=submatrix->roff;RSB_LIKELY(i<submatrix->roff+submatrix->nr);++i)
2297 {
2298 //rsb_nnz_idx_t fel;
2299 // offset of line i in the global line pointers array
2300 rsb_nnz_idx_t nnz0 = IB[i];
2301 // nnz1..nnz0 are the boundaries of line i
2302 rsb_nnz_idx_t nnz1 = IB[i+1];
2303 // check
2304 RSB_C2R_ASSERT(nnz0>=IB[i]);
2305 RSB_C2R_ASSERT(nnz1<=IB[i+1]);
2306 // skip line if empty
2307 if(nnz1-nnz0<1)continue;
2308 // find first element of line i also in the submatrix
2309 nnz0 += rsb__nnz_split_coo_bsearch(JA+nnz0,submatrix->coff,nnz1-nnz0);
2310 // skip line if empty in the submatrix
2311 if(nnz1-nnz0<1)continue;
2312 // find the length of the subrow i in the submatrix
2313 nnz1 = nnz0+rsb__nnz_split_coo_bsearch(JA+nnz0,submatrix->coff+submatrix->nc,nnz1-nnz0);
2314 //check
2315 RSB_C2R_ASSERT(JA[nnz0+0]>=submatrix->coff);
2316 // skip line if empty in the submatrix
2317 if(nnz1-nnz0<1)continue;
2318 // nnz1 .. nnz0 contain nonempty subrow i in the submatrix
2319 // RSB_INFO("i:%d, %d..%d -> %d\n",i,nnz0,nnz1-1,submatrix->nzoff+dnnz);
2320 // checks
2321 // RSB_C2R_ASSERT(IL[i-submatrix->roff]>=IB[i]);
2322 // RSB_C2R_ASSERT(IL[i-submatrix->roff]<=IB[i+1]);
2323 // RSB_C2R_ASSERT(IL[i-submatrix->roff+1]>=IB[i+1]);
2324 // RSB_C2R_ASSERT(nnz0>=IL[i-submatrix->roff]);
2325 // RSB_C2R_ASSERT(nnz1<=IL[i-submatrix->roff+1]);
2326 RSB_C2R_ASSERT(nnz1<=IB[i+1]);
2327 RSB_C2R_ASSERT(JA[nnz0+0]>=submatrix->coff);
2328 RSB_C2R_ASSERT(JA[nnz1-1]< submatrix->coff+submatrix->nc);
2329 // perform the copy
2330 RSB_A_MEMCPY_SMALL(WA,VA,submatrix->nzoff+dnnz,nnz0,nnz1-nnz0,el_size);
2331 //RSB_COA_MEMCPY(WA,JA,submatrix->nzoff+dnnz,nnz0,nnz1-nnz0);
2332 // update the actual offset in the destination array
2333 dnnz += nnz1-nnz0;
2334 }
2335 }
2336 if(dnnz!=submatrix->nnz)
2337 {
2338 RSB_ERROR("@%d,%d: found %d, should have found %d\n",
2339 submatrix->roff, submatrix->coff, dnnz,submatrix->nnz);
2340 RSB_DO_ERROR_CUMULATE(errval,RSB_ERR_INTERNAL_ERROR);
2341 }
2342 #pragma omp critical (rsb_coo2rsb_nzinc_crs)
2343 {tdnnz += dnnz;}
2344 }
2345 }
2346 //gerr:
2347 RSB_NULL_STATEMENT_FOR_COMPILER_HAPPINESS
2348 #if !defined(__xlC__)
2349 /* FIXME: xlc does not allow this, but we have experienced problems, without */
2350 #pragma omp barrier
2351 #endif /* __xlC__ */
2352 if(RSB_SOME_ERROR(errval))
2353 {
2354 RSB_PERR_GOTO(err,RSB_ERRM_NL);
2355 }
2356
2357 if(tdnnz!=nnz)
2358 {
2359 RSB_ERROR("found %d, should have found %d\n", tdnnz,nnz);
2360 errval = RSB_ERR_INTERNAL_ERROR;
2361 goto err;
2362 }
2363
2364 #if RSB_WANT_VERBOSE_TIMINGS
2365 pmt -= rsb_time();
2366 #endif /* RSB_WANT_VERBOSE_TIMINGS */
2367 RSB_A_MEMCPY_parallel(VA,WA,0,0,nnz,el_size);
2368 #if RSB_WANT_VERBOSE_TIMINGS
2369 pmt += rsb_time();
2370 #endif /* RSB_WANT_VERBOSE_TIMINGS */
2371 no_va_cp:
2372
2373 tdnnz = 0;
2374 #pragma omp parallel for schedule(static,1) reduction(|:errval) shared(tdnnz,submatricesp,IB) RSB_NTC
2375 for(smi=0;smi<cmc;++smi)
2376 {
2377 struct rsb_mtx_t * submatrix = submatricesp[smi];
2378 if(!RSB_DO_FLAG_HAS(submatrix->flags,RSB_FLAG_QUAD_PARTITIONING))
2379 {
2380 rsb_coo_idx_t oll,nll;
2381 rsb_coo_idx_t i;
2382 rsb_coo_idx_t*IR = submatrix->bpntr;
2383 rsb_coo_idx_t*IL = submatrix->bindx;
2384 rsb_nnz_idx_t dnnz = 0;
2385
2386 //if(!RSB_DO_TOOFEWNNZFORRCSR(submatrix->nnz,submatrix->nr))
2387 if(RSB_DO_ENOUGHNNZFORINDEXBASEDBUILD(submatrix) && !rsb__is_coo_matrix(submatrix) && IL && IR)
2388 {
2389 if(RSB_C2R_IF_VERBOSE)
2390 RSB_INFO("CSR:%d/%d:%d..%d\n",smi,cmc,submatrix->nzoff,submatrix->nzoff+submatrix->nnz);
2391 submatrix->bpntr = IA+submatrix->nzoff;
2392 RSB_C2R_ASSERT(IL); RSB_C2R_ASSERT(IR);
2393 RSB_C2R_ASSERT(IR< IA+submatrix->nzoff+submatrix->nnz);
2394 RSB_C2R_ASSERT(IL< IA+submatrix->nzoff+submatrix->nnz);
2395 RSB_C2R_ASSERT(IR>=IA+submatrix->nzoff);
2396 RSB_C2R_ASSERT(IL>=IA+submatrix->nzoff);
2397 for(dnnz=0,i=0;RSB_LIKELY(i<submatrix->nr);dnnz += IR[i]-IL[i],++i)
2398 RSB_COA_MEMCPY_SMALL(WA,JA,submatrix->nzoff+dnnz,IL[i],IR[i]-IL[i]);
2399
2400 // RSB_INFO("%d x %d (%d) @ %d, %d (rcsr)\n",submatrix->nr,submatrix->nc,submatrix->nnz,submatrix->roff,submatrix->coff);
2401 if(dnnz!=submatrix->nnz)
2402 {
2403 RSB_ERROR("@%d,%d: found %d, should have found %d\n",
2404 submatrix->roff, submatrix->coff, dnnz,submatrix->nnz);
2405 RSB_DO_ERROR_CUMULATE(errval,RSB_ERR_INTERNAL_ERROR);
2406 }
2407 RSB_C2R_ASSERT(IL); RSB_C2R_ASSERT(IR);
2408
2409 oll = IR[0]-IL[0];
2410 submatrix->bpntr[0] = 0;
2411 for(i=1;RSB_LIKELY(i<submatrix->nr);++i)
2412 {
2413 nll = IR[i]-IL[i];
2414 submatrix->bpntr[i] = submatrix->bpntr[i-1]+oll;
2415 oll = nll;
2416 }
2417 submatrix->bpntr[submatrix->nr] = submatrix->bpntr[submatrix->nr-1]+oll;
2418 if(submatrix->bpntr[submatrix->nr]!=submatrix->nnz)
2419 {
2420 RSB_ERROR("@%d,%d: found %d, should have found %d\n",
2421 submatrix->roff,submatrix->coff,submatrix->bpntr[submatrix->nr],submatrix->nnz);
2422 RSB_DO_ERROR_CUMULATE(errval,RSB_ERR_INTERNAL_ERROR);
2423 }
2424 }
2425 else
2426 if(!RSB_DO_TOOFEWNNZFORCSR(submatrix->nnz,submatrix->nr) && !rsb__is_coo_matrix(submatrix))
2427 {
2428 if(RSB_C2R_IF_VERBOSE)
2429 RSB_INFO("CSR:%d/%d:%d..%d\n",smi,cmc,submatrix->nzoff,submatrix->nzoff+submatrix->nnz);
2430 oll = 0;
2431 submatrix->bpntr = IA+submatrix->nzoff;
2432 submatrix->bpntr[0] = 0;
2433 // RSB_INFO("%d x %d (%d) @ %d, %d (rcsr)\n",submatrix->nr,submatrix->nc,submatrix->nnz,submatrix->roff,submatrix->coff);
2434 for(i=submatrix->roff;RSB_LIKELY(i<submatrix->roff+submatrix->nr);++i)
2435 {
2436 //rsb_nnz_idx_t fel;
2437 // offset of line i in the global line pointers array
2438 rsb_nnz_idx_t nnz0 = IB[i];
2439 // nnz1..nnz0 are the boundaries of line i
2440 rsb_nnz_idx_t nnz1 = IB[i+1];
2441 // check
2442 RSB_C2R_ASSERT(nnz0>=IB[i]);
2443 RSB_C2R_ASSERT(nnz1<=IB[i+1]);
2444 // skip line if empty
2445 if(nnz1-nnz0<1)goto is_empty_subrow;
2446 // find first element of line i also in the submatrix
2447 nnz0 += rsb__nnz_split_coo_bsearch(JA+nnz0,submatrix->coff,nnz1-nnz0);
2448 // skip line if empty in the submatrix
2449 if(nnz1-nnz0<1)goto is_empty_subrow;
2450 // find the length of the subrow i in the submatrix
2451 nnz1 = nnz0+rsb__nnz_split_coo_bsearch(JA+nnz0,submatrix->coff+submatrix->nc,nnz1-nnz0);
2452 //check
2453 RSB_C2R_ASSERT(JA[nnz0+0]>=submatrix->coff);
2454 // skip line if empty in the submatrix
2455 if(nnz1-nnz0<1)goto is_empty_subrow;
2456 // nnz1 .. nnz0 contain nonempty subrow i in the submatrix
2457 // RSB_INFO("i:%d, %d..%d -> %d\n",i,nnz0,nnz1-1,submatrix->nzoff+dnnz);
2458 // checks
2459 RSB_C2R_ASSERT(nnz1<=IB[i+1]);
2460 RSB_C2R_ASSERT(JA[nnz0+0]>=submatrix->coff);
2461 RSB_C2R_ASSERT(JA[nnz1-1]< submatrix->coff+submatrix->nc);
2462 // convert row indices
2463 nll = nnz1-nnz0;
2464 if(i>submatrix->roff)
2465 submatrix->bpntr[i-submatrix->roff] = submatrix->bpntr[i-submatrix->roff-1]+oll;
2466 oll = nll;
2467 // perform the copy
2468 RSB_COA_MEMCPY_SMALL(WA,JA,submatrix->nzoff+dnnz,nnz0,nnz1-nnz0);
2469 // update the actual offset in the destination array
2470 dnnz += nnz1-nnz0;
2471 continue;
2472 is_empty_subrow:
2473 // convert row indices
2474 nll = 0;
2475 if(RSB_LIKELY(i>submatrix->roff))
2476 submatrix->bpntr[i-submatrix->roff] = submatrix->bpntr[i-submatrix->roff-1]+oll;
2477 oll = nll;
2478 }
2479 submatrix->bpntr[submatrix->nr] = submatrix->bpntr[submatrix->nr-1]+oll;
2480 if(dnnz!=submatrix->nnz || submatrix->bpntr[submatrix->nr]!=submatrix->nnz)
2481 {
2482 RSB_ERROR("@%d,%d: found %d, and %d; should have found %d\n",
2483 submatrix->roff, submatrix->coff,
2484 dnnz, submatrix->bpntr[submatrix->nr],submatrix->nnz);
2485 RSB_DO_ERROR_CUMULATE(errval,RSB_ERR_INTERNAL_ERROR);
2486 }
2487 }
2488 else
2489 {
2490 if(RSB_C2R_IF_VERBOSE)
2491 RSB_INFO("COO:%d/%d:%d..%d\n",smi,cmc,submatrix->nzoff,submatrix->nzoff+submatrix->nnz);
2492 oll = 0;
2493 submatrix->bpntr = IA+submatrix->nzoff;
2494 submatrix->bpntr[0] = 0;
2495 for(i=submatrix->roff;RSB_LIKELY(i<submatrix->roff+submatrix->nr);++i)
2496 {
2497 //rsb_nnz_idx_t fel;
2498 // offset of line i in the global line pointers array
2499 rsb_nnz_idx_t nnz0 = IB[i];
2500 // nnz1..nnz0 are the boundaries of line i
2501 rsb_nnz_idx_t nnz1 = IB[i+1];
2502 // check
2503 RSB_C2R_ASSERT(nnz0>=IB[i]);
2504 RSB_C2R_ASSERT(nnz1<=IB[i+1]);
2505 // skip line if empty
2506 if(nnz1-nnz0<1)continue;
2507 // find first element of line i also in the submatrix
2508 nnz0 += rsb__nnz_split_coo_bsearch(JA+nnz0,submatrix->coff,nnz1-nnz0);
2509 // skip line if empty in the submatrix
2510 if(nnz1-nnz0<1)continue;
2511 // find the length of the subrow i in the submatrix
2512 nnz1 = nnz0+rsb__nnz_split_coo_bsearch(JA+nnz0,submatrix->coff+submatrix->nc,nnz1-nnz0);
2513 //check
2514 RSB_C2R_ASSERT(JA[nnz0+0]>=submatrix->coff);
2515 // skip line if empty in the submatrix
2516 if(nnz1-nnz0<1)continue;
2517 // nnz1 .. nnz0 contain nonempty subrow i in the submatrix
2518 // RSB_INFO("i:%d, %d..%d -> %d\n",i,nnz0,nnz1-1,submatrix->nzoff+dnnz);
2519 // checks
2520 RSB_C2R_ASSERT(nnz1<=IB[i+1]);
2521 RSB_C2R_ASSERT(JA[nnz0+0]>=submatrix->coff);
2522 RSB_C2R_ASSERT(JA[nnz1-1]< submatrix->coff+submatrix->nc);
2523 // convert row indices
2524 // perform the copy
2525 RSB_COA_MEMCPY_SMALL(WA,JA,submatrix->nzoff+dnnz,nnz0,nnz1-nnz0);
2526 rsb__util_coo_array_set(IA+submatrix->nzoff+dnnz,nnz1-nnz0,i-submatrix->roff);
2527 // update the actual offset in the destination array
2528 dnnz += nnz1-nnz0;
2529 }
2530 if(dnnz!=submatrix->nnz )
2531 {
2532 RSB_ERROR("@%d,%d: found %d; should have found %d\n",
2533 submatrix->roff, submatrix->coff, dnnz, submatrix->nnz);
2534 RSB_DO_ERROR_CUMULATE(errval,RSB_ERR_INTERNAL_ERROR);
2535 }
2536 }
2537 rsb__util_coo_array_sub(WA+submatrix->nzoff,dnnz,submatrix->coff);
2538 #pragma omp critical (rsb_coo2rsb_nzinc_crs)
2539 {tdnnz += dnnz;}
2540 }
2541 }
2542 #pragma omp barrier
2543 #if RSB_WANT_VERBOSE_TIMINGS
2544 pmt -= rsb_time();
2545 #endif /* RSB_WANT_VERBOSE_TIMINGS */
2546 #if 1
2547 RSB_COA_MEMCPY_parallel(JA,WA,0,0,nnz);
2548 #else
2549 RSB_COA_MEMCPY(JA,WA,0,0,nnz);
2550 #endif
2551
2552 #if RSB_WANT_VERBOSE_TIMINGS
2553 pmt += rsb_time();
2554 #endif /* RSB_WANT_VERBOSE_TIMINGS */
2555
2556 if(RSB_SOME_ERROR(errval))
2557 {
2558 RSB_PERR_GOTO(err,RSB_ERRM_NL);
2559 }
2560
2561 if(tdnnz!=nnz)
2562 {
2563 RSB_ERROR("found %d, should have found %d\n", tdnnz,nnz);
2564 errval = RSB_ERR_INTERNAL_ERROR;
2565 goto err;
2566 }
2567 err:
2568 RSB_DO_ERR_RETURN(errval)
2569 }
2570
struct rsb_mtx_t * rsb__allocate_recursive_sparse_matrix_from_row_major_coo(void *VA, rsb_coo_idx_t * IA, rsb_coo_idx_t * JA, rsb_coo_idx_t m, rsb_coo_idx_t k, rsb_nnz_idx_t nnz, rsb_type_t typecode, const struct rsb_mtx_partitioning_info_t * pinfop, rsb_flags_t flags, rsb_err_t *errvalp)
{
	/**
		\ingroup gr_experimental

		Once finished, should assembly a complete R-CSR/R-CSC matrix in-place in the provided COO arrays.
		Should use no more than ??? temporary memory.

		\param VA,IA,JA caller-provided COO arrays; the assembly is in-place, so on
		       success the returned matrix reuses these arrays (do not free them separately).
		\param m,k,nnz matrix rows, columns, and nonzeroes count.
		\param typecode numerical type of VA's elements.
		\param pinfop partitioning info (TODO: get rid of pinfop).
		\param flags assembly/storage flags; may be augmented with defaults internally.
		\param errvalp if non-NULL, receives the error code.
		\return the assembled matrix, or NULL on error.

		TODO: interleave the matrix structs into the data arrays.
		TODO: missing proper error handling.
		TODO: guaranteed preallocation needed
		TODO: may continue the line of rsb_allocate_new__ and plug it here.
	*/
	rsb_err_t errval = RSB_ERR_NO_ERROR;
	struct rsb_mtx_t * submatrices = NULL;
	struct rsb_mtx_t ** submatricesp = NULL;
	struct rsb_mtx_t * mtxAp = NULL;
	long fcs = rsb__get_first_level_c_size();
	long lcs = rsb__get_lastlevel_c_size();
	long lcspt = rsb__get_lastlevel_c_size_per_thread();
	long cbs = rsb__get_cache_block_byte_size();
#if RSB_WANT_SUBDIVISION_FIXES_20101213
	long wcbs = cbs;
#else /* RSB_WANT_SUBDIVISION_FIXES_20101213 */
	long wcbs = lcspt;
#endif /* RSB_WANT_SUBDIVISION_FIXES_20101213 */
#if RSB_WANT_VERBOSE_TIMINGS
	rsb_time_t pmt = RSB_TIME_ZERO;
#endif /* RSB_WANT_VERBOSE_TIMINGS */
	rsb_coo_idx_t * IB = NULL;
	rsb_coo_idx_t * IT = NULL;
	rsb_coo_idx_t * IX = NULL;
	rsb_coo_idx_t * WA = NULL;
	rsb_submatrix_idx_t tmc = 0, smi = 0; /* max matrix count, done matrix count, submatrix index */
	rsb_submatrix_idx_t cmc = 0, omc = 0; /* closed matrices count, open matrices count */
	rsb_submatrix_idx_t lm = 0; /* leaf matrices */
	rsb_time_t dt = RSB_TIME_ZERO,eit = RSB_TIME_ZERO,est = RSB_TIME_ZERO,ect = RSB_TIME_ZERO,tat = RSB_TIME_ZERO,sat = RSB_TIME_ZERO;
	const rsb_thread_t wet = rsb_get_num_threads(); /* want executing threads */
	size_t el_size = RSB_SIZEOF(typecode);
	rsb_coo_idx_t roff = 0;
	rsb_coo_idx_t coff = 0;

	tat = - rsb_time();
#if RSB_ALLOW_EMPTY_MATRICES
	if(!RSB_HAVE_GOOD_PARMS_FOR_EMPTY(m,k,nnz,flags))
#endif /* RSB_ALLOW_EMPTY_MATRICES */
	if(!RSB_HAVE_GOOD_PARMS_FOR_IN_PLACE_RCSR(m,k,nnz,flags))
	{
		RSB_ERROR(RSB_ERRM_MDNFARTS);
		errval = RSB_ERR_BADARGS;
		goto err;
	}

	if( fcs > lcs || fcs<1 || cbs<1 ) /* we allow declaration of 1 level of cache only */
	{
		/* TODO : find a reasonable solution, and declare it in ./rsbench -I, which should give some diagnostic about this */
		/* fcs/lcs/cbs are declared long: %ld is required here ("%d" was undefined behaviour on LP64). */
		RSB_ERROR("innermost cache size:%ld, outermost cache size:%ld, cache block size %ld\n",fcs,lcs,cbs);
		errval = RSB_ERR_FAILED_MEMHIER_DETECTION;
		goto err;
	}

	if(RSB_DO_FLAG_HAS(flags,RSB_FLAG_WANT_COLUMN_MAJOR_ORDER))
	{
		RSB_ERROR(RSB_ERRM_CMOINIY);
		errval = RSB_ERR_UNIMPLEMENTED_YET;
		goto err;
	}

#if RSB_WANT_DEBUG_VERBOSE_INTERFACE_NOTICE
	if((rsb_global_session_handle.rsb_g_verbose_interface&2))
		RSB_STDOUT("building a matrix with %d nnz, %d x %d\n",nnz,m,k);
#endif /* RSB_WANT_DEBUG_VERBOSE_INTERFACE_NOTICE */

	if(RSB_DO_FLAGS_EXTRACT_STORAGE(flags)==RSB_FLAG_NOFLAGS)
	{
		RSB_DO_FLAG_ADD(flags,RSB_FLAG_DEFAULT_STORAGE_FLAGS);
	}

	/* TODO: plug *here* the eventually to-come RSB_WANT_FASTER_EXPERIMENTAL_CONSTRUCTOR stuff */

	/* upper bound on the number of submatrices the subdivision may produce */
	tmc = RSB_SUBDIVISION_BUG_EXTRA+2*(((nnz+wcbs)*(RSB_SIZEOF(typecode)+2*sizeof(rsb_coo_idx_t)))/(wcbs)); /* TODO: clean this up */
	tmc = RSB_MAX(1,(rsb_submatrix_idx_t)(rsb_global_session_handle.subdivision_multiplier*tmc));

	if(RSB_DO_FLAG_HAS(flags,RSB_FLAG_RECURSIVE_MORE_LEAVES_THAN_THREADS))
	if(wet>tmc)
	{
		tmc = wet;
	}

	submatrices = rsb__calloc(sizeof(struct rsb_mtx_t )*tmc);
	submatricesp = rsb__calloc(sizeof(struct rsb_mtx_t*)*tmc);

	if(!submatrices || !submatricesp)
	{
		errval = RSB_ERR_ENOMEM;
		RSB_PERR_GOTO(err,RSB_ERRM_ES);
	}

	for(smi=0;smi<tmc;++smi)
		submatricesp[smi] = submatrices + smi;

	if(RSB_DO_FLAG_HAS(flags,RSB_FLAG_FORTRAN_INDICES_INTERFACE)) /* TODO: this is *slow*, speed this up */
	{
		rsb__util_coo_array_from_fortran_indices(IA,nnz,RSB_BOOL_TRUE);
		rsb__util_coo_array_from_fortran_indices(JA,nnz,RSB_BOOL_TRUE);
		RSB_DO_FLAG_DEL(flags,RSB_FLAG_FORTRAN_INDICES_INTERFACE);
	}

	dt = - rsb_time();
	if((errval = rsb__do_cleanup_nnz(VA,IA,JA,nnz,roff,coff,m,k,&nnz,typecode,flags))!=RSB_ERR_NO_ERROR)
		goto err;

	ect = dt;
	dt = - rsb_time();
	ect -= dt; /* ect = cleanup time only; sorting time is accounted separately in est */

	if(!RSB_DO_FLAG_HAS(flags,RSB_FLAG_SORTED_INPUT))
	if((errval = rsb__util_sort_row_major_inner(VA,IA,JA,nnz,m,k,typecode,flags))!=RSB_ERR_NO_ERROR)
	{
		RSB_ERROR(RSB_ERRM_ES);
		goto err;
	}
	RSB_DO_FLAG_ADD(flags,RSB_FLAG_SORTED_INPUT); /* TODO: is this needed ? */

	/* we need duplicates removal, and this can only take place after sorting */
#if RSB_WANT_VERBOSE_TIMINGS
	/* NOTE(review): the original called rsb_weed_out_duplicated (sic) here; unified with the
	 * rsb_weed_out_duplicates name used in the non-verbose branch below. */
	{rsb_nnz_idx_t dnnz = nnz - rsb_weed_out_duplicates(IA,JA,VA,nnz,typecode,flags);
	RSB_INFO("duplicate removal: %zu - %zu = %zu\n",(size_t)nnz,(size_t)dnnz,(size_t)(nnz-dnnz));
	nnz -= dnnz;}
#else /* RSB_WANT_VERBOSE_TIMINGS */
	nnz = rsb_weed_out_duplicates(IA,JA,VA,nnz,typecode,flags);
#endif /* RSB_WANT_VERBOSE_TIMINGS */
	dt += rsb_time();

	est = dt; /* with 'sorting' (est) we DO NOT intend also cleanup (in ect) */

	/* work vectors allocation */
	/* IL = rsb__malloc(sizeof(rsb_coo_idx_t)*(m+1)); */
	IT = rsb__malloc(sizeof(rsb_coo_idx_t)*(m+1));
	IX = rsb__malloc(sizeof(rsb_coo_idx_t)*2*(m+1));
	IB = rsb__malloc(sizeof(rsb_coo_idx_t)*(m+1));
	if(/* !IL ||*/ !IT || !IX || !IB)
	{
		RSB_ERROR(RSB_ERRM_ES);
		errval = RSB_ERR_ENOMEM; goto err;
	}

	/* declaring this first matrix (smi == 0) as 'open' */
	smi = 0; omc = 1;
	/* compile in the first mtxAp, linking into to the temporary split vector */
	submatrices[smi].nzoff = 0;
	submatrices[smi].roff = roff;
	submatrices[smi].coff = coff;
	submatrices[smi].bindx = IB;
	submatrices[smi].bpntr = IB+1;
	submatrices[smi].indptr = NULL;
	/* RSB_DO_FLAG_ADD(flags,RSB_FLAG_QUAD_PARTITIONING); */
	RSB_DO_FLAG_ADD(flags,RSB_FLAG_ASSEMBLED_IN_COO_ARRAYS);
	if( (errval = rsb__set_init_flags_and_stuff(
		&submatrices[smi],NULL,NULL,m,k,nnz,nnz,nnz,typecode,flags)
		)!=RSB_ERR_NO_ERROR)
	{
		RSB_ERROR(RSB_ERRM_ES);
		goto err;
	}

	if(nnz==0)
	{
		/* a special case. we copy the arrays addresses because they may be non-NULL and containing duplicate/diagonal/etc.. elements we have honoured to free, afterwards. */
		++cmc; --omc;
		mtxAp = &submatrices[0];
		RSB_DO_FLAG_DEL(mtxAp->flags,RSB_FLAG_QUAD_PARTITIONING); /* necessary, too */
		mtxAp->bpntr = IA;
		mtxAp->bindx = JA;
		mtxAp->indptr = NULL;
		mtxAp->VA = VA;
		goto arrays_done;
	}
	else
		mtxAp = &submatrices[0];

	sat = - rsb_time();
	/* computing the first right-left pointer vectors */
#if RSB_WANT_PARALLEL_SUBDIVISION
	if((errval = rsb_do_compute_vertical_split_parallel(IA,JA,roff,coff,m,k,0,0,nnz,IB,NULL,NULL,NULL,NULL,NULL,NULL))!=RSB_ERR_NO_ERROR)
#else /* RSB_WANT_PARALLEL_SUBDIVISION */
	if((errval = rsb_do_compute_vertical_split_parallel(IA,JA,roff,coff,m,k,0,0,nnz,IB,NULL,NULL,NULL,NULL,NULL,NULL))!=RSB_ERR_NO_ERROR)
	/*if((errval = rsb_do_compute_vertical_split(IA,JA,roff,coff,m,k,0,0,nnz,IB,NULL,NULL,NULL,NULL,NULL,NULL))!=RSB_ERR_NO_ERROR) */
#endif /* RSB_WANT_PARALLEL_SUBDIVISION */
	{
		RSB_ERROR(RSB_ERRM_ES);
		goto err;
	}

	if(RSB_C2R_IF_VERBOSE)
	RSB_INFO("beginning (%d x %d) @ %p with flags 0x%x (coo:%d, csr:%d), storage: 0x%x, max %d submatrices\n",
		submatrices[smi].nr, submatrices[smi].nc, (const void*)&submatrices[smi], submatrices[smi].flags,
		RSB_DO_FLAG_HAS(submatrices[smi].flags,RSB_FLAG_WANT_COO_STORAGE),
		RSB_DO_FLAG_HAS(submatrices[smi].flags,RSB_FLAG_WANT_BCSS_STORAGE),
		submatrices[smi].matrix_storage,tmc
		);

	/* if(!RSB_WANT_MORE_PARALLELISM || (RSB_DO_FLAG_HAS(mtxAp->flags,RSB_FLAG_QUAD_PARTITIONING))) */ /* TODO */
#if 1 /* the code is not yet ready for this */
/* #if RSB_WANT_OMP_RECURSIVE_KERNELS */
	errval = rsb_do_coo2rec_subdivide_parallel(VA,IA,JA,m,k,nnz,typecode,pinfop,flags,errvalp,submatricesp,mtxAp,IB,IX,IT,WA,cmc,omc,tmc,RSB_MAX(1,RSB_MIN(wet,nnz)),&cmc);
#else
	errval = rsb_do_coo2rec_subdivide(VA,IA,JA,m,k,nnz,typecode,pinfop,flags,errvalp,submatricesp,mtxAp,IB,IX,IT,WA,cmc,omc,tmc,wet,&cmc);
#endif
	RSB_CONDITIONAL_FREE(IX);
	if(RSB_SOME_ERROR(errval))
		goto err;

	eit = - rsb_time();
	sat -= eit;

	/*
	RSB_CONDITIONAL_FREE(IL);
	RSB_CONDITIONAL_FREE(IT);
	*/
	/* WA will is needed for shuffle only (so, after IL,IM deallocation, in a way that total memory need is max(WA,IL)) */
	WA = rsb__malloc(RSB_MAX(sizeof(rsb_coo_idx_t),el_size)*nnz);
	if(!WA)
	{
		RSB_ERROR(RSB_ERRM_FAOTAFS);
		errval = RSB_ERR_ENOMEM; goto err;
	}

#if 0
	/* after symbolic partitioning is done, we are ready to shuffle all of the arrays using the temporary storage and add an intermediate node */
	RSB_INFO("assembling leaf %d -> %d x %d, %d\n",smi,submatricesp[smi]->nr,submatricesp[smi]->nc,submatricesp[smi]->nnz);
	RSB_DO_ERROR_CUMULATE(errval,rsb_do_shuffle_left_and_right_rows(VA,IA,JA,m,0,nnz,0,typecode,IL,IM,WA));
	RSB_DO_ERROR_CUMULATE(errval,rsb_do_shuffle_left_and_right_rows(VA,IA,JA,(m+1)/2,0,IL[(m+1)/2],0,typecode,IL,IR,WA));
	RSB_DO_ERROR_CUMULATE(errval,rsb_do_shuffle_left_and_right_rows(VA,IA,JA,m,(m+1)/2,nnz,IL[(m+1)/2],typecode,IL,IR,WA));
	/* TODO : should use a temporary vector, here. */
#endif

	/* count terminal (leaf) submatrices */
	for(smi=0;smi<cmc;++smi)
		if(rsb__is_terminal_recursive_matrix(submatricesp[smi]))
			++lm;

	/* qsort(submatricesp+(cmc-lm),(size_t)(lm),sizeof(struct rsb_mtx_t*),&rsb_compar_rcsr_matrix_leftmost_first); */
	qsort(submatricesp,(size_t)(cmc),sizeof(struct rsb_mtx_t*),&rsb_compar_rcsr_matrix_leftmost_first);
	/* TODO: a priority queue would do the job, here */
	/* sanity check: after sorting, non-leaves must precede leaves */
	for(smi=0;smi<cmc-lm;++smi)
		if(rsb__is_terminal_recursive_matrix(submatricesp[smi]))
		{
			errval = RSB_ERR_INTERNAL_ERROR;
			RSB_PERR_GOTO(err,RSB_ERRM_ANLSMIT);
		}
	for(smi=cmc-lm;smi<cmc;++smi)
		if(!rsb__is_terminal_recursive_matrix(submatricesp[smi]))
		{
			errval = RSB_ERR_INTERNAL_ERROR;
			RSB_PERR_GOTO(err,RSB_ERRM_ALSMINT);
		}

	errval = rsb_do_coo2rec_shuffle(VA,IA,JA,m,k,nnz,typecode,pinfop,flags,errvalp,submatricesp,mtxAp,IB,WA,cmc);
	if(RSB_SOME_ERROR(errval))
	{
		RSB_PERR_GOTO(err,RSB_ERRM_SEOWS);
	}

	rsb__do_set_in_place_submatrices_offsets(submatrices,cmc,VA,IA,JA,el_size);

	/* RSB_INFO("VA:%p, IA:%p, JA:%p\n",VA,IA,JA); */

	if(RSB_C2R_IF_VERBOSE)
	RSB_INFO("IA? :%p / %p\n",(const void*)IA,
		(const void*)(rsb__do_get_first_submatrix(mtxAp)->bpntr-
		(rsb__do_get_first_submatrix(mtxAp)->nr+1))
		/* rsb__do_get_first_submatrix(mtxAp)->roff-
		((submatricesp[0])->nr+1) */
		);

	/* after shuffling, the last vectors conversion happens and we are done. */
arrays_done:
	eit += rsb_time();

	/* first matrix is always root (even if a CSR one) */
	RSB_DO_FLAG_DEL(submatrices[0].flags,RSB_FLAG_NON_ROOT_MATRIX);
#if RSB_EXPERIMENTAL_SHOULD_TRAVERSE_RECURSIVE_MATRICES_AS_BLOCKS
	if(!(submatrices[0].flags & RSB_FLAG_NON_ROOT_MATRIX))
	{
		submatrices[0].all_leaf_matrices = NULL;
		errval = rsb__get_array_of_leaf_matrices(&submatrices[0],&(submatrices[0].all_leaf_matrices),&submatrices[0].all_leaf_matrices_n);
		if(RSB_SOME_ERROR(errval))
			goto err;
	}
	else
	{
		/* this is a non root matrix */
		submatrices[0].all_leaf_matrices = NULL;
		submatrices[0].all_leaf_matrices_n = 0;
	}
#endif /* RSB_EXPERIMENTAL_SHOULD_TRAVERSE_RECURSIVE_MATRICES_AS_BLOCKS */

#if RSB_WANT_VERBOSE_TIMINGS
	rsb_time_t hct = - rsb_time();
#endif /* RSB_WANT_VERBOSE_TIMINGS */
	if(
		/* RSB_DO_FLAG_HAS(flags,RSB_FLAG_USE_HALFWORD_INDICES_COO)
		|| RSB_DO_FLAG_HAS(flags,RSB_FLAG_USE_HALFWORD_INDICES_CSR)
		|| RSB_DO_FLAG_HAS(flags,RSB_FLAG_WANT_COO_STORAGE)*/
		RSB_DO_FLAG_HAS(flags,RSB_FLAG_USE_HALFWORD_INDICES)
		)
#if RSB_WANT_MORE_PARALLELISM
		RSB_DO_ERROR_CUMULATE(errval,rsb_do_switch_fresh_recursive_matrix_to_halfword_storages_parallel(mtxAp));
#else /* RSB_WANT_MORE_PARALLELISM */
		RSB_DO_ERROR_CUMULATE(errval,rsb_do_switch_fresh_recursive_matrix_to_halfword_storages(mtxAp));
#endif /* RSB_WANT_MORE_PARALLELISM */
	else
	{
		if(RSB_C2R_IF_VERBOSE)
			RSB_INFO("no RSB_FLAG_USE_HALFWORD_INDICES flag\n");
	}
	RSB_DO_ERROR_CUMULATE(errval,rsb_do_compute_bounded_boxes(mtxAp));

#if RSB_WANT_VERBOSE_TIMINGS
	hct += rsb_time();
#endif /* RSB_WANT_VERBOSE_TIMINGS */

	if(RSB_SOME_ERROR(errval))
	{
		RSB_PERR_GOTO(err,RSB_ERRM_ES);
	}
	tat += rsb_time();

	mtxAp->sat = sat;
	mtxAp->ect = ect;
	mtxAp->eit = eit;
	mtxAp->est = est;
	mtxAp->pet = RSB_TIME_ZERO;
	mtxAp->rpt = RSB_TIME_ZERO;
	mtxAp->tat = tat;

	/* some statistics */
#if RSB_WANT_VERBOSE_TIMINGS
	RSB_INFO("analyzed arrays in %g s\n",sat);
	RSB_INFO("cleaned-up arrays in %g s\n",ect);
	RSB_INFO("sorted arrays in %g s\n",est);
	RSB_INFO("computed partitions in %g s\n",mtxAp->cpt);
	RSB_INFO("shuffled partitions in %g s\n",eit);
	RSB_INFO(" (of which %g s for parallel memcpy and %g for halfword conversion)\n",pmt,hct);
#endif /* RSB_WANT_VERBOSE_TIMINGS */
	mtxAp->cpt = 0;/* cpt is contained in sat, so should not be counted here! */
#if RSB_STORE_IDXSA
	mtxAp->idxsa = rsb__get_index_storage_amount(mtxAp);
#endif

#if 0
	RSB_INFO("got %d matrices (%d leaves)\n",cmc,lm);

	if( !rsb__mtx_chk(mtxAp) )
	{
		errval = RSB_ERR_INTERNAL_ERROR;
		RSB_PERR_GOTO(derr,RSB_ERRM_NL);
	}

	errval = rsb__do_switch_recursive_matrix_to_fullword_storage(mtxAp);

	errval = rsb__do_switch_recursive_in_place_matrix_to_in_place_rcoo(mtxAp);
ok:
#endif
	goto noerr;

	/* TODO: missing proper error handling here ! --- destroy any allocated data for the matrix (e.g.: submatrices, .. ) */
	/* NOTE(review): the two lines below are unreachable (no label precedes them); kept as a
	 * placeholder for the proper error-path cleanup requested by the TODO above. */
	if(mtxAp)
		rsb__do_mtx_free(mtxAp);
err:
	mtxAp = NULL;
noerr:
	if(RSB_SOME_ERROR(errval))
		rsb__do_perror(NULL,errval);
	RSB_CONDITIONAL_FREE(IB);
	RSB_CONDITIONAL_FREE(IT);
	RSB_CONDITIONAL_FREE(IX);
	/* RSB_CONDITIONAL_FREE(IL); */
	RSB_CONDITIONAL_FREE(WA);
	RSB_CONDITIONAL_FREE(submatricesp);
	if(!mtxAp)RSB_CONDITIONAL_FREE(submatrices);
	RSB_CONDITIONAL_ERRPSET(errvalp,errval);
#if RSB_WANT_DEBUG_VERBOSE_INTERFACE_NOTICE
	if(mtxAp && (rsb_global_session_handle.rsb_g_verbose_interface&2))
		RSB_STDOUT_MATRIX_SUMMARY(mtxAp),RSB_STDOUT("\n");
#endif /* RSB_WANT_DEBUG_VERBOSE_INTERFACE_NOTICE */
	return mtxAp;
}
2958
rsb__init_coo_struct_from_rsb(const struct rsb_mtx_t * mtxAp,struct rsb_coo_matrix_t * coop)2959 rsb_err_t rsb__init_coo_struct_from_rsb(const struct rsb_mtx_t *mtxAp, struct rsb_coo_matrix_t *coop)
2960 {
2961 /* FIXME: unfinished; shall replace by direct use of RSB_INIT_COO_FROM_MTX, or the opposite */
2962 if(!mtxAp || !coop)
2963 {
2964 return RSB_ERR_BADARGS;
2965 }
2966
2967 RSB_INIT_COO_FROM_MTX(coop,mtxAp);
2968 return RSB_ERR_NO_ERROR;
2969 }
2970
rsb__project_rsb_to_coo(struct rsb_mtx_t * mtxAp,struct rsb_coo_matrix_t * coop)2971 rsb_err_t rsb__project_rsb_to_coo(struct rsb_mtx_t *mtxAp, struct rsb_coo_matrix_t *coop)
2972 {
2973 rsb_err_t errval = RSB_ERR_NO_ERROR;
2974 if(!mtxAp || !coop)
2975 {
2976 errval = RSB_ERR_BADARGS;
2977 RSB_PERR_GOTO(err,RSB_ERRM_ES);
2978 }
2979 errval = rsb__init_coo_struct_from_rsb(mtxAp,coop);
2980 if(RSB_SOME_ERROR(errval))
2981 {
2982 RSB_PERR_GOTO(err,RSB_ERRM_ES);
2983 }
2984
2985 RSB_BIND_COO_TO_MTX(coop,mtxAp);
2986 err:
2987 return errval;
2988 }
2989
2990 /* @endcond */
2991