/*

Copyright (C) 2008-2021 Michele Martone

This file is part of librsb.

librsb is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published
by the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.

librsb is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public
License along with librsb; see the file COPYING.
If not, see <http://www.gnu.org/licenses/>.

*/
/* @cond INNERDOC  */
/*!
 * @file
 * @author Michele Martone
 * @brief
 * This source file contains parallel sorting functions.
 * */

#include "rsb_common.h"

#define RSB_DO_WANT_PSORT_VERBOSE 0	/* set this to >0 to print parallel sort statistics */
#define RSB_DO_WANT_PSORT_TIMING (RSB_DO_WANT_PSORT_VERBOSE+0) 	/* set this to 1 to print some statistics */
#define RSB_DO_WANT_PSORT_FASTER_BUT_RISKY 0	/* FIXME: STILL UNFINISHED  */
#define RSB_WANT_SORT_PARALLEL_BUT_SLOW 1

RSB_INTERNALS_COMMON_HEAD_DECLS
rsb_err_t rsb__util_sort_row_major_parallel(void *VA, rsb_coo_idx_t * IA, rsb_coo_idx_t * JA, rsb_nnz_idx_t nnz, rsb_coo_idx_t m, rsb_coo_idx_t k,  rsb_type_t typecode, rsb_flags_t flags)
{
	/**
	 * \ingroup gr_util
	 * TODO: the technique should be documented somewhere: this is a mixed counting sort + merge sort.
	*/
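
	/*
	 * In outline (as implemented below):
	 *  1) the nnz coordinate triples are split into tc chunks of at most cnnz
	 *     elements each; the chunks are sorted independently (and in parallel)
	 *     by rsb__util_sort_row_major_buffered, each using its own slice of the
	 *     index workspace IW;
	 *  2) adjacent sorted runs are then merged pairwise and in place by
	 *     rsb__do_util_merge_sorted_subarrays_in_place, the run length doubling
	 *     at every pass, until a single sorted run remains.
	 * The buffer W (at most tcs bytes) provides per-thread scratch space for the merges.
	 */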

	rsb_err_t errval = RSB_ERR_NO_ERROR;
	int cfi;
	size_t el_size;
	float cfa[] = {2}; // will use threads count as a reference
//	float cfa[] = {1}; // will use cache size as a reference
//	float cfa[] = {-1};  // will use as much memory as possible
	const long wet = rsb_get_num_threads(); /* want executing threads */

	if(nnz<RSB_MIN_THREAD_SORT_NNZ*wet)
	{
		// FIXME: very small matrices (e.g. the 2x2 ones from `make tests`) are known not to be handled here;
		// the fallback below covers them, but this deserves a cleaner solution.
		return rsb__util_sort_row_major_inner(VA,IA,JA,nnz,m,k,typecode,flags);
	}

	if(!IA || !JA || !VA)
	{
		errval = RSB_ERR_BADARGS;
		RSB_PERR_GOTO(err,RSB_ERRM_ES);
	}

	el_size = RSB_SIZEOF(typecode);

	for(cfi=0;cfi<sizeof(cfa)/sizeof(float);++cfi)
	{
		void *W = NULL;
		rsb_char_t *IW = NULL;
		int ti;
		long cs,bs,tc,ns,tcs;
		rsb_nnz_idx_t cnnz = 0;
		size_t bnnz = 0,fsm = rsb__sys_free_system_memory();
		size_t wb = 0;

#if RSB_DO_WANT_PSORT_TIMING
		rsb_time_t dt,st,mt,tt;
#endif /* RSB_DO_WANT_PSORT_TIMING */
		// compute how many bytes are necessary for an element
		ns=2*sizeof(rsb_coo_idx_t)+el_size;
		// compute how many bytes are necessary for the whole processing
		bs=nnz*ns;
		if(cfa[cfi]>1)
			tcs=bs+(wet*ns);
		else if(cfa[cfi]>0)
			tcs = rsb__get_lastlevel_c_size()*cfa[cfi]*wet;/* FIXME: '*wet' is a hack just for benchmark-related issues */
		else
			tcs=fsm/2; /* could be 0 */
		if(tcs<1)
			tcs=bs;	/* could happen, due to an interfacing problem */
		else
		if(fsm>0)
		{
			tcs = RSB_MIN(fsm,tcs);
		}

		// prepare a buffer
		W = rsb__malloc(tcs);
		if(!W)
		{
			errval = RSB_ERR_ENOMEM;
			RSB_PERR_GOTO(erri,RSB_ERRM_ES)
		}
		cs=tcs/wet;
		//RSB_INFO("cache is %d bytes\n",cs);

		// compute the nnz fitting in the buffer
		cnnz=cs/ns;
		// compute the total count of necessary chunks (partitions)
		//tc=(bs+cs-1)/cs;
		tc=(nnz+(cnnz-1))/(cnnz);

		wb = RSB_DO_REQUIRE_BYTES_FOR_INDEX_BASED_SORT(cnnz,m,k,1,1);
		IW = rsb__malloc(wb*wet);
		if(!IW)
		{
			errval = RSB_ERR_ENOMEM;
			RSB_PERR_GOTO(erri,RSB_ERRM_ES);
		}

		//RSB_INFO("there are %d nnz (%d bytes), %d times the cache (%z bytes), %d nnz per cache\n",nnz,bs,tc,cs,cnnz);
#if RSB_DO_WANT_PSORT_TIMING
		dt = rsb_time();
		st=-dt;
#endif /* RSB_DO_WANT_PSORT_TIMING */
		// this phase is potentially parallel, and is the slower one.
		// NOTE: before parallelizing further, one should avoid allocations during the sort, or serialize them in some way!
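		// Each iteration ti sorts the chunk of nonzeroes starting at offset ti*cnnz
		// (the last chunk taking the remainder) and uses its own wb-byte region of IW,
		// so the iterations are independent of each other.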
		#pragma omp parallel for schedule(static,1) shared(IA,JA,VA)   RSB_NTC
		for(ti=0;ti<tc;++ti)
		{
			size_t fnnz=ti*cnnz;
			rsb_nnz_idx_t bnnz=(ti==tc-1)?(nnz-fnnz):cnnz;
//			RSB_INFO("s:%d..%d (bnnz=%d)\n",fnnz,fnnz+bnnz-1,bnnz);
			rsb__util_sort_row_major_buffered(((rsb_byte_t*)VA)+el_size*fnnz,IA+fnnz,JA+fnnz,bnnz,m,k,typecode,flags,IW+wb*ti,wb);
			RSB_PS_ASSERT(!rsb__util_is_sorted_coo_as_row_major(VA+fnnz,IA+fnnz,JA+fnnz,bnnz,typecode,NULL,flags));
		}

#if RSB_DO_WANT_PSORT_TIMING
		dt = rsb_time();
		st+=dt;
#endif /* RSB_DO_WANT_PSORT_TIMING */
		// this phase is potentially parallel, too
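		// Bottom-up merge: pairs of adjacent sorted runs of bnnz nonzeroes are merged
		// in place (and in parallel) into runs of 2*bnnz, with bnnz doubling at every
		// outer iteration; the trailing run, if shorter, is merged with length lnnz.
		// Each merge uses a cs-byte per-thread slice of W as scratch space.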
		for(bnnz=cnnz;bnnz<nnz;)
		{
		//	size_t fnnz;
			int fi, fn = ((nnz-bnnz)+(2*bnnz-1))/(2*bnnz);
			#pragma omp parallel for schedule(static,1) shared(IA,JA,VA)   RSB_NTC
//			for(fnnz=0;fnnz<nnz-bnnz;fnnz+=2*bnnz)
			for(fi=0;fi<fn;++fi)
			{
#if RSB_WANT_OMP_RECURSIVE_KERNELS
				rsb_char_t * lW=((rsb_char_t*)W)+cs*omp_get_thread_num();
#else /* RSB_WANT_OMP_RECURSIVE_KERNELS */
				rsb_char_t * lW=((rsb_char_t*)W)+cs*0;
#endif /* RSB_WANT_OMP_RECURSIVE_KERNELS */
				size_t fnnz=fi*2*bnnz;
				size_t lnnz=(fnnz+2*bnnz>nnz)?(nnz-(fnnz+bnnz)):bnnz;
				void *fVA=((rsb_char_t*)VA)+el_size*fnnz;
				void *fIA=IA+fnnz;
				void *fJA=JA+fnnz;
#if RSB_PS_ASSERT
				void *bVA=((rsb_char_t*)VA)+el_size*(fnnz+bnnz);
				void *bIA=IA+fnnz+bnnz;
				void *bJA=JA+fnnz+bnnz;
#endif /* RSB_PS_ASSERT */
				//RSB_INFO("m:%d..%d %d..%d (%d) (bnnz=%d) (lnnz=%d)\n",fnnz,fnnz+bnnz-1,fnnz+bnnz,fnnz+bnnz+lnnz-1,cs,bnnz,lnnz);
//				RSB_INFO("sentinel:%x %d %d\n",IA+fnnz+bnnz+lnnz,IA[fnnz+bnnz+lnnz],JA[fnnz+bnnz+lnnz]);
				RSB_PS_ASSERT(!rsb__util_is_sorted_coo_as_row_major(fVA,fIA,fJA,bnnz,typecode,NULL,flags));
				RSB_PS_ASSERT(!rsb__util_is_sorted_coo_as_row_major(bVA,bIA,bJA,lnnz,typecode,NULL,flags));
				rsb__do_util_merge_sorted_subarrays_in_place(fVA,fIA,fJA,lW,bnnz,lnnz,cs,flags,typecode);
//				RSB_INFO("sentinel:%x %d %d\n",IA+fnnz+bnnz+lnnz,IA[fnnz+bnnz+lnnz],JA[fnnz+bnnz+lnnz]);
				RSB_PS_ASSERT(!rsb__util_is_sorted_coo_as_row_major(fVA,fIA,fJA,bnnz,typecode,NULL,flags));
				RSB_PS_ASSERT(!rsb__util_is_sorted_coo_as_row_major(fVA,fIA,fJA,bnnz+lnnz,typecode,NULL,flags));
			}
			#pragma omp barrier
			bnnz *= 2;
		}
#if RSB_DO_WANT_PSORT_TIMING
		mt = - dt;
		dt = rsb_time();
		mt += dt;
		tt = mt + st;
		RSB_INFO("using %zd partitions, (sort=%.5lg+merge=%.5lg)=%.5lg, on %ld threads\n",(size_t)tc,st,mt,tt,wet);
#endif /* RSB_DO_WANT_PSORT_TIMING */

//		assert(!rsb__util_is_sorted_coo_as_row_major(VA,IA,JA,nnz,typecode,NULL,flags));
//		if(rsb__util_is_sorted_coo_as_row_major(VA,IA,JA,nnz,typecode,NULL,flags))
//			RSB_PERR_GOTO(err,RSB_ERRM_EM);

erri:
		RSB_CONDITIONAL_FREE(W);
		RSB_CONDITIONAL_FREE(IW);
	}
err:
	RSB_DO_ERR_RETURN(errval)
}

rsb_err_t rsb__util_sort_row_major_inner(void * RSB_RESTRICT VA, rsb_coo_idx_t * RSB_RESTRICT IA, rsb_coo_idx_t * RSB_RESTRICT JA, const rsb_nnz_idx_t nnz, const rsb_coo_idx_t m, const rsb_coo_idx_t k, const  rsb_type_t typecode , const rsb_flags_t flags /*, void * WA, size_t wb */)
{
	rsb_err_t errval = RSB_ERR_NO_ERROR;
#if RSB_WANT_SORT_PARALLEL_BUT_SLOW
		if( rsb_global_session_handle.asm_sort_method > 0 )
		/* parallel and scalable, but slow */
			errval = rsb__util_sort_row_major_parallel(VA,IA,JA,nnz,m,k,typecode,flags);
		else
#else /* RSB_WANT_SORT_PARALLEL_BUT_SLOW */
#endif  /* RSB_WANT_SORT_PARALLEL_BUT_SLOW */
			/* not as parallel or scalable, but fast */
			errval = rsb_util_sort_row_major_bucket_based_parallel(VA,IA,JA,nnz,m,k,typecode,flags);
		return errval;
}

rsb_err_t rsb_util_sort_row_major_bucket_based_parallel(void * RSB_RESTRICT VA, rsb_coo_idx_t * RSB_RESTRICT IA, rsb_coo_idx_t * RSB_RESTRICT JA, const rsb_nnz_idx_t nnz, const rsb_coo_idx_t m, const rsb_coo_idx_t k, const  rsb_type_t typecode , const rsb_flags_t flags /*, void * WA, size_t wb */)
{
	/**
		\ingroup gr_internals
		FIXME: EXPERIMENTAL, DOCUMENT ME
		FIXME: must tolerate duplicates, and consequently e.g. m*k<nnz (such input does occur, e.g. as the result of summing sparse matrices).
	*/
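
	/*
	 * In outline (as implemented below):
	 *  1) count the nonzeroes of each row into PA (a histogram), then prefix-sum PA
	 *     so that PA[i] holds the count of nonzeroes before row i;
	 *  2) scatter the (VA,IA,JA) triples row by row into the work arrays (vWA,iWA,jWA);
	 *  3) sort each row's entries by column index, in parallel across rows;
	 *  4) copy the sorted arrays back over (VA,IA,JA).
	 */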
	int psc = RSB_PSORT_CHUNK;
	rsb_err_t errval = RSB_ERR_NO_ERROR;
//	rsb_nnz_idx_t frnz = 0;
	rsb_nnz_idx_t mnzpr = 0;
	rsb_nnz_idx_t *PA = NULL;
	void *WA = NULL;
	rsb_coo_idx_t *iWA = NULL;
	rsb_coo_idx_t *jWA = NULL;
	rsb_coo_idx_t *nWA = NULL;
	void *vWA = NULL;
	rsb_nnz_idx_t n = 0;
	size_t el_size = RSB_SIZEOF(typecode);
	//struct rsb_mtx_partitioning_info_t pinfop;
	/* const long wet = rsb_get_num_threads();*/ /* want executing threads */
	const long wet = rsb__set_num_threads(RSB_THREADS_GET_MAX_SYS); /* want executing threads; FIXME: it seems there is a severe bug with the definition of RSB_NTC */
#if RSB_DO_WANT_PSORT_TIMING
	rsb_time_t dt,pt,st,mt,ct;
	rsb_time_t tt = - rsb_time();
#endif /* RSB_DO_WANT_PSORT_TIMING */
	rsb_int_t ei = RSB_DO_FLAG_HAS(flags,RSB_FLAG_FORTRAN_INDICES_INTERFACE) ? 1 : 0;

	if(wet==0)
	{
		errval = RSB_ERR_GENERIC_ERROR;
		RSB_PERR_GOTO(ret,"%s\n","No threads detected! Are you sure you have initialized the library?\n"); // RSB_ERRM_FLI
	}

	if(RSB_MATRIX_UNSUPPORTED_TYPE(typecode))
	{
		errval = RSB_ERR_UNSUPPORTED_TYPE;
		RSB_PERR_GOTO(ret,"\n");
	}

	if(nnz<2)
		goto ret;

	if(RSB_MUL_OVERFLOW(sizeof(rsb_nnz_idx_t),(m+2),size_t,rsb_non_overflowing_t)
	|| RSB_MUL_OVERFLOW(sizeof(rsb_nnz_idx_t)*2+el_size,(nnz),size_t,rsb_non_overflowing_t))
	{
		errval = RSB_ERR_LIMITS;
		RSB_PERR_GOTO(err,"sorry, allocating that much memory would cause overflows\n");
	}
	PA = rsb__calloc(sizeof(rsb_nnz_idx_t)*(m+2));
//	WA = rsb__calloc(RSB_MAX(sizeof(rsb_coo_idx_t),el_size)*(nnz+1));
//	WA = rsb__calloc((2+3*sizeof(rsb_coo_idx_t)+el_size)*nnz);
//	WA = rsb__calloc((2*sizeof(rsb_coo_idx_t)+el_size)*nnz);
	WA = rsb__calloc_parallel((sizeof(rsb_coo_idx_t)*2+el_size)*nnz); // NEW 20101201

	if(!PA || !WA)
	{
		errval = RSB_ERR_ENOMEM;
		RSB_PERR_GOTO(err,"after calloc, pa=%p, wa=%p\n",PA,WA);
	}
	iWA=((rsb_coo_idx_t*) WA);
	jWA=((rsb_coo_idx_t*)iWA)+nnz;
	vWA=((rsb_coo_idx_t*)jWA)+nnz;
//	nWA=((rsb_char_t*)vWA)+(el_size*nnz);

	/* saving one head element with a trick */
	++PA;
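	/* The head element just skipped absorbs the pointer shift caused by the scatter
	 * below: rsb_util_do_scatter_rows() advances each row pointer past its own row,
	 * so stepping the base pointer back afterwards (--PA) restores a valid row
	 * pointer array (PA[n]..PA[n+1] delimiting row n) with no extra pass. */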
#if RSB_DO_WANT_PSORT_TIMING
	dt = rsb_time();
	mt=-dt;
#endif /* RSB_DO_WANT_PSORT_TIMING */

#if RSB_DO_WANT_PSORT_FASTER_BUT_RISKY
# if 1
	/* FIXME: unfinished and incorrect code */
	#pragma omp parallel for schedule(static,psc) shared(PA,IA) RSB_NTC
	for(n=0;n<nnz;++n)
		PA[IA[n]+1-ei]++;
# else
	/* actually, this code is VERY SLOW :) */
	#pragma omp parallel reduction(|:errval) shared(PA,IA)
	{
		rsb_nnz_idx_t n;
		rsb_thr_t th_id = omp_get_thread_num();
		rsb_thr_t tn = omp_get_num_threads();

		for(n=0;RSB_LIKELY(n<nnz);++n)
			if(IA[n]%tn==th_id)
				PA[IA[n]+1-ei]++;
	}
	#pragma omp barrier
#endif
#else /* RSB_DO_WANT_PSORT_FASTER_BUT_RISKY */
	/* setting PA[i] to contain the count of elements on row i */
	for(n=0;RSB_LIKELY(n<nnz);++n)
	{
		RSB_ASSERT(IA[n]>=0);
		RSB_ASSERT(IA[n]<=m);
		PA[IA[n]+1-ei]++;
#if RSB_DO_WANT_PSORT_VERBOSE>1
		RSB_INFO("PA[m] = %d\n",PA[m]);
		RSB_INFO("IA[%d] = %d   PA[%d] = %d\n",n,IA[n],IA[n]+1-ei,PA[IA[n]+1-ei]);
#endif /* RSB_DO_WANT_PSORT_VERBOSE */
	}
#endif /* RSB_DO_WANT_PSORT_FASTER_BUT_RISKY */
	/* setting PA[i] to contain the count of elements before row i */
	for(n=0;RSB_LIKELY(n<m);++n)
	{
		PA[n+1] += PA[n];
#if RSB_DO_WANT_PSORT_VERBOSE>1
		RSB_INFO("PA[%d] = %d\n",n,PA[n]);
#endif /* RSB_DO_WANT_PSORT_VERBOSE */
	}
#if RSB_DO_WANT_PSORT_VERBOSE>1
	RSB_INFO("PA[%d] = %d\n",n,PA[n]);
#endif /* RSB_DO_WANT_PSORT_VERBOSE */
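	/* Small worked example (C indexing, ei=0): for m=3 and IA={1,0,1,2}, the counting
	 * pass leaves PA={0,1,2,1} and the prefix-sum pass turns it into PA={0,1,3,4},
	 * i.e. rows 0,1,2 start at offsets 0,1,3 and PA[m]==nnz==4. */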

#if RSB_DO_WANT_PSORT_TIMING
	dt = rsb_time();
	mt+=dt;
	pt=-dt;
#endif /* RSB_DO_WANT_PSORT_TIMING */
	/* shuffling elements on the basis of their row
	 * FIXME : this is the slowest part of this code
	 * its performance is largely dependent on cache lengths and latencies.. */
	/* FIXME : parallelization of this is challenging */
	rsb_util_do_scatter_rows(vWA,iWA,jWA,VA,IA,JA,PA-ei,nnz,typecode);
	--PA; /* the scatter advanced the row pointers; step the base back to restore them */
#if RSB_DO_WANT_PSORT_TIMING
	dt = rsb_time();
	pt+=dt;
	st=-dt;
#endif /* RSB_DO_WANT_PSORT_TIMING */
//	RSB_COA_MEMCPY(IA,iWA,0,0,nnz);
//	RSB_COA_MEMCPY(JA,jWA,0,0,nnz);
//	RSB_A_MEMCPY(VA,vWA,0,0,nnz,el_size);
//	SB_COA_MEMCPY_parallel(IA,iWA,0,0,nnz);
//	RSB_COA_MEMCPY_parallel(JA,jWA,0,0,nnz);
//	RSB_A_MEMCPY_parallel(VA,vWA,0,0,nnz,el_size);
	/* the row pointers have been restored with a trick (the --PA above) */

	RSB_ASSERT(PA[m]==nnz);

	/* TODO: parallelization of this ? FIXME: is this necessary ? */
	for(n=0;n<m;++n)
		mnzpr = RSB_MAX(mnzpr,PA[n+1]-PA[n]);

	nWA = rsb__malloc(sizeof(rsb_nnz_idx_t)*(mnzpr+2)*wet);/* rsb__malloc is evil inside openmp */
	if(!nWA) { RSB_DO_ERROR_CUMULATE(errval,RSB_ERR_ENOMEM);RSB_PERR_GOTO(err,RSB_ERRM_ES); }

	psc = RSB_MIN(psc,m);
	/* the rows are ready to be sorted (FIXME: this is slow, and could be optimized very much) */
//	#pragma omp parallel for reduction(|:errval)
//	#pragma omp parallel for
//	#pragma omp parallel for schedule(static,10)
//	#pragma omp parallel for schedule(static,1)
//	#pragma omp parallel for schedule(static,psc) shared(iWA,jWA,vWA,nWA,PA)   num_threads(wet)
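	/* Each row n holds PA[n+1]-PA[n] entries; they are sorted by column index via
	 * rsb_do_msort_up (which computes a permutation) followed by rsb_ip_reord (which
	 * applies it in place), using a per-thread slice of nWA sized for the fullest row. */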
	#pragma omp parallel for schedule(static,psc) shared(iWA,jWA,vWA,nWA,PA)   RSB_NTC
	for(n=0;n<m;++n)
	{
		rsb_nnz_idx_t nnz1,nnz0;
#if 1
#if RSB_WANT_OMP_RECURSIVE_KERNELS
		rsb_thread_t th_id = omp_get_thread_num();
#else /* RSB_WANT_OMP_RECURSIVE_KERNELS */
		rsb_thread_t th_id=0;
#endif /* RSB_WANT_OMP_RECURSIVE_KERNELS */
		rsb_nnz_idx_t tnoff=th_id*(mnzpr+2);
		nnz1=PA[n+1];
		nnz0=PA[n];
#if RSB_DO_WANT_PSORT_VERBOSE
		RSB_INFO("psort row %d/%d: nonzeros [%d .. %d/%d] on thread %d\n",(int)n,m,(int)nnz0,(int)nnz1,(int)nnz,(int)th_id);
#endif /* RSB_DO_WANT_PSORT_VERBOSE */
		if(nnz1-nnz0<2)
			continue;/* skip rows with fewer than two nonzeroes. TODO: could implement with macro sorting algorithms for few nnz */
		if(!RSB_SOME_ERROR(rsb_do_msort_up(nnz1-nnz0,jWA+nnz0,nWA+tnoff)))
			rsb_ip_reord(nnz1-nnz0,((rsb_char_t*)vWA)+el_size*nnz0,iWA+nnz0,jWA+nnz0,nWA+tnoff,typecode);
#else
		nnz1=PA[n+1];
		nnz0=PA[n];
		rsb__do_util_sortcoo(vWA+nnz0,iWA+nnz0,jWA+nnz0,m,k,nnz1-nnz0,typecode,NULL,flags,NULL,0);
#endif
	}

#if RSB_DO_WANT_PSORT_TIMING
	dt = rsb_time();
	st+=dt;
	ct=-dt;
#endif /* RSB_DO_WANT_PSORT_TIMING */

	RSB_COA_MEMCPY_parallel(IA,iWA,0,0,nnz);
	RSB_COA_MEMCPY_parallel(JA,jWA,0,0,nnz);
	RSB_A_MEMCPY_parallel(VA,vWA,0,0,nnz,el_size);
#if RSB_DO_WANT_PSORT_TIMING
	dt = rsb_time();
	ct+=dt;
#endif /* RSB_DO_WANT_PSORT_TIMING */
err:
	RSB_CONDITIONAL_FREE(PA);
	RSB_CONDITIONAL_FREE(WA);
	RSB_CONDITIONAL_FREE(nWA);
#if RSB_DO_WANT_PSORT_TIMING
	dt = rsb_time();
	tt+=dt;
	RSB_INFO("pt:%lg  st:%lg  tt:%lg  mt:%lg ct:%lg\n",pt,st,tt,mt,ct);
#endif /* RSB_DO_WANT_PSORT_TIMING */
ret:
	RSB_DO_ERR_RETURN(errval)
}
/* @endcond */