1dnl
2dnl
3dnl	@author: Michele Martone
4dnl
5/* @cond INNERDOC */
6/*!
7 @file
8 @brief
9
10 Performance info gathering code. (OBSOLETE)
11 */
12dnl
13include(`rsb_misc.m4')dnl
14dnl
15RSB_M4_HEADER_MESSAGE()dnl
16dnl
17ifdef(`ONLY_WANT_HEADERS',`
18#ifndef RSB_BENCH_H_INCLUDED
19#define RSB_BENCH_H_INCLUDED
20')
21dnl
22include(`do_unroll.m4')dnl
23include(`rsb_krnl_vb_macros.m4')dnl
24include(`rsb_krnl_macros.m4')dnl
25dnl
26#ifdef __cplusplus
27extern "C" {
28#endif /* __cplusplus */
29dnl
30#include "rsb_internals.h"
31dnl
32#ifdef RSB_HAVE_CBLAS_H
33#include <cblas.h>
34#endif /* RSB_HAVE_CBLAS_H */
35#ifdef RSB_HAVE_CLAPACK_H
36#include <clapack.h>
37#endif /* RSB_HAVE_CLAPACK_H */
38#include <math.h>
39dnl
40dnl
41dnl
42dnl	RSB_M4_HYPERBOLIC_FITTING_FUNCTION_ARGS()
43dnl	---------------------------------
44dnl	Expands to the parenthesized C parameter list of the hyperbolic fitting function.
45define(`RSB_M4_HYPERBOLIC_FITTING_FUNCTION_ARGS',`dnl
46dnl	The list below is quoted, so it is emitted as is, without further macro expansion.
47`(double x[], double y[], size_t nb_loop, double * a, double * b, double *c, double c_s)'dnl
48dnl
49')dnl
50dnl
51dnl
52dnl	RSB_M4_HYPERBOLIC_FITTING_FUNCTION_IDENTIFIER()
53dnl	---------------------------------
54dnl	Expands to the C name of the hyperbolic fitting function, that is rsb_fit_hyp.
55define(`RSB_M4_HYPERBOLIC_FITTING_FUNCTION_IDENTIFIER',`dnl
56dnl
57`rsb_fit_hyp'dnl
58dnl
59')dnl
60dnl
61dnl
62dnl
63dnl	RSB_M4_HYPERBOLIC_FITTING_FUNCTION()
64dnl	---------------------------------
65dnl	Expands to the prototype (when ONLY_WANT_HEADERS is defined) or to the definition of the fitting function.
66define(`RSB_M4_HYPERBOLIC_FITTING_FUNCTION',`dnl
67dnl
68rsb_err_t RSB_M4_HYPERBOLIC_FITTING_FUNCTION_IDENTIFIER()`'dnl
69RSB_M4_HYPERBOLIC_FITTING_FUNCTION_ARGS()`'dnl
70ifdef(`ONLY_WANT_HEADERS',`;
71',`
72{
73#if !(RSB_HAVE_CLAPACK && RSB_HAVE_CBLAS)
74	return RSB_ERR_UNSUPPORTED_OPERATION;
75#else
76	/**
77	 * \ingroup gr_bench
78         * Computes a performance predictor based on the nonzeroes per row ratio,
79	 * by fitting the two input vectors x (nonzeroes per row) and y (megaflops),
80	 * both read as n = RSB_FITTING_SAMPLES points, to the following formula :
81         *
82         *           `megaflops (nnz_per_row) = a + b / ( c + nnz_per_row )'
83         *
84         * An initial estimate is computed from the first three samples and then refined by nb_loop least squares correction steps.
85         * On output, *a, *b and *c hold the coefficients; the input data and the fitted curve are printed via RSB_STDOUT.
86         * If *c is negative and smaller than c_s, or else if *b is not negative, *b and *c are forced to 10000.
87         * Returns RSB_ERR_UNSUPPORTED_OPERATION unless built with both CLAPACK and CBLAS.
88         *
89	 * This model is discussed in the following article :
90
91@article{ButtEijkLang:spmvp,
92  title = {Performance Optimization and Modeling of Blocked Sparse Kernels},
93  author = {Buttari, Alfredo and Eijkhout, Victor and Langou, Julien and Filippone, Salvatore},
94  pages = {467--484},
95  year = 2007,
96  journal = {IJHPCA},
97  volume = 21,
98  url = {\url{{http://www.tacc.utexas.edu/~eijkhout/Articles/2007-buttari-spmvp.pdf}}}
99}
100         *
101         */
102
103	rsb_int nparms=3;
104	rsb_int n = RSB_FITTING_SAMPLES;
105	/* Fortran arrays */
106#define RSB_FORTRAN_ARRAY(AI,ROWS,COLS) AI[(ROWS)*(COLS)]
107
108	rsb_int nj = 3;
109	rsb_int i,j;
110	rsb_err_t errval = RSB_ERR_NO_ERROR;
111
112	double RSB_FORTRAN_ARRAY(G ,n,3);
113	double RSB_FORTRAN_ARRAY(G1,n,3);
114	double RSB_FORTRAN_ARRAY(GG,3,3);
115	double RSB_FORTRAN_ARRAY(z ,n,1);
116	double RSB_FORTRAN_ARRAY(z0,n,1);
117	double RSB_FORTRAN_ARRAY(dy,n,1);
118	double RSB_FORTRAN_ARRAY(ddy,3,1);
119	double RSB_FORTRAN_ARRAY(xj ,nj,1);
120	double RSB_FORTRAN_ARRAY(yj ,nj,1);
121	double RSB_FORTRAN_ARRAY(zj ,nj,1);
122
123	double xcpy[n];
124	double a_t,b_t,sum1,sum2,sum3,sum4,error,tmp_a,tmp_b,tmp_c, min_err,max,min,avg,intl;
125  	int /*i,*/info,ipivot[3],/*nj,j,*/k,cnt;
126	rsb__memcpy(xcpy,x,sizeof(xcpy));	/* not a bit more .. and please note that sizeof(x)=sizeof(double*) != sizeof(x[n])*/
127
128
129	RSB_INFO("starting analysis...\n");
130	RSB_STDOUT("\n");
131	RSB_STDOUT("performance data:\n");
132	for(i=0;i<n;++i)
133	{
134		RSB_STDOUT("%lg %lg\n",xcpy[i],y[i]);
135	}
136
137	sum1=0;
138	sum2=0;
139	sum3=0;
140	sum4=0;
141
142	/* initial estimate: a is taken from the last sample, b and c from a linear regression on the first nj samples */
143  	*a=y[n-1];
144
145	rsb__memcpy(xj,x,sizeof(xj));	/* not a bit more */
146	rsb__memcpy(yj,y,sizeof(yj));	/* not a bit more */
147	/* linearization: 1/(y-a) is linear in x, with slope 1/b and intercept c/b */
148	for(i=0;i<nj;++i)
149  	{
150		zj[i]=yj[i]-*a;
151  		zj[i]=1/zj[i];	/* NOTE(review): this divides by zero whenever a sample equals *a */
152	}
153	/* sums for the linear least squares fit of zj against xj */
154	for(i=0;i<nj;++i)
155	{
156		sum1=sum1 + xj[i]*zj[i];
157		sum2=sum2 + xj[i];
158		sum3=sum3 + zj[i];
159		sum4=sum4 + xj[i]*xj[i];
160	}
161	/* a_t is the intercept and b_t the slope of the fitted line */
162	a_t= (sum3*sum4-sum2*sum1)/(nj*sum4-sum2*sum2);
163	b_t=(nj*sum1 - sum2*sum3) / (nj*sum4 - sum2*sum2);
164
165  	*b=1/b_t;
166	*c=a_t* *b;
167	/* z0 holds the model evaluated with the current parameters */
168	for(i=0;i<n;++i)
169		z0[i]= *a +*b/(x[i]+*c);
170	/* error is the mean relative deviation of z0 from y, in percent */
171	error = 0;
172	for(j=0;j<n;++j)
173		error = error + (fabs( z0[j] - y[j] ) / y[j] );
174
175	error = error / n * 100;
176
177	min_err=error;
178
179	tmp_a=*a;
180	tmp_b=*b;
181	tmp_c=*c;
182	/* refinement: nb_loop correction steps, each one solving a 3x3 linear system for the parameters update */
183	for(i=0;i<nb_loop;++i)
184	{
185		for(j=0;j<n;++j)
186			dy[j] = z0[j]-y[j];
187		/* G is the n x 3 column major matrix of the model derivatives with respect to a, b, c; G1 is a copy of it */
188		for(j=0;j<n;++j)
189		{
190			G[j+0*n]=1;
191			G[j+1*n]=1/(x[j]+tmp_c);
192			G[j+2*n]=-tmp_b/( (x[j]+tmp_c)*(x[j]+tmp_c) );
193
194			G1[j+0*n]= G[j+0*n];
195			G1[j+1*n]= G[j+1*n];
196			G1[j+2*n]= G[j+2*n];
197		}
198		/* GG = transpose(G) * G1 is factorized in place; ddy = transpose(G) * dy is then overwritten with the solution of the system */
199#if (RSB_HAVE_CLAPACK && RSB_HAVE_CBLAS)
200		cblas_dgemm(CblasColMajor,CblasTrans,CblasNoTrans,3,3,n,1.0,G,n,G1,n,0.0,GG,3);
201		errval =  clapack_dgetrf(CblasColMajor,3,3,GG,3,ipivot);
202		if(RSB_SOME_ERROR(errval)) goto err;
203		cblas_dgemv(CblasColMajor,CblasTrans,n,3,1.0,G,n,dy,1,0.0,ddy,1);
204		errval =  clapack_dgetrs(CblasColMajor,CblasNoTrans,3,1,GG,3,ipivot,ddy,3);
205		if(RSB_SOME_ERROR(errval)) goto err;
206#else /* (RSB_HAVE_CLAPACK && RSB_HAVE_CBLAS) */
207#endif /* (RSB_HAVE_CLAPACK && RSB_HAVE_CBLAS) */
208
209		tmp_a = tmp_a-ddy[1-1];
210		tmp_b = tmp_b-ddy[2-1];
211		tmp_c = tmp_c-ddy[3-1];
212
213		for(j=0;j<n;++j)
214			z0[j]= tmp_a +tmp_b/(x[j]+tmp_c);
215
216		error = 0;
217		for(j=0;j<n;++j)
218	       		error = error + (fabs( z0[j] - y[j] ) / y[j] );
219		error = error / n * 100;
220		if(error < min_err)
221		{
222			/* new best fit: record its error too, so that a later, worse iteration cannot replace it */
223		        min_err=error;
224		        *a=tmp_a;
225		        *b=tmp_b;
226		        *c=tmp_c;
227		}
228	}
229	if((*c< 0) && (*c  < c_s))
230	{
231		*c=10000;
232		*b=10000;
233		avg=0;
234		max=y[0];
235		min=y[0];
236		for(i=0;i<n;++i)
237		{
238		        if (y[i] > max) max=y[i];
239		        if (y[i] < min) min=y[i];
240		        avg=avg+y[i];
241		}
242		avg=avg/(double)(n);
243		*a=avg;
244		intl=max-min;
245		avg=0;
246		cnt=0;	/* NOTE: i is n here, so the following loop does not iterate and cnt stays 0 */
247		for(/*i=0*/;i<n;++i)
248		//for(i=0;i<n;++i)
249		{
250        		if (fabs(y[i]-avg) < (0.3*intl))
251			{
252				avg = avg + y[i];
253				cnt=cnt+1;
254			}
255		}
256     		if(cnt > 0) *a=avg/(double)cnt;
257	}
258	else
259  	if (*b >= 0)
260	{
261		*c=10000;
262		*b=10000;
263		avg=0;
264		max=y[0];
265		min=y[0];
266		for(i=0;i<n;++i)
267		{
268			if (y[i] > max) max=y[i];
269			if (y[i] < min) min=y[i];
270			avg=avg+y[i];
271		}
272		avg=avg/(double)n;
273		intl=max-min;
274		avg=0;
275		cnt=0;	/* NOTE: i is n here, so the following loop does not iterate and *a is left untouched */
276		//for(i=0;i<n;++i)
277		for(/*i=0*/;i<n;++i)
278		{
279		        if (fabs(y[i]-avg) < (0.3*intl))
280			{
281				avg = avg + y[i];
282				cnt=cnt+1;
283			}
284		}
285		if(cnt > 0) *a=avg/ (double) cnt;
286	}
287
288
289	RSB_STDOUT("\n");
290	RSB_STDOUT("alpha:%lg beta:%lg gamma:%lg\n",*a,*b,*c);
291
292	RSB_STDOUT("\nfitting:\n");
293	for(i=0;i<n;++i)
294	{
295		RSB_STDOUT("%lg %lg\n", xcpy[i], *a+*b/(xcpy[i]+*c));
296	}
297
298	return RSB_ERR_NO_ERROR;
299	err:
300	RSB_DO_ERR_RETURN(errval)
301#endif /* RSB_HAVE_CLAPACK && RSB_HAVE_CBLAS */
302}
303')dnl
304')dnl
305dnl
306dnl
307dnl
308dnl
309dnl	RSB_M4_REFERENCEBENCHMARK_FUNCTION_ARGS()
310dnl	------------------------------------------------------------------
311dnl	Expands to the C parameter list of the reference benchmark function, which takes no arguments.
312define(`RSB_M4_REFERENCEBENCHMARK_FUNCTION_ARGS',`dnl
313dnl
314`(void)'dnl
315dnl
316')dnl
317dnl
318dnl
319dnl
320dnl	RSB_M4_REFERENCEBENCHMARK_FUNCTION_IDENTIFIER()
321dnl	--------------------------------------------
322dnl	Expands to the C name of the reference benchmark function, that is rsb__do_referencebenchmark.
323define(`RSB_M4_REFERENCEBENCHMARK_FUNCTION_IDENTIFIER',`dnl
324dnl
325`rsb__do_referencebenchmark'dnl
326dnl
327dnl
328')dnl
329dnl
330dnl
331dnl
332dnl	RSB_M4_REFERENCEBENCHMARK_FUNCTION_NAME()
333dnl	--------------------------------------
334dnl	Expands to the return type rsb_err_t followed by the function identifier, that is the declaration up to the argument list.
335define(`RSB_M4_REFERENCEBENCHMARK_FUNCTION_NAME',`dnl
336dnl
337rsb_err_t RSB_M4_REFERENCEBENCHMARK_FUNCTION_IDENTIFIER`'dnl
338dnl
339dnl
340')dnl
341dnl
342dnl
343dnl
344dnl	RSB_M4_REFERENCEBENCHMARK_FUNCTION()
345dnl	---------------------------------
346dnl	Expands to the prototype (when ONLY_WANT_HEADERS is defined) or to the definition of the reference benchmark function.
347define(`RSB_M4_REFERENCEBENCHMARK_FUNCTION',`dnl
348dnl
349RSB_M4_REFERENCEBENCHMARK_FUNCTION_NAME`'dnl
350RSB_M4_REFERENCEBENCHMARK_FUNCTION_ARGS`'dnl
351ifdef(`ONLY_WANT_HEADERS',`;
352',`
353{
354	/*!
355	 * \ingroup gr_bench
356	 * A complete benchmark program.
357	 * Will benchmark all supported matrix operations over all supported types
358	 * over all supported matrix partitionings.
359	 *
360	 * Moreover, it is meant to analyze the performance data and dump the results (the analysis code is currently unreachable).
361         *
362	 * \return \rsb_errval_inp_param_msg
363         *
364	 * FIXME : UNFINISHED: should process and dump this info in a header file.
365	 */
366	struct rsb_global_reference_performance_info_t grpi;
367	rsb_err_t errval = RSB_ERR_NO_ERROR;
368	rsb_blk_idx_t ri,ci;	/* row index, columns index */
369	rsb_coo_idx_t order=20000;
370	rsb_coo_idx_t rows=order,cols=order;	/* FIXME : TEMPORARY */
371	rsb_blk_idx_t rua[] = RSB_ROWS_UNROLL_ARRAY;
372	rsb_blk_idx_t cua[] = RSB_COLUMNS_UNROLL_ARRAY;
373	double tot_secs=0.0,pred_secs=1.0;
374	rsb_trans_t transA = RSB_DEFAULT_TRANSPOSITION;
375	size_t kernels_n = RSB_ROWS_UNROLL_ARRAY_LENGTH*RSB_COLUMNS_UNROLL_ARRAY_LENGTH*RSB_IMPLEMENTED_MOPS*RSB_IMPLEMENTED_TYPES;
376	rsb_int ti=0;	/* type index */
377	int fbw,bwi;
378	RSB_BZERO_P(&grpi);
379
380	/* if((errval = rsb_lib_init(RSB_NULL_INIT_OPTIONS))){goto err;} we skip this to enable calling this from within our library (FIXME) */
381	/* fbw is the first bandwidth to be sampled, bwi the increment between two consecutive samples */
382	if(RSB_FITTING_SAMPLES<2)
383	{
384		fbw=(RSB_FIRST_FITTING_SAMPLE_BW_MAX + RSB_FIRST_FITTING_SAMPLE_BW_MIN)/2;
385		bwi=fbw;
386	}
387	else
388	{
389		fbw = RSB_FIRST_FITTING_SAMPLE_BW_MIN;
390		bwi=(RSB_FIRST_FITTING_SAMPLE_BW_MAX - RSB_FIRST_FITTING_SAMPLE_BW_MIN)/(RSB_FITTING_SAMPLES-1);
391	}
392
393	tot_secs = -rsb_time();
394	pred_secs *= RSB_ROWS_UNROLL_ARRAY_LENGTH * RSB_COLUMNS_UNROLL_ARRAY_LENGTH * RSB_FITTING_SAMPLES * RSB_IMPLEMENTED_META_MOPS *  RSB_IMPLEMENTED_TYPES * RSB_BENCHMARK_MIN_SECONDS;
395	RSB_STDERR("#reference benchmarking of %zu kernels (no transposed, no symmetric, and so on) should take at least %lg seconds..\n",kernels_n,pred_secs);
396
397foreach(`mtype',RSB_M4_MATRIX_TYPES,`dnl
398	/* mtype type benchmarking */
399/*	RSB_INFO("#mtype type benchmarking\n");*/
400	for(ri=0;ri<RSB_ROWS_UNROLL_ARRAY_LENGTH;++ri)
401	{
402		for(ci=0;ci<RSB_COLUMNS_UNROLL_ARRAY_LENGTH;++ci)
403		{
404			rsb_blk_idx_t br = rua[ri];
405			rsb_blk_idx_t bc = cua[ci];
406			rsb_coo_idx_t bw,mbw=(cols/bc);
407			rsb_int si=0;	/* sample index */
408			mbw=(cols-bc)/bc;	/* tune here to fill further our matrix */
409			/* FIXME : there is the danger of empty samples! */
410			for(bw=fbw;bw<=mbw && si< RSB_FITTING_SAMPLES ;bw+=bwi)	/* this parameter should be tunable, too */
411			{
412				//RSB_INFO("bw = %d\n",bw);
413				rsb_int moi=0;	/* matrix operation index */
414				double time,*timep=&time;
415				struct rsb_mtx_t * mtxAp =
416					rsb__generate_blocked_banded(br,bc,rows,cols,bw,timep,RSB_M4_NUMERICAL_TYPE_PREPROCESSOR_SYMBOL(mtype),RSB_BOOL_TRUE );	/* FIXME : generating triangular factors always ! */
417				if(!mtxAp)
418				{
419					RSB_STDERR(RSB_ERRM_IE);
420					{errval = RSB_ERR_GENERIC_ERROR; goto err;}
421				}
422dnl				struct rsb_options_t * o = mtxAp->options;
423
424foreach(`mop',RSB_M4_MATRIX_META_OPS,`dnl
425				{
426/*					RSB_INFO("#mtype type, ");*/
427/*					RSB_INFO("mop operation benchmarking\n");*/
428					/* mop operation benchmarking */
429ifelse(RSB_M4_IS_SPXX_TWO_VECTORS_OPERATING_KERNEL_MOP(mop),1,`dnl
430					mtype *out=NULL,*rhs=NULL;
431')dnl
432ifelse(RSB_M4_IS_SPXX_OP_SCALING_KERNEL_MOP(mop),1,`dnl
433			double alpha=1.0;/* FIXME */
434			double * alphap = &alpha;
435')dnl
436ifelse(RSB_M4_IS_SPXX_SCALING_KERNEL_MOP(mop),1,`dnl
437			double beta =1.0;/* FIXME */
438			double * betap  = &beta ;
439')dnl
440ifelse(RSB_M4_IS_ACC_WRITING_KERNEL_MOP(mop),`1',`dnl
441					mtype * row_sums;
442')dnl
443
444
445ifelse(RSB_M4_IS_ACC_WRITING_KERNEL_MOP(mop),`1',`dnl
446					row_sums = rsb__malloc(mtxAp->el_size*(rows+br));
447					if(!row_sums) {errval = RSB_ERR_ENOMEM;goto erri_`'RSB_M4_CHOPSPACES(mtype)`'`_'`'mop;}
448					if(rsb__fill_with_ones(row_sums,mtxAp->typecode,cols,1))     {errval = RSB_ERR_ENOMEM;goto erri_`'RSB_M4_CHOPSPACES(mtype)`'`_'`'mop;}
449')dnl
450ifelse(mop,`scale',`dnl
451					mtype * scale_factors = rsb__malloc(mtxAp->el_size*(rows+br));
452					if(!scale_factors) {errval = RSB_ERR_ENOMEM;goto erri_`'RSB_M4_CHOPSPACES(mtype)`'`_'`'mop;}
453					if(rsb__fill_with_ones(scale_factors,mtxAp->typecode,rows,1))     {errval = RSB_ERR_ENOMEM;goto erri_`'RSB_M4_CHOPSPACES(mtype)`'`_'`'mop;}
454')dnl
455ifelse(RSB_M4_IS_ACC_WRITING_KERNEL_MOP(mop),`1',`dnl
456')dnl
457ifelse(RSB_M4_IS_SPXX_TWO_VECTORS_OPERATING_KERNEL_MOP(mop),1,`dnl
458					rsb_coo_idx_t nrhs=4;
459					rsb_coo_idx_t bstride = cols+bc;
460					rsb_coo_idx_t cstride = rows+br;
461ifelse(RSB_M4_IS_STRIDED_KERNEL_MOP(mop),1,`dnl
462					rsb_coo_idx_t incx=1,incy=1;
463',`dnl
464					rsb_coo_idx_t incx=1,incy=1;
465')dnl
466					incx=1,incy=1;	/* this is just a pacifier for "unused variable"-like warnings */
467					rhs = rsb__malloc(mtxAp->el_size*(bstride)*nrhs);
468					out = rsb__malloc(mtxAp->el_size*(cstride)*nrhs);
469					if(!out || rsb__fill_with_ones(out,mtxAp->typecode,cstride*nrhs,1)){errval = RSB_ERR_ENOMEM;goto erri_`'RSB_M4_CHOPSPACES(mtype)`'`_'`'mop;}
470					if(!rhs || rsb__fill_with_ones(rhs,mtxAp->typecode,bstride*nrhs,1)){errval = RSB_ERR_ENOMEM;goto erri_`'RSB_M4_CHOPSPACES(mtype)`'`_'`'mop;}
471')dnl
472ifelse(mop,`negation',`dnl
473					int please_fix_RSB_M4_ARGS_TO_ACTUAL_ARGS=-1;/* here to fix negation */
474')dnl
475
476					grpi.gpi[ti].pipmo[moi].blocks_per_row[si]=bw*bc; /* FIXME : TEMPORARY !!  */
477
478					/* we benchmark our mtype library implementation for operation mop */
479					grpi.gpi[ti].pipmo[moi].pipfs[si].seconds[ri][ci] = RSB_BENCHMARK_MIN_SECONDS; /* min seconds */
480					grpi.gpi[ti].pipmo[moi].pipfs[si].m_flops[ri][ci] = (double)RSB_BENCHMARK_MIN_RUNS; /* min runs */
481
482					errval = dnl
483ifelse(RSB_M4_MATRIX_OP_IS_META_OP(mop),`1',dnl
484`0;/* meta-op : we already measured matrix creation time  */
485grpi.gpi[ti].pipmo[moi].pipfs[si].seconds[ri][ci]=time;
486grpi.gpi[ti].pipmo[moi].pipfs[si].m_flops[ri][ci]=((double)rsb__do_get_matrix_nnz(mtxAp))/1000000;
487/* FIXME : this is experimental and unfinished code */
488',`
489RSB_M4_DIRECT_KERNEL_DISPATCH_BENCHMARK_FUNCTION_IDENTIFIER(mop,mtype)(dnl
490&(grpi.gpi[ti].pipmo[moi].pipfs[si].seconds[ri][ci]),dnl
491&(grpi.gpi[ti].pipmo[moi].pipfs[si].m_flops[ri][ci]),dnl
492RSB_M4_DIRECT_KERNEL_DISPATCH_TIMING_FUNCTION_ACTUAL_ARGS(mop,mtype));')
493
494					grpi.gpi[ti].pipmo[moi].pipfs[si].fillin[ri][ci]  = rsb__do_get_matrix_fillin(mtxAp);
495					grpi.gpi[ti].pipmo[moi].pipfs[si].rows = rows;
496					grpi.gpi[ti].pipmo[moi].pipfs[si].cols = cols;
497					grpi.gpi[ti].pipmo[moi].pipfs[si].nnz  = rsb__do_get_matrix_nnz(mtxAp) ;
498					grpi.gpi[ti].pipmo[moi].pipfs[si].flags= mtxAp->flags ;
499					grpi.gpi[ti].pipmo[moi].pipfs[si].storage= mtxAp->matrix_storage ;
500					grpi.gpi[ti].pipmo[moi].pipfs[si].typecode= mtxAp->typecode ;
501					grpi.gpi[ti].pipmo[moi].pipfs[si].element_count= mtxAp->element_count;
502
503					grpi.gpi[ti].pipmo[moi].pipfs[si].e_mflops[ri][ci] =
504						grpi.gpi[ti].pipmo[moi].pipfs[si].m_flops[ri][ci] /
505						grpi.gpi[ti].pipmo[moi].pipfs[si].fillin[ri][ci];
506
507					if(RSB_SOME_ERROR(errval)){goto erri_`'RSB_M4_CHOPSPACES(mtype)`'`_'`'mop;}
508					++moi;
509
510					erri_`'RSB_M4_CHOPSPACES(mtype)`'`_'`'mop:
511					RSB_NULL_STATEMENT_FOR_COMPILER_HAPPINESS
512ifelse(RSB_M4_IS_ACC_WRITING_KERNEL_MOP(mop),`1',`dnl
513					RSB_CONDITIONAL_FREE(row_sums);
514')dnl
515ifelse(RSB_M4_IS_SPXX_TWO_VECTORS_OPERATING_KERNEL_MOP(mop),1,`dnl
516					RSB_CONDITIONAL_FREE(out);
517					RSB_CONDITIONAL_FREE(rhs);
518')dnl
519ifelse(mop,`scale',`dnl
520					RSB_CONDITIONAL_FREE(scale_factors);
521')dnl
522					/* the work buffers are released at this point: on error we free the matrix and bail out without leaks */
523					if(RSB_SOME_ERROR(errval)){RSB_MTX_FREE(mtxAp);goto err;}
524				}
525')dnl
526				RSB_MTX_FREE(mtxAp);
527				++si;
528			}
529		}
530	}
531	{
532		rsb_int moi;
533		rsb_char_t * mops[] = RSB_M4_MATRIX_META_OPS_ARRAY;
534		rsb_char_t * types[] = RSB_M4_MATRIX_TYPES_ARRAY;
535		rsb_char_t s[RSB_M4_BUFLEN];
536		rsb__print_mop_reference_performance_info_header();
537		for(moi=0;moi<RSB_IMPLEMENTED_META_MOPS;++moi)
538		{
539/*			rsb_int si;*/
540			/* informational printout */
541			snprintf(s,sizeof(s),"%s\t%s\t",types[ti], mops[moi]);
542			rsb__print_mop_reference_performance_info(&(grpi.gpi[ti].pipmo[moi]),s);
543/*			for(si=0;si<RSB_FITTING_SAMPLES;++si)*/
544/*				rsb__dump_performance_info(&(grpi.gpi[ti].pipmo[moi].pipfs[si]), NULL);*/
545		}
546	}
547	++ti;
548')dnl
549	tot_secs += rsb_time();
550	RSB_STDERR("#reference benchmarking took %lg seconds (predicted %lg :)....\n",tot_secs,pred_secs);
551
552	grpi.initialized=1;	/* FIXME : only partially */
553	//rsb__dump_global_reference_performance_info(&grpi);
554#if RSB_WANT_PERFORMANCE_FILE
555	rsb__save_global_reference_performance_info(&grpi);
556#endif /* RSB_WANT_PERFORMANCE_FILE */
557	return RSB_ERR_NO_ERROR;	/* FIXME : temporary */
558	/* NOTE: because of the return statement above, the fitting code below is currently unreachable */
559	ti=0;	/* type index */
560	for(ti=0;ti<RSB_IMPLEMENTED_TYPES	;++ti)
561	for(ri=0;ri<RSB_ROWS_UNROLL_ARRAY_LENGTH;++ri)
562	{
563		for(ci=0;ci<RSB_COLUMNS_UNROLL_ARRAY_LENGTH;++ci)
564		{
565			rsb_blk_idx_t bc = cua[ci];
566			rsb_int moi=0;	/* matrix operation index */
567			for(moi=0;moi<RSB_IMPLEMENTED_META_MOPS ;++moi)
568			{
569				rsb_int si=0;	/* sample index */
570
571				double y[RSB_FITTING_SAMPLES];
572				double * x = grpi.gpi[ti].pipmo[moi].blocks_per_row;
573
574				for(si=0;si< RSB_FITTING_SAMPLES ;++si)
575				{
576					/* we tune our mtype library implementation for operation mop */
577						y[si] =
578							grpi.gpi[ti].pipmo[moi].pipfs[si].m_flops[ri][ci]/
579							grpi.gpi[ti].pipmo[moi].pipfs[si].seconds[ri][ci];
580				}
581
582				/*
583				 * FIXME : make this fitting analysis offline respect our benchmark!
584				 */
585				errval = RSB_M4_HYPERBOLIC_FITTING_FUNCTION_IDENTIFIER()(
586						x, y, 3,
587						&(grpi.gpi[ti].pipmo[moi].alpha[ri][ci]),
588						&(grpi.gpi[ti].pipmo[moi].beta [ri][ci]),
589						&(grpi.gpi[ti].pipmo[moi].gamma[ri][ci]), (double)bc
590						/* FIXME : is this right ?*/
591					);
592				if(RSB_SOME_ERROR(errval))goto err;
593			}
594		}
595	}
596
597	if( rsb_lib_exit(RSB_NULL_EXIT_OPTIONS) )
598		return RSB_ERR_INTERNAL_ERROR;
599
600	return RSB_ERR_NO_ERROR;
601err:
602	RSB_DO_ERR_RETURN(errval)
603}
604')dnl
605')dnl
606dnl
607dnl
608dnl
609dnl
610dnl	Emit the two functions defined above: prototypes only when ONLY_WANT_HEADERS is defined, full definitions otherwise.
611RSB_M4_HYPERBOLIC_FITTING_FUNCTION()
612RSB_M4_REFERENCEBENCHMARK_FUNCTION()
613dnl
614dnl
615dnl
616dnl
617dnl
618dnl
619dnl
620#ifdef __cplusplus
621}
622#endif  /* __cplusplus */
623dnl
624dnl
625ifdef(`ONLY_WANT_HEADERS',`
626#endif /* RSB_BENCH_H_INCLUDED */
627')
628/* @endcond */
629dnl
630