dnl
dnl	@author: Michele Martone
dnl
dnl	File prologue: emits the Doxygen file header of the generated source,
dnl	then pulls in the shared macro libraries used by the definitions below.
dnl
/*!
 @file
 @brief
 Performance kernels dispatching code, for each type, submatrix size, operation.
 But for the BCOO format.
 Kernels unrolled, with no loops, for only user-specified blockings.
 */
dnl
include(`rsb_misc.m4')dnl
RSB_M4_HEADER_MESSAGE()dnl
RSB_M4_HEADER_EXTRA_DECLARATIONS()dnl
include(`rsb_krnl_bcss_macros.m4')dnl
include(`rsb_krnl_vb_macros.m4')dnl FIXME : RSB_M4_KERNEL_DIRECT_DISPATCHER_FUNCTION_NAME
dnl
dnl
dnl
dnl
dnl
dnl
dnl
dnl	RSB_M4_BCOO_SPMV_KERNELS(unrollings)
dnl	-----------------------------------
dnl	Top-level generator for the BCOO kernels.
dnl
dnl	Argument 1 is the list of unrolling identifiers to iterate over.
dnl
dnl	The macro expands in two passes:
dnl	 1. for every combination of type, operation, BCOO storage, unrolling,
dnl	    diagonal type, row/column unroll, symmetry, transposition, coordinate
dnl	    index type and uplo it invokes RSB_M4_BCOO_KERNEL_FUNCTION in "all"
dnl	    mode (one kernel per combination);
dnl	 2. for every combination that does not depend on the block size it
dnl	    invokes RSB_M4_BCOO_KERNEL_SIZE_DISPATCH_FUNCTION in "all" mode
dnl	    (one size dispatcher per combination).
dnl	Both callees decide by themselves whether a given combination is allowed
dnl	and whether a prototype or a full definition is emitted.
dnl
define(`RSB_M4_BCOO_SPMV_KERNELS',`dnl
dnl
pushdef(`unrollings',$1)dnl
dnl
dnl	FIXED BLOCK SIZE KERNELS :
dnl
foreach(`type',RSB_M4_MATRIX_TYPES,`dnl
foreach(`mop',RSB_M4_MATRIX_OPS,`dnl
foreach(`matrix_storage',RSB_M4_BCOO_FORMATS,`dnl
foreach(`unrolling',unrollings,`dnl
dnl ifelse(RSB_M4_IS_SPMV_KERNEL_MOP(mop),
ifelse(1,1,`dnl
foreach(`diagonal',RSB_M4_MATRIX_DIAGONAL_TYPES,`dnl
foreach(`rowsu',RSB_M4_ROWS_UNROLL,`dnl
foreach(`colsu',RSB_M4_COLUMNS_UNROLL,`dnl
foreach(`symmetry',RSB_M4_MATRIX_SYMMETRY,`dnl
foreach(`transposition',RSB_M4_MATRIX_TRANSPOSITIONS,`dnl
foreach(`citype',RSB_M4_MATRIX_COORDINATE_TYPES,`dnl
foreach(`uplo',RSB_M4_MATRIX_UPLO_TYPES,`dnl
RSB_M4_BCOO_KERNEL_FUNCTION(`all',type,matrix_storage,transposition,symmetry,rowsu,colsu,unrolling,mop,citype,diagonal,uplo)
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
dnl
dnl	FIXED BLOCK SIZE DISPATCHERS :
dnl
foreach(`type',RSB_M4_MATRIX_TYPES,`dnl
foreach(`mop',RSB_M4_MATRIX_OPS,`dnl
dnl ifelse(RSB_M4_IS_SPMV_KERNEL_MOP(mop),1,..
ifelse(1,1,`dnl
foreach(`matrix_storage',RSB_M4_BCOO_FORMATS,`dnl
foreach(`unrolling',unrollings,`dnl
foreach(`symmetry',RSB_M4_MATRIX_SYMMETRY,`dnl
foreach(`transposition',RSB_M4_MATRIX_TRANSPOSITIONS,`dnl
foreach(`citype',RSB_M4_MATRIX_COORDINATE_TYPES,`dnl
foreach(`diagonal',RSB_M4_MATRIX_DIAGONAL_TYPES,`dnl
foreach(`uplo',RSB_M4_MATRIX_UPLO_TYPES,`dnl
RSB_M4_BCOO_KERNEL_SIZE_DISPATCH_FUNCTION(`all',type,matrix_storage,transposition,symmetry,unrolling,,,mop,citype,diagonal,uplo)
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
dnl
dnl
popdef(`unrollings')dnl
dnl
')dnl
dnl
dnl
dnl
dnl
dnl
dnl	RSB_M4_BCOO_KERNEL_FUNCTION(want_what,mtype,matrix_storage,transposition,
dnl		symmetry,b_rows,b_columns,unrolling,mop,citype,diagonal,uplo)
dnl	------------------------------------------------------------------------
dnl	Emits one piece of a single BCOO kernel, selected by argument 1:
dnl	  DOC   a placeholder comment;
dnl	  all   the return type and function name, followed by ARGS, followed by
dnl	        either a semicolon (when ONLY_WANT_HEADERS is defined) or BODY;
dnl	  ID    the function name only;
dnl	  ARGS  the parenthesised formal argument list (delegated to
dnl	        RSB_M4_BCOO_KERNEL_SIZE_DISPATCH_FUNCTION in ARGS mode);
dnl	  BODY  the braced C function body.
dnl	Arguments 2-12 select numerical type, storage, transposition, symmetry,
dnl	block rows, block columns, unrolling, matrix operation, coordinate index
dnl	type, diagonal type and uplo.
dnl	Nothing is emitted unless RSB_M4_ARE_KERNEL_GENERATION_PARMS_ALLOWED
dnl	accepts the parameter combination.
dnl
dnl	BODY generates, depending on the operation:
dnl	  - SPMX operations: a sparse matrix-vector multiply loop, with dedicated
dnl	    code paths for symmetric/hermitian matrices on and off the diagonal
dnl	    (roff==coff vs roff!=coff);
dnl	  - SPSX operations: a triangular solve sweep, forward or backward;
dnl	  - scale: row (or column, when transposed) scaling of VA;
dnl	  - accumulating operations (infty_norm, rowssums): row sums.
dnl	For non strided operations incx and incy are m4 macros expanding to 1;
dnl	for strided ones they are C function arguments.
dnl
define(`RSB_M4_BCOO_KERNEL_FUNCTION',`dnl
dnl
dnl
pushdef(`want_what',$1)dnl
pushdef(`mtype',$2)dnl
pushdef(`matrix_storage',$3)dnl
pushdef(`transposition',$4)dnl
pushdef(`symmetry',$5)dnl
pushdef(`b_rows',$6)dnl		block rows
pushdef(`b_columns',$7)dnl	block columns
pushdef(`itype',`rsb_coo_idx_t ')dnl integer type (for indices)
pushdef(`unrolling',$8)dnl
pushdef(`mop',$9)dnl
pushdef(`citype',$10)dnl
pushdef(`diagonal',$11)dnl
pushdef(`uplo',$12)dnl
dnl
dnl	Map the storage relative dimensions Mdim and mdim to rows and columns.
pushdef(`total_columns',ifelse(RSB_M4_IS_FORMAT_COLUMN_MAJOR(matrix_storage),1,`Mdim',`mdim'))dnl
pushdef(`total_rows',ifelse(RSB_M4_IS_FORMAT_COLUMN_MAJOR(matrix_storage),1,`mdim',`Mdim'))dnl
pushdef(`out_dim',ifelse(transposition,RSB_M4_TRANS_N,total_rows,total_columns))dnl
pushdef(`fid',RSB_M4_KERNEL_DIRECT_DISPATCHER_FUNCTION_NAME(mtype,matrix_storage,transposition,symmetry,b_rows,b_columns,unrolling,mop,citype,diagonal,uplo))dnl
dnl
dnl	mi and Mi name the C index variables of the minor and major dimension.
ifelse(RSB_M4_IS_FORMAT_COLUMN_MAJOR(matrix_storage),1,`dnl
pushdef(`mi',`i')dnl
pushdef(`Mi',`j')dnl
')dnl
ifelse(RSB_M4_IS_FORMAT_ROW_MAJOR(matrix_storage),1,`dnl
pushdef(`mi',`j')dnl
pushdef(`Mi',`i')dnl
')dnl
dnl
dnl	tmi and tMi are the same indices, swapped when the operation is transposed.
ifelse(RSB_M4_SAME(transposition,RSB_M4_TRANS_N),1,`dnl
pushdef(`tmi',mi)dnl
pushdef(`tMi',Mi)dnl
')dnl
ifelse(RSB_M4_NOT(RSB_M4_SAME(transposition,RSB_M4_TRANS_N)),1,`dnl
pushdef(`tmi',Mi)dnl
pushdef(`tMi',mi)dnl
')dnl
dnl
dnl	postmult is the textual factor prepended to each product: alpha, -1 or nothing.
ifelse(RSB_M4_IS_SPXX_OP_SCALING_KERNEL_MOP(mop),1,`dnl
pushdef(`postmult',`(alpha)*')dnl
',`dnl
dnl
ifelse(RSB_M4_IS_SPMX_OP_NEGATING_KERNEL_MOP(mop),1,`dnl
pushdef(`postmult',`(-1)*')dnl
',`dnl
pushdef(`postmult',`')dnl
')dnl
dnl
')dnl
dnl
pushdef(`ttransposition',`RSB_M4_TRANSPOSE_TRANSPOSITION(transposition)')dnl
pushdef(`htransposition',`ifelse(symmetry,RSB_M4_SYMBOL_HERMITIAN,`RSB_M4_H2T_TRANSPOSITION(transposition)',transposition)')dnl
dnl
pushdef(`tsymmetry',`ifelse(symmetry,RSB_M4_SYMBOL_HERMITIAN,`RSB_M4_TRANSPOSE_SYMMETRY(symmetry)',symmetry)')dnl
dnl
dnl	Non complex, not unsymmetric, transposed SPMX kernels just forward to the untransposed kernel.
pushdef(`toskipbecauseofsymmetry',`RSB_M4_AND(RSB_M4_IS_SPMX_KERNEL_MOP(mop),RSB_M4_NOT(RSB_M4_IS_COMPLEX_TYPE(mtype)),RSB_M4_IS_NOT_UNSYMMETRIC(symmetry),RSB_M4_NOT(RSB_M4_SAME(transposition,RSB_M4_TRANS_N)))')dnl
dnl
dnl
ifelse(RSB_M4_ARE_KERNEL_GENERATION_PARMS_ALLOWED(want_what,mtype,matrix_storage,transposition,symmetry,unrolling,,,mop,citype,diagonal,uplo),`1',`dnl
dnl
ifelse(want_what,`DOC',`dnl
	/*  TODO */
')dnl
ifelse(want_what,`all',`dnl
dnl
ifelse(RSB_M4_IS_FORMAT_BCOO(matrix_storage),1,`dnl
rsb_err_t fid`'dnl
RSB_M4_BCOO_KERNEL_FUNCTION(`ARGS',mtype,matrix_storage,transposition,symmetry,b_rows,b_columns,unrolling,mop,citype,diagonal,uplo)dnl
')dnl
ifdef(`ONLY_WANT_HEADERS',`;
',`
dnl /* begin of fid function */
RSB_M4_BCOO_KERNEL_FUNCTION(`BODY',mtype,matrix_storage,transposition,symmetry,b_rows,b_columns,unrolling,mop,citype,diagonal,uplo)dnl
')dnl
')dnl
dnl
ifelse(want_what,`ID',`dnl
fid`'dnl
')dnl
dnl
ifelse(want_what,`ARGS',`dnl
RSB_M4_BCOO_KERNEL_SIZE_DISPATCH_FUNCTION(`ARGS',mtype,matrix_storage,transposition,symmetry,unrolling,,,mop,citype,diagonal,uplo)`'dnl
')dnl
dnl
dnl
ifelse(want_what,`BODY',`dnl
dnl
{
ifelse(RSB_M4_IS_STRIDED_KERNEL_MOP(mop),`0',`dnl
pushdef(`incx',`1')dnl
pushdef(`incy',`1')dnl
')dnl
RSB_M4_BXXX_KERNEL_FUNCTION_HELP($@)
dnl
ifelse(RSB_M4_AND(RSB_M4_IS_SPMX_KERNEL_MOP(mop),RSB_M4_IS_DIAGONAL_IMPLICIT(diagonal)),1,`dnl
	RSB_M4_FAKE_DIAG_IMPLICIT_MSG
')dnl
dnl
ifelse(toskipbecauseofsymmetry,1,`dnl
dnl
	/* Symmetric `transposed' reverts to symmetric `not transposed' */
	return RSB_M4_BCOO_KERNEL_FUNCTION(`ID',mtype,matrix_storage,RSB_M4_TRANS_N,symmetry,b_rows,b_columns,unrolling,mop,citype,diagonal,uplo)dnl
(RSB_M4_ARGS_TO_ACTUAL_ARGS(RSB_M4_BCOO_KERNEL_FUNCTION(`ARGS',mtype,matrix_storage,RSB_M4_TRANS_N,symmetry,b_rows,b_columns,unrolling,mop,citype,diagonal,uplo)));
dnl
')dnl
dnl
ifelse(toskipbecauseofsymmetry,0,`dnl
dnl
dnl	the i,j type has to be the same as the arrays one.
dnl	if not, mismatch on the copied bytes will occur.
ifelse(RSB_M4_AND(RSB_M4_NOT(RSB_M4_IS_RC_BIASED_KERNEL_MOP(mop)),RSB_M4_NOT(RSB_M4_AND(RSB_M4_IS_ACC_WRITING_KERNEL_MOP(mop),RSB_M4_NOT(RSB_M4_IS_NOT_UNSYMMETRIC(symmetry))))),`1',`dnl
ifelse(RSB_M4_IS_STRIDED_KERNEL_MOP(mop),1,`dnl
	register rsb_coo_idx_t i=0,j=0;
',`dnl
	register citype i=0,j=0;
dnl 20110227 if declaring short indices, we should care about proper conversion
')dnl
	const citype *IA=(const citype*)bpntr, *JA=(const citype*)bindx;
dnl
',`dnl
dnl
ifelse(RSB_M4_SAME(transposition,RSB_M4_TRANS_N),`0',`dnl
	const citype *JA=(const citype*)bindx;
	register citype j=0;
',`dnl
	const citype *IA=(const citype*)bpntr;
	register citype i=0;
')dnl
')dnl
dnl ifelse(mop,`scale',`',`dnl
dnl ')dnl	20121005 shall change this condition when enabling transpose scale as well
	register rsb_nnz_idx_t n=0;
ifelse(RSB_M4_IS_OP_SCALING_KERNEL_MOP(mop),`1',`dnl
	const mtype alpha=*alphap;`'dnl
')dnl
ifelse(RSB_M4_IS_SCALING_KERNEL_MOP(mop),`1',`dnl
	const mtype beta=*betap;`'dnl
')dnl
dnl
ifelse(RSB_M4_IS_STRIDED_KERNEL_MOP(mop),`0',`dnl
	dnl const rsb_coo_idx_t incx=1,incy=1;`'
')dnl
dnl
dnl
ifelse(RSB_M4_AND(RSB_M4_IS_SPMX_KERNEL_MOP(mop)),1,`dnl
dnl

dnl
dnl
dnl
ifelse(RSB_M4_IS_NOT_UNSYMMETRIC(symmetry),1,`dnl
	const mtype *trhs = rhs+incx*(roff-coff);`'// symmetry
	mtype *tout=out+incy*(coff-roff);`'

')dnl
dnl
ifelse(RSB_M4_IS_ZEROING_KERNEL_MOP(mop),1,`dnl
	rsb__cblas_Xscal(RSB_M4_NUMERICAL_TYPE_PREPROCESSOR_SYMBOL(mtype),out_dim,NULL,out,incy);
')dnl
dnl
ifelse(RSB_M4_IS_SCALING_KERNEL_MOP(mop),1,`dnl
ifelse(RSB_M4_IS_STRIDED_KERNEL_MOP(mop),1,`dnl
dnl	The strided argument list declares incx and incy, so the output stride is incy.
	if(beta!=1)rsb__cblas_Xscal(RSB_M4_NUMERICAL_TYPE_PREPROCESSOR_SYMBOL(mtype),out_dim,&beta,out,incy);
',`dnl
	rsb__cblas_Xscal(RSB_M4_NUMERICAL_TYPE_PREPROCESSOR_SYMBOL(mtype), out_dim,&beta, out, 1);
')dnl
')dnl
dnl
ifelse(transposition,RSB_M4_TRANS_N,`dnl
',`dnl
ifelse(RSB_M4_IS_UNSYMMETRIC(symmetry),1,`dnl
	rhs=(rhs-coff*(incx))+roff*(incx);
	out=(out-roff*(incy))+coff*(incy);
')dnl
')dnl
dnl
ifelse(RSB_M4_IS_NOT_UNSYMMETRIC(symmetry),1,`dnl
	if(roff==coff)
')dnl
dnl
dnl
ifelse(RSB_M4_IS_UNSYMMETRIC(symmetry),1,`dnl
dnl
ifelse(1,1,`dnl
dnl
dnl	RSB_M4_SIMPLE_LOOP_UNROLL..
	RSB_M4_SIMPLE_LOOP_UNROLL_5S(`n',`LI',`0',`nnz',`dnl
',`dnl
	i=IA[n+LI]; j=JA[n+LI];
	out[tMi*incy]+=`'postmult`'RSB_M4_CONJ(VA[n+LI],mtype,transposition,RSB_M4_SYMBOL_UNSYMMETRIC)*rhs[tmi*incx];
dnl
',`',`',`RSB_M4_EARLY_EVICT_INSTRUCTION((IA+n,JA+n,VA+n))`'dnl
',RSB_M4_SIMPLE_LOOP_UNROLL_DEFAULT_FACTOR_SMALL)
dnl
',`dnl
dnl
RSB_M4_SIMPLE_LOOP_UNROLL_5S(`n',`LI',`0',`nnz',`dnl
dnl
dnl
',`dnl
dnl
			`const rsb_coo_idx_t' `i_'``''LI`'=IA[n+LI];
			`const rsb_coo_idx_t' `j_'``''LI`'=JA[n+LI];
			`const mtype b_'``''LI`'=rhs[tmi``_''LI`'*incx];
			`const mtype a_'``''LI`'=VA[n+LI];
dnl
',`dnl
			if(tMi``_''0`'== tMi``_''eval(RSB_M4_SIMPLE_LOOP_UNROLL_DEFAULT_FACTOR_MEDIUM-1)`')
			{
				mtype cacc = RSB_M4_ZERO(mtype);
forloop(`_LI_',0,decr(RSB_M4_SIMPLE_LOOP_UNROLL_DEFAULT_FACTOR_MEDIUM),`dnl
				cacc+=`'postmult`'RSB_M4_CONJ(`a_'``''_LI_,mtype,transposition,RSB_M4_SYMBOL_UNSYMMETRIC)`*b_'``''_LI_;
')dnl
			out[tMi``_''0`'*incy]+=cacc;
`'dnl
			}
			else
			{
',`dnl
				out[tMi``_''LI`'*incy]+=`'postmult`RSB_M4_CONJ(a``_''``''LI`',mtype,transposition,RSB_M4_SYMBOL_UNSYMMETRIC)'`*b_'``''LI;
',`dnl
			}
',RSB_M4_SIMPLE_LOOP_UNROLL_DEFAULT_FACTOR_MEDIUM)
dnl
')dnl
dnl
',`dnl
dnl
	for(n=0;RSB_LIKELY(n<nnz);++n)
	{
		i=IA[n];
		j=JA[n];
dnl		assert(i< Mdim);
dnl		assert(j< mdim);
		out[tMi*incy]+=`'postmult`'RSB_M4_UIM_CONJ(VA[n],mtype,transposition,symmetry)*rhs[tmi*incx];
dnl
ifelse(RSB_M4_IS_NOT_UNSYMMETRIC(symmetry),1,`dnl
		if(RSB_LIKELY(tMi!=tmi))
			out[tmi*incy]+=`'postmult`'RSB_M4_CIM_CONJ(VA[n],mtype,transposition,symmetry)*rhs[tMi*incx];
')dnl
dnl
	}
dnl
')dnl
dnl
ifelse(RSB_M4_IS_NOT_UNSYMMETRIC(symmetry),1,`dnl
	if(roff!=coff)
	RSB_M4_SIMPLE_LOOP_UNROLL(`n',`LI',`0',`nnz',`dnl
		i=IA[n+LI];
		j=JA[n+LI];
dnl		assert(i< Mdim);
dnl		assert(j< mdim);
ifelse(transposition,RSB_M4_TRANS_N,`dnl
		out[Mi*incy]+=`'postmult`'RSB_M4_UIM_CONJ(VA[n+LI],mtype,transposition,symmetry)*rhs[mi*incx];
		tout[mi*incy]+=`'postmult`'RSB_M4_CIM_CONJ(VA[n+LI],mtype,transposition,symmetry)*trhs[Mi*incx];
',`dnl
		tout[tMi*incy]+=`'postmult`'RSB_M4_UIM_CONJ(VA[n+LI],mtype,transposition,symmetry)*trhs[tmi*incx];
		out[tmi*incy]+=`'postmult`'RSB_M4_CIM_CONJ(VA[n+LI],mtype,transposition,symmetry)*rhs[tMi*incx];
')dnl
dnl
	',RSB_M4_SIMPLE_LOOP_UNROLL_DEFAULT_FACTOR_SMALL)
')dnl
dnl
	return RSB_ERR_NO_ERROR;
')dnl
')dnl
dnl
ifelse(RSB_M4_AND(RSB_M4_IS_SPSX_KERNEL_MOP(mop)),1,`dnl
dnl
dnl	FIXME: and roff and coff ?
dnl
dnl
pushdef(`is_an_externally_backward_kernel',`RSB_M4_AND(RSB_M4_IS_SPSX_KERNEL_MOP(mop),RSB_M4_XOR(RSB_M4_NOT(RSB_M4_SAME(transposition,RSB_M4_TRANS_N)),RSB_M4_SAME(uplo,`u')))')dnl
pushdef(`is_vector_updating_spsv',RSB_M4_NOT(RSB_M4_SAME(transposition,RSB_M4_TRANS_N)))dnl
dnl
	rsb_coo_idx_t ii;
ifelse(is_an_externally_backward_kernel,1,`
	for(n=nnz-1,ii=Mdim-1;RSB_LIKELY(ii+1>0) ;--ii)
',`dnl
	for(n=0,ii=0;RSB_LIKELY(ii<Mdim);++ii)
')dnl
	{
		mtype ax;
ifelse(is_vector_updating_spsv,1,`dnl
ifelse(RSB_M4_IS_DIAGONAL_IMPLICIT(diagonal),1,`dnl
dnl	..
',`dnl
dnl		const mtype aa;
		mtype aa;
ifelse(RSB_M4_WANT_SPSM_DIAG_CHECK,1,`dnl
		if(n>=nnz)return RSB_ERR_INVALID_NUMERICAL_DATA;
')dnl
		aa=VA[n];
ifelse(RSB_M4_WANT_SPSM_DIAG_CHECK,1,`dnl
		if(VA[n]==RSB_M4_ZERO(mtype))return RSB_ERR_INVALID_NUMERICAL_DATA;
')dnl
ifelse(is_an_externally_backward_kernel,1,`
		n--;
',`dnl
		n++;
')dnl
		out[ii*incy]/=aa;
')dnl
		ax=out[ii*incy];
',`dnl
		ax=0;
')dnl
ifelse(is_an_externally_backward_kernel,1,`
		for(;RSB_LIKELY(n+1>0);--n)
',`dnl
		for(;RSB_LIKELY(n<nnz);++n)
')dnl
		{
			i=IA[n];
			j=JA[n];
ifelse(is_vector_updating_spsv,1,`dnl
			if(RSB_UNLIKELY(!(i==ii )))
',`dnl
			if(RSB_UNLIKELY(!(i==ii && j!=i)))
')dnl
				break;
ifelse(is_vector_updating_spsv,1,`dnl
			out[j*incy]-=RSB_M4_CONJ(VA[n],mtype,transposition,symmetry)*ax;
',`dnl
			ax += RSB_M4_CONJ(VA[n],mtype,transposition,symmetry)*out[j*incy];
')dnl
		}

ifelse(RSB_M4_IS_DIAGONAL_IMPLICIT(diagonal),1,`dnl
ifelse(is_vector_updating_spsv,1,`dnl
		out[ii*incy]=(`'postmult`'out[ii*incy]);
',`dnl
		out[ii*incy]=(`'postmult`'out[ii*incy]-ax);
')dnl
',`dnl
dnl
dnl	FIXME: goto err is illegal for nnz=0 ...
dnl
dnl		if(!(i==ii && i==j))
dnl			goto err;
ifelse(is_vector_updating_spsv,1,`dnl
		out[ii*incy]=(`'postmult`'out[ii*incy]);
',`dnl
ifelse(RSB_M4_WANT_SPSM_DIAG_CHECK,1,`dnl
		if(n==nnz || VA[n]==RSB_M4_ZERO(mtype))return RSB_ERR_INVALID_NUMERICAL_DATA;
')dnl
		out[ii*incy]=(`'postmult`'out[ii*incy]-ax)/VA[n];
ifelse(is_an_externally_backward_kernel,1,`dnl
		--n;
',`dnl
		++n;
')dnl
')dnl
')dnl
	}
	return RSB_ERR_NO_ERROR;
dnl err:
dnl	return RSB_ERR_BADARGS;
dnl
popdef(`is_an_externally_backward_kernel')dnl
popdef(`is_vector_updating_spsv')dnl
dnl
')dnl
dnl
dnl ifelse(RSB_M4_NOT(RSB_M4_IS_SPXX_TWO_VECTORS_OPERATING_KERNEL_MOP(mop)),1,`dnl
dnl 	return RSB_ERR_UNIMPLEMENTED_YET;
dnl ')dnl
dnl
ifelse(mop,`scale',`dnl
	for(n=0;RSB_LIKELY(n<nnz);++n)
	{
	dnl
dnl	FIXME: what about hermitian ?
dnl
ifelse(RSB_M4_SAME(transposition,RSB_M4_TRANS_N),1,`dnl
		i=IA[n];
		VA[n]*=scale_factors[i];
',`dnl
		j=JA[n];
dnl		i=IA[n];
dnl		VA[n]*=scale_factors[i];
		VA[n]*=scale_factors[j];
')dnl
dnl
	}
	return RSB_ERR_NO_ERROR;
')dnl
dnl
ifelse(RSB_M4_IS_ACC_WRITING_KERNEL_MOP(mop),`1',`dnl
	dnl
	dnl	TODO: do we need vector blank ?
	dnl
	for(n=0;RSB_LIKELY(n<nnz);++n)
	{
dnl
ifelse(RSB_M4_IS_UNSYMMETRIC(symmetry),1,`dnl
dnl
ifelse(RSB_M4_SAME(transposition,RSB_M4_TRANS_N),1,`dnl
		i=IA[n];
ifelse(mop,`infty_norm',`dnl
		row_sums[roff+i]+=RSB_M4_ABS(mtype,VA[n]);
')dnl
ifelse(mop,`rowssums',`dnl
		row_sums[roff+i]+=VA[n];
')dnl
',`dnl
		j=JA[n];
ifelse(mop,`infty_norm',`dnl
		row_sums[coff+j]+=RSB_M4_ABS(mtype,VA[n]);
')dnl
ifelse(mop,`rowssums',`dnl
		row_sums[coff+j]+=VA[n];
')dnl
')dnl
')dnl
dnl
dnl
ifelse(RSB_M4_IS_UNSYMMETRIC(symmetry),0,`dnl
dnl
		i=IA[n];
		j=JA[n];
dnl
ifelse(mop,`infty_norm',`dnl
		row_sums[roff+i]+=RSB_M4_ABS(mtype,VA[n]);
')dnl
ifelse(mop,`rowssums',`dnl
		row_sums[roff+i]+=VA[n];
')dnl
		if( roff+i != coff+j )
ifelse(mop,`infty_norm',`dnl
			row_sums[coff+j]+=RSB_M4_ABS(mtype,VA[n]);
')dnl
ifelse(mop,`rowssums',`dnl
			row_sums[coff+j]+=VA[n];
')dnl
')dnl
dnl
	}
	return RSB_ERR_NO_ERROR;
')dnl
dnl
ifelse(RSB_M4_IS_STRIDED_KERNEL_MOP(mop),`0',`dnl
popdef(`incx')dnl
popdef(`incy')dnl
')dnl
dnl
}
dnl } /* end of fid function */
dnl
')dnl
dnl
')dnl
dnl

popdef(`toskipbecauseofsymmetry')dnl
popdef(`htransposition')dnl
popdef(`ttransposition')dnl
popdef(`tsymmetry')dnl
popdef(`postmult')dnl
popdef(`tmi')dnl
popdef(`tMi')dnl
popdef(`mi')dnl
popdef(`Mi')dnl
popdef(`total_columns')dnl
popdef(`total_rows')dnl
popdef(`out_dim')dnl
popdef(`fid')dnl
dnl
popdef(`uplo')dnl
popdef(`diagonal')dnl
popdef(`citype')dnl
popdef(`mop')dnl
popdef(`unrolling')dnl
popdef(`itype')dnl
popdef(`b_columns')dnl
popdef(`b_rows')dnl
popdef(`symmetry')dnl
popdef(`transposition')dnl
popdef(`matrix_storage')dnl
popdef(`mtype')dnl
popdef(`want_what')dnl
')dnl
dnl
dnl
dnl
dnl	RSB_M4_BCOO_KERNEL_SIZE_DISPATCH_FUNCTION(want_what,mtype,matrix_storage,
dnl		transposition,symmetry,unrolling,unused,unused,mop,citype,diagonal,uplo)
dnl	------------------------------------------------------------------------
dnl	Emits one piece of the block size dispatcher of a BCOO kernel family,
dnl	selected by argument 1:
dnl	  DOC                   a placeholder comment;
dnl	  all                   function_declaration when ONLY_WANT_HEADERS is
dnl	                        defined, function_definition otherwise;
dnl	  function_declaration  return type, name, ARGS and a semicolon;
dnl	  function_definition   return type, name, ARGS and BODY;
dnl	  ARGS                  the parenthesised formal argument list, whose
dnl	                        shape depends on the matrix operation;
dnl	  BODY                  the braced C body: it derives the block size from
dnl	                        rpntr and cpntr (or a fallback when either is NULL)
dnl	                        and switches on rows and columns to call the
dnl	                        matching kernel.
dnl	Arguments 7 and 8 are placeholders and are not read.
dnl	Nothing is emitted unless RSB_M4_ARE_KERNEL_GENERATION_PARMS_ALLOWED
dnl	accepts the parameter combination.
dnl
define(`RSB_M4_BCOO_KERNEL_SIZE_DISPATCH_FUNCTION',`dnl
dnl
pushdef(`want_what',$1)dnl
pushdef(`mtype',$2)dnl
pushdef(`matrix_storage',$3)dnl
pushdef(`transposition',$4)dnl
pushdef(`symmetry',$5)dnl
pushdef(`unrolling',$6)dnl
dnl pushdef(`b_rows',$7)dnl		block rows
dnl pushdef(`b_columns',$8)dnl	block columns
pushdef(`itype',`rsb_coo_idx_t ')dnl integer type (for indices)
pushdef(`mop',`$9')dnl
pushdef(`citype',`$10')dnl
pushdef(`diagonal',`$11')dnl
pushdef(`uplo',$12)dnl
dnl
dnl
dnl
ifelse(RSB_M4_ARE_KERNEL_GENERATION_PARMS_ALLOWED(want_what,mtype,matrix_storage,transposition,symmetry,unrolling,,,mop,citype,diagonal,uplo),`1',`dnl
dnl
ifelse(want_what,`DOC',`dnl
	/*  TODO */
')dnl
dnl
ifelse(want_what,`all',`dnl
dnl `/* This code is intended for a block compressed sparse stripe matrix. */'
ifdef(`ONLY_WANT_HEADERS',`dnl
RSB_M4_BCOO_KERNEL_SIZE_DISPATCH_FUNCTION(`function_declaration',mtype,matrix_storage,transposition,symmetry,unrolling,,,mop,citype,diagonal,uplo)
',`dnl
RSB_M4_BCOO_KERNEL_SIZE_DISPATCH_FUNCTION(`function_definition',mtype,matrix_storage,transposition,symmetry,unrolling,,,mop,citype,diagonal,uplo)
')dnl
dnl
dnl
dnl
')dnl
dnl
ifelse(want_what,`function_definition',`dnl
rsb_err_t RSB_M4_KERNEL_SIZE_DISPATCH_FUNCTION_NAME(mtype,matrix_storage,transposition,symmetry,unrolling,mop,citype,diagonal,uplo)dnl
RSB_M4_BCOO_KERNEL_SIZE_DISPATCH_FUNCTION(`ARGS',mtype,matrix_storage,transposition,symmetry,unrolling,,,mop,citype,diagonal,uplo)
RSB_M4_BCOO_KERNEL_SIZE_DISPATCH_FUNCTION(`BODY',mtype,matrix_storage,transposition,symmetry,unrolling,,,mop,citype,diagonal,uplo)
')dnl
dnl
ifelse(want_what,`function_declaration',`dnl
rsb_err_t RSB_M4_KERNEL_SIZE_DISPATCH_FUNCTION_NAME(mtype,matrix_storage,transposition,symmetry,unrolling,mop,citype,diagonal,uplo)dnl
RSB_M4_BCOO_KERNEL_SIZE_DISPATCH_FUNCTION(`ARGS',mtype,matrix_storage,transposition,symmetry,unrolling,,,mop,citype,diagonal,uplo);dnl
')dnl
dnl
ifelse(want_what,`ARGS',`dnl
dnl
dnl	matrix_structs holds the matrix description arguments shared by every operation.
dnl
pushdef(`matrix_structs',`const itype Mdim,const itype mdim,const citype * RSB_M4_RESTRICT bindx,const rsb_nnz_idx_t * RSB_M4_RESTRICT bpntr,const rsb_nnz_idx_t *RSB_M4_RESTRICT indptr,const rsb_coo_idx_t * RSB_M4_RESTRICT rpntr,const rsb_coo_idx_t * RSB_M4_RESTRICT cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags,const rsb_nnz_idx_t nnz')dnl
(`'dnl
ifelse(RSB_M4_IS_SPXX_TWO_VECTORS_OPERATING_KERNEL_MOP(mop),1,`dnl
dnl
dnl	no restrict on aliasing ops
dnl
ifelse(RSB_M4_IS_ALLOWING_ALIASING_KERNEL_MOP(mop),1,`dnl
const mtype * RSB_M4_RESTRICT VA, const mtype * rhs, mtype * out, matrix_structs`'dnl
',`dnl
const mtype * RSB_M4_RESTRICT VA, const mtype * RSB_M4_RESTRICT rhs, mtype * RSB_M4_RESTRICT out, matrix_structs`'dnl
')dnl
')dnl
ifelse(RSB_M4_IS_OP_SCALING_KERNEL_MOP(mop),`1',`dnl
,const mtype * RSB_M4_RESTRICT alphap`'dnl
')dnl
ifelse(RSB_M4_IS_SCALING_KERNEL_MOP(mop),`1',`dnl
,const mtype * RSB_M4_RESTRICT betap`'dnl
')dnl
ifelse(RSB_M4_IS_STRIDED_KERNEL_MOP(mop),`1',`dnl
,rsb_coo_idx_t incx, rsb_coo_idx_t incy`'dnl
')dnl
ifelse(mop,`spmm_az',`dnl
dnl
dnl	FIXME
dnl
const itype bstride, const itype cstride, const itype nrhs`'dnl
')dnl
ifelse(mop,`scale',`dnl
mtype * VA, matrix_structs, const mtype *scale_factors`'dnl
')dnl
ifelse(RSB_M4_IS_ACC_WRITING_KERNEL_MOP(mop),`1',`dnl
const mtype * VA, mtype * row_sums, matrix_structs`'dnl
')dnl
ifelse(mop,`negation',`dnl
mtype * VA, matrix_structs`'dnl
')dnl
)dnl
dnl	Balance the pushdef above so the definition stack does not grow on every expansion.
popdef(`matrix_structs')dnl
dnl
')dnl
dnl
dnl
ifelse(want_what,`BODY',`dnl
dnl
dnl
{
	RSB_M4_DEBUGINFO(``$0'')dnl
dnl	/*!  \ingroup rsb_doc_kernels
	/*
	 * This function will dispatch the specialized looped kernel function for
	 * performing the desired matrix operation ("mop") for the current fixed
	 * block size.
	 *
	 * \return \rsb_errval_inp_param_msg
ifelse(RSB_M4_IS_FORMAT_BCOO(matrix_storage),1,`dnl
	 *
	 * Since this is strictly blocked code, you should allow the rhs and the out
	 * vector to accept a small overflow not bigger, respectively, than
	 *       mod(blockrows-mod(matrixrows,blockrows),blockrows)
	 * and
	 *       mod(blockcols-mod(matrixcols,blockcols),blockcols)
dnl	 *
dnl	 * Note: We assume this quantity is the same for each block.
dnl	 *
dnl	 * WARNING : EXPERIMENTAL FUNCTION
dnl	 * for block bigger than ~12x12 it seems that inline matrix multiplication code slows down the whole thing
')dnl
	 */
	register rsb_coo_idx_t columns,rows;
	rsb_err_t errval = RSB_ERR_NO_ERROR;
	if(cpntr && rpntr)
	{
		columns=cpntr[1]-cpntr[0];
		rows   =rpntr[1]-rpntr[0];
	}
	else
dnl #if RSB_EXPERIMENTAL_WANT_PURE_BCOO
ifelse(RSB_M4_WANT_20110206_BOUNDED_BOX_PATCH,1,`dnl
dnl 20110206	set the following
		columns = rows=1;	/* experimental, for the bounded box patch */
',`dnl
dnl 20110206	and commented the following
		columns=bc,rows=br;
')dnl
dnl #else
dnl 		columns = rows=1;
dnl #endif

ifelse(RSB_M4_IS_FORMAT_BCOO(matrix_storage),1,`dnl
pushdef(`args',`RSB_M4_ARGS_TO_ACTUAL_ARGS(RSB_M4_BCOO_KERNEL_SIZE_DISPATCH_FUNCTION(`ARGS',mtype,matrix_storage,transposition,symmetry,unrolling,,,mop,citype,diagonal,uplo))')dnl
switch(rows)
{
foreach(`rowsu',RSB_M4_ROWS_UNROLL,`dnl
	case rowsu:
	{switch(columns)
	{
foreach(`colsu',RSB_M4_COLUMNS_UNROLL,`dnl
		case colsu:/* rowsu colsu matrix_storage */
		errval = RSB_M4_BCOO_KERNEL_FUNCTION(`ID',mtype,matrix_storage,transposition,symmetry,rowsu,colsu,unrolling,mop,citype,diagonal,uplo)( args );
		break;
')dnl
	default:
#ifdef RSB_WANT_LOOPING_KERNELS
		errval = RSB_M4_BCOO_KERNEL_FUNCTION(`ID',mtype,matrix_storage,transposition,symmetry,rowsu,RSB_M4_COLUMNS_FALLBACK_UNROLL,`l',mop,citype,diagonal,uplo)( args );
#else /* RSB_WANT_LOOPING_KERNELS  */
	errval = RSB_ERR_UNSUPPORTED_OPERATION;
#endif /* RSB_WANT_LOOPING_KERNELS  */
	}}
	break;
')dnl
	default:
#ifdef RSB_WANT_LOOPING_KERNELS
		errval = RSB_M4_BCOO_KERNEL_FUNCTION(`ID',mtype,matrix_storage,transposition,symmetry,RSB_M4_ROWS_FALLBACK_UNROLL,RSB_M4_COLUMNS_FALLBACK_UNROLL,`l',mop,citype,diagonal,uplo)( args );
#else /* RSB_WANT_LOOPING_KERNELS */
	errval = RSB_ERR_UNSUPPORTED_OPERATION;
#endif /* RSB_WANT_LOOPING_KERNELS */
};
popdef(`args')dnl
')dnl
	dnl errval = RSB_ERR_UNSUPPORTED_TYPE;
	return errval;
}
dnl
')dnl
dnl
')dnl
dnl
popdef(`citype')dnl
popdef(`mop')dnl
popdef(`matrix_storage')dnl
dnl popdef(`b_rows')dnl
dnl popdef(`b_columns')dnl
popdef(`transposition')dnl
popdef(`symmetry')dnl
popdef(`mtype')dnl
popdef(`itype')dnl
popdef(`unrolling')dnl
popdef(`diagonal')dnl
popdef(`want_what')dnl
popdef(`uplo')dnl
')dnl
dnl
dnl
dnl
