1dnl
2dnl	@author: Michele Martone
3dnl
4dnl
5dnl
dnl
dnl	RSB_M4_ARE_KERNEL_GENERATION_PARMS_ALLOWED(want_what,mtype,matrix_storage,transposition,k_symmetry,unrolling,b_rows,b_columns,mop,citype,k_diagonal,uplo)
dnl	--------------------------------------------------------------------------------------------------------------------------------------------------------
dnl	Expands to 1 when the given kernel generation parameter combination
dnl	is admissible and to 0 otherwise.
dnl	Only mop and uplo take part in the decision:
dnl	 - triangular solve (SPSX) mops require a definite triangle, that is
dnl	   an uplo value different from g;
dnl	 - every other mop requires the general uplo value g.
dnl
define(`RSB_M4_ARE_KERNEL_GENERATION_PARMS_ALLOWED',`dnl
dnl
pushdef(`want_what',$1)dnl
pushdef(`mtype',$2)dnl
pushdef(`matrix_storage',$3)dnl
pushdef(`transposition',$4)dnl
pushdef(`k_symmetry',$5)dnl
pushdef(`unrolling',$6)dnl
pushdef(`b_rows',$7)dnl		block rows
pushdef(`b_columns',$8)dnl	block columns
pushdef(`itype',`rsb_coo_idx_t ')dnl integer type (for indices)
pushdef(`mop',`$9')dnl
pushdef(`citype',`$10')dnl
pushdef(`k_diagonal',`$11')dnl
pushdef(`uplo',$12)dnl
dnl
dnl	Both implications below must hold for the combination to be allowed.
RSB_M4_AND(dnl
RSB_M4_IMPLY(RSB_M4_IS_SPSX_KERNEL_MOP(mop),RSB_M4_NOT(RSB_M4_SAME(uplo,`g'))),dnl
RSB_M4_IMPLY(RSB_M4_NOT(RSB_M4_IS_SPSX_KERNEL_MOP(mop)),RSB_M4_SAME(uplo,`g')),dnl
1)`'dnl
dnl
dnl
dnl	Restore the previous definitions of all parameter macros.
popdef(`citype')dnl
popdef(`mop')dnl
popdef(`matrix_storage')dnl
popdef(`b_rows')dnl
popdef(`b_columns')dnl
popdef(`transposition')dnl
popdef(`k_symmetry')dnl
popdef(`mtype')dnl
popdef(`itype')dnl
popdef(`unrolling')dnl
popdef(`k_diagonal')dnl
popdef(`want_what')dnl
popdef(`uplo')dnl
')dnl
42dnl
43dnl
44dnl
45dnl
46dnl	These functions dispatch on the column size, calling the
47dnl	proper kernels.
48dnl
49dnl	They assume type dispatching has just been performed.
50dnl
51dnl
52dnl	RSB_M4_BCSS_KERNEL_SIZE_DISPATCH_FUNCTION(want_what,mtype,matrix_storage,transposition,k_symmetry,unrolling,,,mop,citype,k_diagonal,uplo)
53dnl	-----------------------------------------------------------------------------------------------------------------------------------
54dnl
dnl	RSB_M4_BCXX_KERNEL_SIZE_DISPATCH_FUNCTION(want_what,mtype,matrix_storage,transposition,k_symmetry,unrolling,,,mop,citype,k_diagonal,uplo)
dnl	-----------------------------------------------------------------------------------------------------------------------------------------
dnl	Storage format front end: forwards all arguments unchanged (via the
dnl	whole argument list) to the BCOO or the BCSS size dispatch function
dnl	generator, according to the matrix_storage argument.
define(`RSB_M4_BCXX_KERNEL_SIZE_DISPATCH_FUNCTION',`dnl
dnl
dnl
ifelse(RSB_M4_IS_FORMAT_BCOO(matrix_storage),`1',`dnl
dnl
dnl	Block coordinates storage: delegate to the BCOO generator.
RSB_M4_BCOO_KERNEL_SIZE_DISPATCH_FUNCTION($@)`'dnl
dnl
')dnl
dnl
dnl
ifelse(RSB_M4_IS_FORMAT_BCSS(matrix_storage),`1',`dnl
dnl
dnl	Block compressed sparse rows or columns storage: delegate to the BCSS generator.
RSB_M4_BCSS_KERNEL_SIZE_DISPATCH_FUNCTION($@)`'dnl
dnl
')dnl
dnl
dnl
dnl
')dnl
74dnl
75dnl
76dnl
dnl	RSB_M4_BCSS_KERNEL_SIZE_DISPATCH_FUNCTION(want_what,mtype,matrix_storage,transposition,k_symmetry,unrolling,,,mop,citype,k_diagonal,uplo)
dnl	-----------------------------------------------------------------------------------------------------------------------------------------
dnl	Generator of the BCSR/BCSC size dispatch function. The emitted C
dnl	function inspects the block size at run time and calls the kernel
dnl	specialized for that size. The want_what argument selects what to expand:
dnl	  all                  - prototype (under ONLY_WANT_HEADERS) or full definition
dnl	  function_declaration - function head plus argument list plus semicolon
dnl	  function_definition  - function head plus argument list plus body
dnl	  ARGS                 - only the parenthesized formal argument list
dnl	  BODY                 - only the body with the rows and columns dispatch switch
define(`RSB_M4_BCSS_KERNEL_SIZE_DISPATCH_FUNCTION',`dnl
dnl
pushdef(`want_what',$1)dnl
pushdef(`mtype',$2)dnl
pushdef(`matrix_storage',$3)dnl
pushdef(`transposition',$4)dnl
pushdef(`k_symmetry',$5)dnl
pushdef(`unrolling',$6)dnl
dnl pushdef(`b_rows',$7)dnl		block rows
dnl pushdef(`b_columns',$8)dnl	block columns
pushdef(`itype',`rsb_coo_idx_t ')dnl integer type (for indices)
pushdef(`mop',`$9')dnl
pushdef(`citype',`$10')dnl
pushdef(`k_diagonal',`$11')dnl
pushdef(`uplo',$12)dnl
dnl
dnl	Expand nothing at all for parameter combinations which are not allowed.
ifelse(RSB_M4_ARE_KERNEL_GENERATION_PARMS_ALLOWED(want_what,mtype,matrix_storage,transposition,k_symmetry,unrolling,,,mop,citype,k_diagonal,uplo),`1',`dnl
dnl
ifelse(want_what,`DOC',`dnl
	/*  TODO */
')dnl
dnl
ifelse(want_what,`all',`dnl
dnl `/* This code is intended for a block compressed sparse stripe matrix. */'
ifdef(`ONLY_WANT_HEADERS',`dnl
RSB_M4_BCSS_KERNEL_SIZE_DISPATCH_FUNCTION(`function_declaration',mtype,matrix_storage,transposition,k_symmetry,unrolling,,,mop,citype,k_diagonal,uplo)
',`dnl
RSB_M4_BCSS_KERNEL_SIZE_DISPATCH_FUNCTION(`function_definition',mtype,matrix_storage,transposition,k_symmetry,unrolling,,,mop,citype,k_diagonal,uplo)
')dnl
dnl
dnl
dnl
')dnl
dnl
ifelse(want_what,`function_definition',`dnl
rsb_err_t RSB_M4_KERNEL_SIZE_DISPATCH_FUNCTION_NAME(mtype,matrix_storage,transposition,k_symmetry,unrolling,mop,citype,k_diagonal,uplo)dnl
RSB_M4_BCSS_KERNEL_SIZE_DISPATCH_FUNCTION(`ARGS',mtype,matrix_storage,transposition,k_symmetry,unrolling,,,mop,citype,k_diagonal,uplo)
RSB_M4_BCSS_KERNEL_SIZE_DISPATCH_FUNCTION(`BODY',mtype,matrix_storage,transposition,k_symmetry,unrolling,,,mop,citype,k_diagonal,uplo)
')dnl
dnl
ifelse(want_what,`function_declaration',`dnl
rsb_err_t RSB_M4_KERNEL_SIZE_DISPATCH_FUNCTION_NAME(mtype,matrix_storage,transposition,k_symmetry,unrolling,mop,citype,k_diagonal,uplo)dnl
RSB_M4_BCSS_KERNEL_SIZE_DISPATCH_FUNCTION(`ARGS',mtype,matrix_storage,transposition,k_symmetry,unrolling,,,mop,citype,k_diagonal,uplo);dnl
')dnl
dnl
ifelse(want_what,`ARGS',`dnl
dnl
dnl
dnl	matrix_structs expands to the argument list fragment shared by every mop.
pushdef(`matrix_structs',`const itype Mdim,const itype mdim,const citype * RSB_M4_RESTRICT bindx,const rsb_nnz_idx_t * RSB_M4_RESTRICT bpntr,const rsb_nnz_idx_t *RSB_M4_RESTRICT indptr,const rsb_coo_idx_t * RSB_M4_RESTRICT rpntr,const rsb_coo_idx_t * RSB_M4_RESTRICT cpntr,const rsb_coo_idx_t br,const rsb_coo_idx_t bc,const rsb_coo_idx_t roff,const rsb_coo_idx_t coff,const rsb_flags_t flags')dnl
(`'dnl
ifelse(RSB_M4_IS_SPXX_TWO_VECTORS_OPERATING_KERNEL_MOP(mop),1,`dnl
dnl
dnl	no restrict on aliasing ops
dnl
ifelse(RSB_M4_IS_ALLOWING_ALIASING_KERNEL_MOP(mop),1,`dnl
const mtype * RSB_M4_RESTRICT VA, const mtype * rhs, mtype * out, matrix_structs`'dnl
',`dnl
const mtype * RSB_M4_RESTRICT VA, const mtype * RSB_M4_RESTRICT rhs, mtype * RSB_M4_RESTRICT out, matrix_structs`'dnl
')dnl
')dnl
ifelse(RSB_M4_IS_OP_SCALING_KERNEL_MOP(mop),`1',`dnl
,const mtype * RSB_M4_RESTRICT alphap`'dnl
')dnl
ifelse(RSB_M4_IS_SCALING_KERNEL_MOP(mop),`1',`dnl
,const mtype * RSB_M4_RESTRICT betap`'dnl
')dnl
ifelse(RSB_M4_IS_STRIDED_KERNEL_MOP(mop),`1',`dnl
,rsb_coo_idx_t incx, rsb_coo_idx_t incy`'dnl
')dnl
ifelse(mop,`spmm_az',`dnl
dnl
dnl	FIXME
dnl
const itype bstride, const itype cstride, const itype nrhs`'dnl
')dnl
ifelse(mop,`scale',`dnl
mtype * VA, matrix_structs, const mtype *scale_factors`'dnl
')dnl
ifelse(RSB_M4_IS_ACC_WRITING_KERNEL_MOP(mop),`1',`dnl
const mtype * VA, mtype * row_sums, matrix_structs`'dnl
')dnl
ifelse(mop,`negation',`dnl
mtype * VA, matrix_structs`'dnl
')dnl
)dnl
dnl
')dnl
dnl
dnl
ifelse(want_what,`BODY',`dnl
dnl
dnl
{
	RSB_M4_DEBUGINFO(``$0'')dnl
dnl	/*!  \ingroup rsb_doc_kernels
	/*
	 * This function will dispatch the specialized looped kernel function for
	 * performing the desired matrix operation ("mop") for the current fixed
	 * block size.
	 *
	 * \return \rsb_errval_inp_param_msg
ifelse(RSB_M4_IS_FORMAT_BCSS(matrix_storage),1,`dnl
	 *
	 * Since this is strictly blocked code, you should allow the rhs and the out
	 * vector to accept a small overflow not bigger, respectively, than
	 *       mod(blockrows-mod(matrixrows,blockrows),blockrows)
	 * and
	 *       mod(blockcols-mod(matrixcols,blockcols),blockcols)
dnl	 *
dnl	 * Note: We assume this quantity is the same for each block.
dnl	 *
dnl	 * WARNING : EXPERIMENTAL FUNCTION
dnl	 * for block bigger than ~12x12 it seems that inline matrix multiplication code slows down the whole thing
')dnl
	 */
	rsb_err_t errval = RSB_ERR_NO_ERROR;

ifelse(RSB_M4_IS_FORMAT_BCSS(matrix_storage),1,`dnl
dnl	args expands to the actual argument list forwarded to the specialized kernels.
pushdef(`args',`RSB_M4_ARGS_TO_ACTUAL_ARGS(RSB_M4_BCSS_KERNEL_SIZE_DISPATCH_FUNCTION(`ARGS',mtype,matrix_storage,transposition,k_symmetry,unrolling,,,mop,citype,k_diagonal,uplo))')dnl

dnl	Detect the block size from the row and column pointer arrays when
dnl	available and fall back to the br and bc arguments otherwise.
	register rsb_coo_idx_t columns,rows;
	if(cpntr && rpntr)
	{
		columns=cpntr[1]-cpntr[0];
		rows   =rpntr[1]-rpntr[0];
	}
	else
dnl #if RSB_EXPERIMENTAL_WANT_PURE_BCSS
ifelse(RSB_M4_WANT_20110206_BOUNDED_BOX_PATCH,1,`dnl
dnl 20110206	set the following
		columns = rows=1;	/* experimental, for the bounded box patch */
',`dnl
dnl 20110206	and commented the following
		columns=bc,rows=br;
')dnl
dnl #else
dnl 		columns = rows=1;
dnl #endif

dnl	Dispatch on the detected block size; sizes without a specialized
dnl	kernel fall back to the looping kernel when RSB_WANT_LOOPING_KERNELS
dnl	is defined and report an unsupported operation otherwise.
switch(rows)
{
foreach(`rowsu',RSB_M4_ROWS_UNROLL,`dnl
	case rowsu:
	{switch(columns)
	{
foreach(`colsu',RSB_M4_COLUMNS_UNROLL,`dnl
		case colsu:/* rowsu colsu matrix_storage */
		errval = RSB_M4_BCSS_KERNEL_FUNCTION(`ID',mtype,matrix_storage,transposition,k_symmetry,rowsu,colsu,unrolling,mop,citype,k_diagonal,uplo)( args );
		break;
')dnl
	default:
#ifdef RSB_WANT_LOOPING_KERNELS
		errval = RSB_M4_BCSS_KERNEL_FUNCTION(`ID',mtype,matrix_storage,transposition,k_symmetry,rowsu,RSB_M4_COLUMNS_FALLBACK_UNROLL,`l',mop,citype,k_diagonal,uplo)( args );
#else /* RSB_WANT_LOOPING_KERNELS */
	errval = RSB_ERR_UNSUPPORTED_OPERATION;
#endif /* RSB_WANT_LOOPING_KERNELS */
	}}
	break;
')dnl
	default:
#ifdef RSB_WANT_LOOPING_KERNELS
		errval = RSB_M4_BCSS_KERNEL_FUNCTION(`ID',mtype,matrix_storage,transposition,k_symmetry,RSB_M4_ROWS_FALLBACK_UNROLL,RSB_M4_COLUMNS_FALLBACK_UNROLL,`l',mop,citype,k_diagonal,uplo)( args );
#else /* RSB_WANT_LOOPING_KERNELS */
	errval = RSB_ERR_UNSUPPORTED_OPERATION;
#endif /* RSB_WANT_LOOPING_KERNELS */
};
popdef(`args')dnl
')dnl
	dnl errval = RSB_ERR_UNSUPPORTED_TYPE;
	return errval;
}
dnl
')dnl
dnl
')dnl
dnl
dnl	Restore the previous definitions of all parameter macros.
popdef(`citype')dnl
popdef(`mop')dnl
popdef(`matrix_storage')dnl
dnl popdef(`b_rows')dnl
dnl popdef(`b_columns')dnl
popdef(`transposition')dnl
popdef(`k_symmetry')dnl
popdef(`mtype')dnl
popdef(`itype')dnl
popdef(`unrolling')dnl
popdef(`k_diagonal')dnl
popdef(`want_what')dnl
popdef(`uplo')dnl
')dnl
267dnl
268dnl
269dnl
270dnl
271dnl	These functions will perform their operations on fixed block matrices.
272dnl
dnl	RSB_M4_BXXX_KERNEL_FUNCTION_HAS_IMPLEMENTATION(want_what,mtype,matrix_storage,transposition,k_symmetry,b_rows,b_columns,unrolling,mop,citype,k_diagonal,uplo)
dnl	------------------------------------------------------------------------------------------------------------------------------------------------------------
dnl	Expands to 1 when a specialized kernel implementation exists for the
dnl	given parameter combination and to 0 when generation is blocked.
dnl	Note the argument order here: unlike the size dispatch generators,
dnl	b_rows is argument 6, b_columns is argument 7, unrolling is argument 8.
define(`RSB_M4_BXXX_KERNEL_FUNCTION_HAS_IMPLEMENTATION',`dnl
dnl
dnl
pushdef(`want_what',$1)dnl
pushdef(`mtype',$2)dnl
pushdef(`matrix_storage',$3)dnl
pushdef(`transposition',$4)dnl
pushdef(`k_symmetry',$5)dnl
pushdef(`b_rows',$6)dnl		block rows
pushdef(`b_columns',$7)dnl	block columns
pushdef(`itype',`rsb_coo_idx_t ')dnl integer type (for indices)
pushdef(`unrolling',$8)dnl
pushdef(`mop',$9)dnl
pushdef(`citype',$10)dnl
pushdef(`k_diagonal',$11)dnl
pushdef(`uplo',$12)dnl
dnl
ifelse(dnl
dnl
dnl	The following are cases which are NOT implemented.
dnl	Each line emits a non empty character (`*') to block an implementation.
dnl
dnl	CSC SPSV gets blocked:
dnl ifelse(RSB_M4_AND(RSB_M4_IS_SPSX_KERNEL_MOP(mop),RSB_M4_IS_FORMAT_COLUMN_MAJOR(matrix_storage),RSB_M4_NOT(transposed)),1,`no',`')`'dnl
dnl	CSR transposed SPSV gets blocked:
dnl ifelse(RSB_M4_AND(RSB_M4_IS_SPSX_KERNEL_MOP(mop),RSB_M4_IS_FORMAT_ROW_MAJOR(matrix_storage),transposed),1,`no'`')dnl
dnl	SPSV for non 1x1 blockings gets blocked
ifelse(RSB_M4_IS_SPSX_KERNEL_MOP(mop),1,ifelse(RSB_M4_AND(RSB_M4_SAME(b_rows,1),RSB_M4_SAME(b_columns,1)),`1',`',`no'))`'dnl
dnl
dnl	any symmetric kernel for non 1x1 blockings gets blocked
dnl	TODO : should modify RSB_M4_EXTRA_SYMMETRIC_DIAGONAL_FIXING_KERNEL to support k_symmetry and blocking
ifelse(RSB_M4_OR(RSB_M4_IS_NOT_UNSYMMETRIC(k_symmetry),RSB_M4_AND(RSB_M4_SAME(b_rows,1),RSB_M4_SAME(b_columns,1))),1,`',`no')`'dnl
dnl	any SPSV symmetric gets blocked
ifelse(RSB_M4_IS_SPSX_KERNEL_MOP(mop),1,ifelse(RSB_M4_IS_NOT_UNSYMMETRIC(k_symmetry),1,`no',`'))`'dnl
dnl
dnl	If none of the blocking cases above emitted a marker the collected
dnl	string is empty and we expand to 1, otherwise to 0.
,`',`1',`0')dnl
dnl
dnl	Restore the previous definitions of all parameter macros.
popdef(`uplo')dnl
popdef(`want_what')dnl
popdef(`k_diagonal')dnl
popdef(`citype')dnl
popdef(`mop')dnl
popdef(`matrix_storage')dnl
popdef(`k_symmetry')dnl
popdef(`transposition')dnl
dnl	The two popdefs below were missing: b_rows and b_columns were pushed
dnl	above but never popped, leaking one stacked definition per expansion.
popdef(`b_rows')dnl
popdef(`b_columns')dnl
popdef(`mtype')dnl
popdef(`itype')dnl
popdef(`unrolling')dnl
')dnl
322dnl
323dnl
324dnl
325dnl
326dnl
dnl	RSB_M4_BXXX_KERNEL_FUNCTION_HELP(want_what,mtype,matrix_storage,transposition,k_symmetry,b_rows,b_columns,unrolling,mop,citype,k_diagonal,uplo)
dnl	----------------------------------------------------------------------------------------------------------------------------------------------
dnl	Emits the Doxygen comment block documenting a generated kernel
dnl	function: a per mop formula describing the operation, plus the
dnl	appropriate return value note depending on whether an actual
dnl	implementation exists (see RSB_M4_BXXX_KERNEL_FUNCTION_HAS_IMPLEMENTATION).
dnl	Note the argument order: b_rows is argument 6, b_columns is argument 7,
dnl	unrolling is argument 8.
define(`RSB_M4_BXXX_KERNEL_FUNCTION_HELP',`dnl
dnl
dnl
pushdef(`want_what',$1)dnl
pushdef(`mtype',$2)dnl
pushdef(`matrix_storage',$3)dnl
pushdef(`transposition',$4)dnl
pushdef(`k_symmetry',$5)dnl
pushdef(`b_rows',$6)dnl		block rows
pushdef(`b_columns',$7)dnl	block columns
pushdef(`itype',`rsb_coo_idx_t ')dnl integer type (for indices)
pushdef(`unrolling',$8)dnl
pushdef(`mop',$9)dnl
pushdef(`citype',$10)dnl
pushdef(`k_diagonal',$11)dnl
pushdef(`uplo',$12)dnl
dnl
	/**
	 * \ingroup rsb_doc_kernels
ifelse(RSB_M4_MEMBER(mop,`spsv_uxua'),1,`dnl
	 * Computes \f$y \leftarrow RSB_M4_TRANSPOSITION_OP_EFFECT(transposition,`A')^{-1} \cdot x, where RSB_M4_SYMMETRY_EFFECT(k_symmetry,`A'). \f$
')dnl
ifelse(mop,`spmv_unua',`dnl
	 * Computes \f$y \leftarrow y - RSB_M4_TRANSPOSITION_OP_EFFECT(transposition,`A') \cdot x, where RSB_M4_SYMMETRY_EFFECT(k_symmetry,`A'). \f$
')dnl
ifelse(mop,`spmv_uaua',`dnl
	 * Computes \f$y \leftarrow y + RSB_M4_TRANSPOSITION_OP_EFFECT(transposition,`A') \cdot x, where RSB_M4_SYMMETRY_EFFECT(k_symmetry,`A'). \f$
')dnl
ifelse(mop,`spmv_sxsa',`dnl
	 * Computes \f$y \leftarrow \beta \cdot y + \alpha \cdot RSB_M4_TRANSPOSITION_OP_EFFECT(transposition,`A') \cdot x, where RSB_M4_SYMMETRY_EFFECT(k_symmetry,`A'). \f$
	 * with incx and incy as x and y vector strides
')dnl
ifelse(mop,`spmv_sxsx',`dnl
	 * Computes \f$y \leftarrow \beta \cdot y + \alpha \cdot RSB_M4_TRANSPOSITION_OP_EFFECT(transposition,`A') \cdot x, where RSB_M4_SYMMETRY_EFFECT(k_symmetry,`A'). \f$
	 * with incx and incy as x and y vector strides
')dnl
ifelse(mop,`spmv_sasa',`dnl
	 * Computes \f$y \leftarrow y + RSB_M4_TRANSPOSITION_OP_EFFECT(transposition,`A') \cdot x, where RSB_M4_SYMMETRY_EFFECT(k_symmetry,`A'). \f$
')dnl
ifelse(mop,`spmv_uxua',`dnl
	 * Computes \f$y \leftarrow y + \alpha \cdot RSB_M4_TRANSPOSITION_OP_EFFECT(transposition,`A') \cdot x, where RSB_M4_SYMMETRY_EFFECT(k_symmetry,`A'). \f$
')dnl
ifelse(mop,`spmv_uxux',`dnl
	 * Computes \f$y \leftarrow \beta \cdot y + \alpha \cdot RSB_M4_TRANSPOSITION_OP_EFFECT(transposition,`A') \cdot x, where RSB_M4_SYMMETRY_EFFECT(k_symmetry,`A'). \f$
')dnl
ifelse(mop,`spmm_az',`dnl
	 * Computes \f$y \leftarrow RSB_M4_TRANSPOSITION_OP_EFFECT(transposition,`A') \cdot x, where RSB_M4_SYMMETRY_EFFECT(k_symmetry,`A'). \f$
')dnl
ifelse(mop,`infty_norm',`dnl
	 * Computes \f$ \|A\|_{\infty} \f$ (or rather, \f$ row\_sums_i \leftarrow \sum_{j=0}^{mdim} A_{ij} ), where RSB_M4_SYMMETRY_EFFECT(k_symmetry,`A').\f$
')dnl
ifelse(mop,`rowssums',`dnl
	 * Computes \f$ \|A\|_{1} \f$ (or rather, \f$ row\_sums_i \leftarrow \sum_{i=0}^{Mdim} A^{T}_{ij} ), where RSB_M4_SYMMETRY_EFFECT(k_symmetry,`A').\f$
')dnl
ifelse(mop,`spmv_uauz',`dnl
	 * Computes \f$y \leftarrow RSB_M4_TRANSPOSITION_OP_EFFECT(transposition,`A') \cdot x, where RSB_M4_SYMMETRY_EFFECT(k_symmetry,`A'). \f$
')dnl
ifelse(mop,`scale',`dnl
	 * Computes \f$A \leftarrow A\cdot P, P_{ii}=s_{i}, where RSB_M4_SYMMETRY_EFFECT(k_symmetry,`A').\f$
')dnl
ifelse(mop,`negation',`dnl
	 * Computes \f$A \leftarrow - A \f$
')dnl
	 * Matrix A should be blocked b_rows x b_columns, stored in matrix_storage format, RSB_M4_MATRIX_DIAGONAL_DENOMINATION(k_diagonal), of `type' mtype, with citype column indices.
dnl
ifelse(RSB_M4_BXXX_KERNEL_FUNCTION_HAS_IMPLEMENTATION($@),`1',`dnl
	 * \return \rsb_errval_inp_param_msg
	 */
',`dnl
dnl	FIXME: the return error is not always adequate, here.
	 * \return RSB_ERR_UNIMPLEMENTED_YET (this function is not implemented).
	 */
dnl	/* or RSB_ERR_UNSUPPORTED_FEATURE ? */
')dnl
dnl
dnl	Restore the previous definitions of all parameter macros.
popdef(`uplo')dnl
popdef(`want_what')dnl
popdef(`k_diagonal')dnl
popdef(`citype')dnl
popdef(`mop')dnl
popdef(`matrix_storage')dnl
popdef(`k_symmetry')dnl
popdef(`transposition')dnl
dnl	The two popdefs below were missing: b_rows and b_columns were pushed
dnl	above but never popped, leaking one stacked definition per expansion.
popdef(`b_rows')dnl
popdef(`b_columns')dnl
popdef(`mtype')dnl
popdef(`itype')dnl
popdef(`unrolling')dnl
')dnl
414dnl
415dnl
416dnl
417dnl
418dnl
419dnl	These functions will perform their operations on fixed block matrices.
420dnl
421define(`RSB_M4_BCSS_KERNEL_FUNCTION',`dnl
422dnl
423dnl
424pushdef(`want_what',$1)dnl
425pushdef(`mtype',$2)dnl
426pushdef(`matrix_storage',$3)dnl
427pushdef(`transposition',$4)dnl
428pushdef(`k_symmetry',$5)dnl
429pushdef(`b_rows',$6)dnl		block rows
430pushdef(`b_columns',$7)dnl	block columns
431pushdef(`itype',`rsb_coo_idx_t ')dnl integer type (for indices)
432pushdef(`unrolling',$8)dnl
433pushdef(`mop',$9)dnl
434pushdef(`citype',$10)dnl
435pushdef(`k_diagonal',$11)dnl
436pushdef(`uplo',$12)dnl
437dnl
438ifelse(RSB_M4_ARE_KERNEL_GENERATION_PARMS_ALLOWED(want_what,mtype,matrix_storage,transposition,k_symmetry,unrolling,,,mop,citype,k_diagonal,uplo),`1',`dnl
439dnl
440ifelse(want_what,`all',`dnl
441dnl
442ifelse(RSB_M4_IS_FORMAT_BCSS(matrix_storage),1,`dnl
443rsb_err_t RSB_M4_BCSS_KERNEL_FUNCTION(`ID',mtype,matrix_storage,transposition,k_symmetry,b_rows,b_columns,unrolling,mop,citype,k_diagonal,uplo)dnl
444RSB_M4_BCSS_KERNEL_FUNCTION(`ARGS',mtype,matrix_storage,transposition,k_symmetry,b_rows,b_columns,unrolling,mop,citype,k_diagonal,uplo)dnl
445')dnl
446ifdef(`ONLY_WANT_HEADERS',`;
447',`
448RSB_M4_BCSS_KERNEL_FUNCTION(`BODY',mtype,matrix_storage,transposition,k_symmetry,b_rows,b_columns,unrolling,mop,citype,k_diagonal,uplo)dnl
449')dnl
450')dnl
451dnl
452ifelse(want_what,`ID',`dnl
453RSB_M4_KERNEL_DIRECT_DISPATCHER_FUNCTION_NAME(mtype,matrix_storage,transposition,k_symmetry,b_rows,b_columns,unrolling,mop,citype,k_diagonal,uplo)`'dnl
454')dnl
455dnl
456ifelse(want_what,`ARGS',`dnl
457RSB_M4_BCSS_KERNEL_SIZE_DISPATCH_FUNCTION(`ARGS',mtype,matrix_storage,transposition,k_symmetry,unrolling,,,mop,citype,k_diagonal,uplo)`'dnl
458')dnl
459dnl
460ifelse(want_what,`BODY',`dnl
461dnl
462{
463dnl
464dnl	The body of a CSR/CSC computational kernel.
465dnl
466dnl	RSB_M4_DEBUGINFO(``$0'')dnl
467dnl
468ifelse(RSB_M4_IS_FORMAT_BCSS(matrix_storage),1,`dnl
469dnl
470pushdef(`total_block_columns',ifelse(RSB_M4_IS_FORMAT_COLUMN_MAJOR(matrix_storage),1,`Mdim',`mdim'))dnl
471pushdef(`total_block_rows',ifelse(RSB_M4_IS_FORMAT_COLUMN_MAJOR(matrix_storage),1,`mdim',`Mdim'))dnl
472pushdef(`total_rows',ifelse(unrolling,`l',rpntr[total_block_rows],total_block_rows*b_rows))dnl
473pushdef(`total_columns',ifelse(unrolling,`l',cpntr[total_block_columns],total_block_columns*b_columns))dnl
474dnl
475ifelse(RSB_M4_IS_FORMAT_COLUMN_MAJOR(matrix_storage),1,`dnl
476pushdef(`mi',`i')dnl
477pushdef(`Mi',`j')dnl
478')dnl
479ifelse(RSB_M4_IS_FORMAT_ROW_MAJOR(matrix_storage),1,`dnl
480pushdef(`mi',`j')dnl
481pushdef(`Mi',`i')dnl
482')dnl
483dnl
484dnl	FIXME : out_dim should depend on the operation!
485dnl
486pushdef(`out_dim',ifelse(transposition,RSB_M4_TRANS_N,total_rows,total_columns))dnl
487dnl
488pushdef(`is_zero_acc_spsv_kernel',`RSB_M4_AND(RSB_M4_IS_SPSX_KERNEL_MOP(mop),RSB_M4_OR(RSB_M4_AND(RSB_M4_IS_FORMAT_ROW_MAJOR(matrix_storage),RSB_M4_SAME(transposition,RSB_M4_TRANS_N)),RSB_M4_AND(RSB_M4_IS_FORMAT_COLUMN_MAJOR(matrix_storage),RSB_M4_NOT(RSB_M4_SAME(transposition,RSB_M4_TRANS_N)))))')dnl
489dnl pushdef(`is_zero_acc_spsv_kernel',`RSB_M4_AND(RSB_M4_IS_SPSX_KERNEL_MOP(mop),RSB_M4_OR(RSB_M4_AND(RSB_M4_IS_FORMAT_ROW_MAJOR(matrix_storage),RSB_M4_SAME(transposition,RSB_M4_TRANS_N))))')dnl
490dnl
491pushdef(`is_diag_d_spsv_kernel',`RSB_M4_AND(RSB_M4_IS_SPSX_KERNEL_MOP(mop),RSB_M4_NOT(RSB_M4_OR(RSB_M4_AND(RSB_M4_IS_FORMAT_ROW_MAJOR(matrix_storage),RSB_M4_SAME(transposition,RSB_M4_TRANS_N)),RSB_M4_AND(RSB_M4_IS_FORMAT_COLUMN_MAJOR(matrix_storage),RSB_M4_NOT(RSB_M4_SAME(transposition,RSB_M4_TRANS_N))))))')dnl
492dnl
493dnl pushdef(`is_an_externally_backward_kernel',`RSB_M4_AND(RSB_M4_IS_SPSX_KERNEL_MOP(mop),RSB_M4_XOR(RSB_M4_IS_FORMAT_ROW_MAJOR(matrix_storage),RSB_M4_SAME(transposition,RSB_M4_TRANS_N)))')dnl
494dnl pushdef(`is_an_externally_backward_kernel',`RSB_M4_AND(RSB_M4_IS_SPSX_KERNEL_MOP(mop),RSB_M4_NOT(RSB_M4_SAME(transposition,RSB_M4_TRANS_N)))')dnl
495pushdef(`is_an_externally_backward_kernel',`RSB_M4_AND(RSB_M4_IS_SPSX_KERNEL_MOP(mop),RSB_M4_XOR(RSB_M4_NOT(RSB_M4_SAME(transposition,RSB_M4_TRANS_N)),RSB_M4_SAME(uplo,`u')))')dnl
496dnl
497pushdef(`is_a_backward_kernel',is_an_externally_backward_kernel)dnl
498dnl pushdef(`is_a_backward_kernel',`RSB_M4_AND(RSB_M4_IS_SPSX_KERNEL_MOP(mop),RSB_M4_IS_FORMAT_COLUMN_MAJOR(matrix_storage),RSB_M4_NOT(RSB_M4_SAME(transposition,RSB_M4_TRANS_N)))')dnl
499dnl
500pushdef(`block_backward',`ifelse(is_a_backward_kernel,1,`a += rows*columns',`a -= rows*columns')')dnl
501pushdef(`block_forward',`ifelse(is_a_backward_kernel,1,`a -= rows*columns',`a += rows*columns')')dnl
502dnl
503dnl
504dnl	FIXME : and so the stride x/y association
505dnl
506dnl pushdef(`extra_xstride',ifelse(RSB_M4_IS_STRIDED_KERNEL_MOP(mop),1,`incx',`0'))dnl
507dnl pushdef(`extra_ystride',ifelse(RSB_M4_IS_STRIDED_KERNEL_MOP(mop),1,`incy',`0'))dnl
508pushdef(`xstride',ifelse(RSB_M4_IS_STRIDED_KERNEL_MOP(mop),1,`(incx)',`1'))dnl
509pushdef(`ystride',ifelse(RSB_M4_IS_STRIDED_KERNEL_MOP(mop),1,`(incy)',`1'))dnl
510pushdef(`extra_xstride',ifelse(RSB_M4_IS_STRIDED_KERNEL_MOP(mop),1,`xstride',`1'))dnl
511pushdef(`extra_ystride',ifelse(RSB_M4_IS_STRIDED_KERNEL_MOP(mop),1,`ystride',`1'))dnl
512dnl
513dnl	NEW:
514dnl
515pushdef(`transposed',ifelse(transposition,RSB_M4_TRANS_N,0,1))dnl
516dnl pushdef(`transposed',dnl
517dnl ifelse(RSB_M4_IS_FORMAT_ROW_MAJOR(matrix_storage),1,eval(transposed),eval(1-transposed))dnl
518dnl )dnl
519dnl
520dnl
521pushdef(`brin',`(i*extra_ystride)')dnl
522pushdef(`bcin',`(j*extra_xstride)')dnl
523dnl
524ifelse(transposed,`1',`dnl
525dnl
526dnl	block row index, block column index
527dnl
528pushdef(`bci',`(i*extra_xstride)')dnl
529pushdef(`bri',`(j*extra_ystride)')dnl
530pushdef(`bcit',`(j*extra_xstride)')dnl
531pushdef(`brit',`(i*extra_ystride)')dnl
532',`dnl
533pushdef(`bri',`(i*extra_ystride)')dnl
534pushdef(`bci',`(j*extra_xstride)')dnl
535pushdef(`brit',`(j*extra_ystride)')dnl
536pushdef(`bcit',`(i*extra_xstride)')dnl
537')dnl
538dnl
539pushdef(`should_init_out_vector_before_outer_loop',`dnl
540RSB_M4_OR(RSB_M4_IS_SCALING_KERNEL_MOP(mop),dnl
541RSB_M4_AND(RSB_M4_IS_FORMAT_COLUMN_MAJOR(matrix_storage),RSB_M4_NOT(eval(transposed))),dnl
542RSB_M4_AND(RSB_M4_IS_FORMAT_ROW_MAJOR(matrix_storage),eval(transposed)),dnl
543RSB_M4_IS_NOT_UNSYMMETRIC(k_symmetry))dnl
544')dnl
545dnl
546dnl
547dnl
548pushdef(`has_implementation',`dnl
549RSB_M4_BXXX_KERNEL_FUNCTION_HAS_IMPLEMENTATION($@)`'dnl
550')dnl
551dnl
552')
553RSB_M4_BXXX_KERNEL_FUNCTION_HELP($@)
554ifelse(RSB_M4_AND(RSB_M4_IS_SPMX_KERNEL_MOP(mop),RSB_M4_IS_DIAGONAL_IMPLICIT(k_diagonal)),1,`dnl
555	RSB_M4_FAKE_DIAG_IMPLICIT_MSG
556')dnl
557ifelse(has_implementation,`1',`dnl
558',`dnl
559dnl	/* or RSB_ERR_UNSUPPORTED_FEATURE ? */
560	return RSB_ERR_UNIMPLEMENTED_YET;
561')dnl
562dnl
563ifelse(has_implementation,`1',`dnl
564dnl	Comments
565dnl
566ifelse(RSB_M4_AND(RSB_M4_IS_SPMX_KERNEL_MOP(mop),RSB_M4_IS_NOT_UNSYMMETRIC(k_symmetry)),1,`dnl
567	/*
568ifelse(RSB_M4_want_verbose_comments,`1',`dnl
569		WARNING : This function assumes the matrix symmetric, and therefore
570		will write the output vector in the 0,Mdim and -roff+coff,-roff+coff+Mdim range.
571		So if you are using this function in a parallel environment, you should care about
572		proper locking of the output vectors.
573')dnl
574ifelse(RSB_M4_AND(RSB_M4_IS_SPMX_SCALING_KERNEL_MOP(mop),RSB_M4_IS_NOT_UNSYMMETRIC(k_symmetry)),1,`dnl
575		The output vector zero-ing is impacted, too, so if you are using this kernel with
576		recursive storage, you should care about the proper zeroing of the whole output vector.
577')dnl
578	*/
579')dnl
580dnl
581dnl
582dnl
583dnl
584ifelse(RSB_M4_OR(RSB_M4_AND(RSB_M4_NOT(RSB_M4_IS_COMPLEX_TYPE(type)),RSB_M4_IS_NOT_UNSYMMETRIC(k_symmetry),RSB_M4_NOT(transposition,RSB_M4_TRANS_N))),1,`dnl
585dnl
586ifelse(RSB_M4_SAME(transposition,RSB_M4_TRANS_C),1,`dnl
587	/* `For non complex types, hermitian defaults to plain transposition.' */
588	return RSB_M4_BCSS_KERNEL_FUNCTION(`ID',type,matrix_storage,RSB_M4_H2T_TRANSPOSITION(transposition),k_symmetry,rowsu,colsu,unrolling,mop,citype,k_diagonal,uplo)dnl
589(RSB_M4_ARGS_TO_ACTUAL_ARGS(RSB_M4_BCSS_KERNEL_FUNCTION(`ARGS',type,matrix_storage,RSB_M4_H2T_TRANSPOSITION(transposition),k_symmetry,rowsu,colsu,unrolling,mop,citype)));
590')dnl
591dnl
592ifelse(RSB_M4_SAME(transposition,RSB_M4_TRANS_T),1,`dnl
593	/* `This kernel performs the same as its transposed', transposition -> RSB_M4_TRANSPOSE_TRANSPOSITION(transposition). */
594	return RSB_M4_BCSS_KERNEL_FUNCTION(`ID',type,matrix_storage,RSB_M4_TRANSPOSE_TRANSPOSITION(transposition),k_symmetry,rowsu,colsu,unrolling,mop,citype,k_diagonal,uplo)dnl
595(RSB_M4_ARGS_TO_ACTUAL_ARGS(RSB_M4_BCSS_KERNEL_FUNCTION(`ARGS',type,matrix_storage,RSB_M4_TRANSPOSE_TRANSPOSITION(transposition),k_symmetry,rowsu,colsu,unrolling,mop,citype)));
596')dnl
597dnl
598',`dnl
599ifelse(RSB_M4_OR(RSB_M4_AND(RSB_M4_IS_COMPLEX_TYPE(type),RSB_M4_SAME(k_symmetry,`hNEVEROCCURINGFIXME'),RSB_M4_SAME(transposition,RSB_M4_TRANS_C)),RSB_M4_AND(RSB_M4_IS_COMPLEX_TYPE(type),RSB_M4_SAME(k_symmetry,`s'),RSB_M4_SAME(transposition,RSB_M4_TRANS_T))),1,`dnl
600dnl
601	/* `This kernel performs the same as its transposed', transposition -> RSB_M4_TRANSPOSE_TRANSPOSITION(transposition). */
602	return RSB_M4_BCSS_KERNEL_FUNCTION(`ID',type,matrix_storage,RSB_M4_TRANSPOSE_TRANSPOSITION(transposition),k_symmetry,rowsu,colsu,unrolling,mop,citype,k_diagonal,uplo)dnl
603(RSB_M4_ARGS_TO_ACTUAL_ARGS(RSB_M4_BCSS_KERNEL_FUNCTION(`ARGS',type,matrix_storage,RSB_M4_TRANSPOSE_TRANSPOSITION(transposition),k_symmetry,rowsu,colsu,unrolling,mop,citype)));
604dnl
605dnl ifelse(RSB_M4_SAME(transposition,RSB_M4_TRANS_C),1,`dnl
606dnl 	/*
607dnl 		The matrix is treated as symmetric hermitian.
608dnl 		FIXME: missing implementation.
609dnl 	*/
610dnl 	return RSB_ERR_UNIMPLEMENTED_YET;
611dnl ')dnl
612dnl
613',`dnl
614dnl
615ifelse(RSB_M4_AND(RSB_M4_NOT(RSB_M4_IS_COMPLEX_TYPE(type)),RSB_M4_IS_NOT_UNSYMMETRIC(k_symmetry),RSB_M4_NOT(RSB_M4_SAME(transposition,RSB_M4_TRANS_N))),1,`dnl
616dnl
617	/* Symmetric `transposed' reverts to symmetric `not transposed' */
618	return RSB_M4_BCSS_KERNEL_FUNCTION(`ID',type,matrix_storage,RSB_M4_TRANS_N,k_symmetry,rowsu,colsu,unrolling,mop,citype,k_diagonal,uplo)dnl
619(RSB_M4_ARGS_TO_ACTUAL_ARGS(RSB_M4_BCSS_KERNEL_FUNCTION(`ARGS',type,matrix_storage,RSB_M4_TRANS_N,k_symmetry,rowsu,colsu,unrolling,mop,citype,k_diagonal,uplo)));
620dnl
621',`dnl
622dnl
623dnl
624ifelse(unrolling,`l',/* FIXME : l-unrolled functions are broken */)dnl
625dnl
626dnl	BEGIN VARIABLES DECLARATIONS
627dnl
628ifelse(RSB_M4_IS_FORMAT_BCSS(matrix_storage),1,`dnl
629ifelse(RSB_M4_IS_SPMV_KERNEL_MOP(mop),`1',`dnl
630ifelse(RSB_M4_IS_NOT_UNSYMMETRIC(k_symmetry),1,`dnl
631	register rsb_coo_idx_t i=0,j=0;
632',`dnl
633	register rsb_coo_idx_t i=0;
634')dnl
635',`dnl
636	register rsb_coo_idx_t i=0,j=0;
637')dnl
638	register rsb_nnz_idx_t k=0;
639dnl
640ifelse(RSB_M4_NOT(RSB_M4_IS_SPMV_KERNEL_MOP(mop)),`1',`dnl
641ifelse(unrolling,`l',`dnl
642	const register rsb_coo_idx_t columns=cpntr[1]-cpntr[0];	/* we assume that block_count >= 1 */
643	const register rsb_coo_idx_t rows   =rpntr[1]-rpntr[0];	/* we assume that block_count >= 1 */
644',`dnl
645	const register rsb_coo_idx_t columns=b_columns,rows=b_rows;
646')dnl
647')dnl
648')dnl
649dnl
650ifelse(RSB_M4_IS_READONLY_KERNEL_MOP(mop),1,`dnl
651ifelse(RSB_M4_IS_SPMV_KERNEL_MOP(mop),`0',`dnl
652	const mtype *a=VA;
653')dnl
654')dnl
655ifelse(RSB_M4_IS_WRITEONLY_KERNEL_MOP(mop),1,`dnl
656	mtype *a=VA;
657')dnl
658dnl
659ifelse(RSB_M4_IS_RC_BIASED_KERNEL_MOP(mop),`0',`dnl
660ifelse(RSB_M4_IS_STRIDED_KERNEL_MOP(mop),`0',`dnl
661ifelse(RSB_M4_IS_SPSV_KERNEL_MOP(mop),`0',`dnl
662ifelse(RSB_M4_IS_SPMV_KERNEL_MOP(mop),`0',`dnl
663	const rsb_coo_idx_t incx=1,incy=1;`'
664')dnl
665')dnl
666')dnl
667')dnl
668dnl
669ifelse(RSB_M4_IS_OP_SCALING_KERNEL_MOP(mop),`1',`dnl
670	const mtype alpha=*alphap;`'dnl
671')dnl
672ifelse(RSB_M4_IS_SCALING_KERNEL_MOP(mop),`1',`dnl
673	const mtype beta=*betap;`'dnl
674')dnl
675dnl
676ifelse(RSB_M4_is_transposed_spmv,1,`dnl
677	const mtype *trhs = rhs+xstride*(roff-coff);`'
678	mtype *tout=out+ystride*(coff-roff);`'
679
680')dnl
681ifelse(RSB_M4_IS_SPXX_OP_SCALING_KERNEL_MOP(mop),1,`dnl
682')dnl
683dnl
684dnl
685dnl	END VARIABLES DECLARATIONS
686dnl
687dnl	BEGIN CONDITIONAL VECTOR SCALING
688dnl
689ifelse(should_init_out_vector_before_outer_loop,1,`dnl
690ifelse(RSB_M4_IS_SCALING_KERNEL_MOP(mop),1,`dnl
691ifelse(RSB_M4_IS_STRIDED_KERNEL_MOP(mop),1,`dnl
692	if(beta!=1)rsb__cblas_Xscal(RSB_M4_NUMERICAL_TYPE_PREPROCESSOR_SYMBOL(type),out_dim,&beta,out,ystride);
693',`dnl
694	rsb__cblas_Xscal(RSB_M4_NUMERICAL_TYPE_PREPROCESSOR_SYMBOL(type), out_dim,&beta, out, 1);
695') /* we scale the destination vector */
696')dnl
697ifelse(RSB_M4_IS_ZEROING_KERNEL_MOP(mop),1,`dnl
698	rsb__cblas_Xscal(RSB_M4_NUMERICAL_TYPE_PREPROCESSOR_SYMBOL(type),out_dim,NULL,out,ystride);
699')dnl
700')dnl
701dnl
702dnl	END CONDITIONAL VECTOR SCALING
703dnl
704dnl	BEGIN COMMON EXTERNAL LOOP BEGINNING
705dnl
706ifelse(RSB_M4_want_verbose_comments,`1',` /*	Outer loop. Occurs on the major dimension.	*/ ')dnl
707dnl
708ifelse(is_an_externally_backward_kernel,1,`
709	for(Mi=Mdim-1; RSB_LIKELY((Mi+1)>0 /*trick for unsigned indices */);--Mi) //RSB_M4_IS_SPSX_KERNEL_MOP(mop),RSB_M4_IS_FORMAT_COLUMN_MAJOR(matrix_storage),RSB_M4_NOT(RSB_M4_SAME(transposition,RSB_M4_TRANS_N))
710	{
711',`dnl
712dnl
713ifelse(RSB_M4_AND(RSB_M4_WANT_20110206_BOUNDED_BOX_PATCH,RSB_M4_NOT(RSB_M4_IS_SCALING_OR_ZEROING_KERNEL_MOP(mop))),1,`dnl
714dnl	really, the above condition should also check for transposition! but in this way it does no wrong.
715	for(Mi=br;RSB_LIKELY(Mi<bc);++Mi)	/* experimental, for the bounded box patch */
716',`dnl
717	for(Mi=0;RSB_LIKELY(Mi<Mdim);++Mi)
718')dnl
719dnl
720	{
721')dnl
722dnl
723ifelse(RSB_M4_want_verbose_comments,`1',`dnl
724		/* logically,  i is the working block row, j is the working block column */
725		/* physically, Mi is the working block row, mi is the working block column */
726')dnl
727dnl
728pushdef(`colsu',ifelse(unrolling,`l',columns,colsu))dnl
729pushdef(`rowsu',ifelse(unrolling,`l',rows,rowsu))dnl
730pushdef(`tcolsu',ifelse(transposition,RSB_M4_TRANS_T,rowsu,colsu))dnl
731pushdef(`trowsu',ifelse(transposition,RSB_M4_TRANS_T,colsu,rowsu))dnl
732dnl
733ifelse(RSB_M4_IS_SPXX_OP_SCALING_KERNEL_MOP(mop),1,`dnl
734pushdef(`postalphamult',`(alpha)*')dnl
735',`dnl
736dnl
737ifelse(RSB_M4_IS_SPMX_OP_NEGATING_KERNEL_MOP(mop),1,`dnl
738pushdef(`postalphamult',`(-1)*')dnl
739',`dnl
740pushdef(`postalphamult',`')dnl
741')dnl
742dnl
743')dnl
744dnl
745ifelse(RSB_M4_IS_SPMV_KERNEL_MOP(mop),`1',`dnl
746ifelse(transposed,`0',`dnl
747ifelse(RSB_M4_IS_NOT_UNSYMMETRIC(k_symmetry),0,`dnl
748		const mtype *a=VA;
749')dnl
750')dnl
751ifelse(RSB_M4_OR(RSB_M4_IS_NOT_UNSYMMETRIC(k_symmetry),RSB_M4_AND(RSB_M4_IS_UNSYMMETRIC(k_symmetry),RSB_M4_NOT(transposed))),1,`dnl
752		register mtype cacc = RSB_M4_ZERO(mtype);
753dnl		mtype *outi=out+(trowsu*i*ystride);
754',`dnl
755')dnl
756')dnl
757dnl
758ifelse(RSB_M4_IS_SPXX_KERNEL_MOP(mop),`1',`dnl
759ifelse(RSB_M4_is_transposed_spmv,1,`dnl
760		const mtype bt=postalphamult`'trhs[(tcolsu*xstride*(Mi))];
761dnl		const mtype *b = rhs+(tcolsu*bci);
762',`dnl
763dnl		const mtype bn = rhs[(tcolsu*xstride*(Mi))];	/*20120915: spurious instruction commented out*/
764')dnl
765')dnl
766		const rsb_nnz_idx_t fk=bpntr[Mi],lk=bpntr[Mi+1];
767dnl
768dnl
769dnl	END COMMON EXTERNAL LOOP BEGINNING
770dnl
771dnl	BEGIN EXTERNAL LOOP VECTOR SCALING
772dnl
773ifelse(RSB_NOT(RSB_M4_IS_ALLOWING_ALIASING_KERNEL_MOP(mop)),1,`
774ifelse(should_init_out_vector_before_outer_loop,0,`dnl
775ifelse(unrolling,`l',`
776ifelse(mop,`spmv_uxux',`dnl
777	rsb__cblas_Xscal(RSB_M4_NUMERICAL_TYPE_PREPROCESSOR_SYMBOL(type), b_rows,&beta, out+rows*i, 1);/* we scale the destination vector */
778')dnl
779ifelse(RSB_M4_IS_FORMAT_ROW_MAJOR(matrix_storage),1,`dnl
780ifelse(mop,`spmv_uauz',`dnl
781	rsb__cblas_Xscal(RSB_M4_NUMERICAL_TYPE_PREPROCESSOR_SYMBOL(type),b_rows,NULL,out+rows*i,ystride);
782')dnl
783')dnl
784dnl
785',`dnl
786dnl
787ifelse(RSB_M4_IS_ZEROING_KERNEL_MOP(mop),1,`dnl
788		forloop(`row',0,decr(trowsu),`out[trowsu*bri+row]=0;')
789')dnl
790ifelse(RSB_M4_IS_SCALING_KERNEL_MOP(mop),1,`dnl
791		forloop(`row',0,decr(trowsu),`out[trowsu*bri+row]*=beta;')
792')dnl
793')dnl
794')dnl
795')dnl
796dnl
797dnl
798ifelse(should_init_out_vector_before_outer_loop,0,`dnl
799ifelse(RSB_M4_IS_ZEROING_KERNEL_MOP(mop),1,`dnl
800		forloop(`row',0,decr(trowsu),`out[trowsu*bri+row]=0;')
801')dnl
802ifelse(RSB_M4_IS_SCALING_KERNEL_MOP(mop),1,`dnl
803		forloop(`row',0,decr(trowsu),`out[trowsu*bri+row]*=beta;')
804')dnl
805')dnl
806dnl
807dnl	END EXTERNAL LOOP VECTOR SCALING
808dnl
809ifelse(RSB_M4_want_verbose_comments,`1',` /*		Inner loop. Occurs on the minor dimension.	*/ ')dnl
810dnl
811dnl	BEGIN KERNELS DEFINITION
812dnl
813ifelse(RSB_M4_IS_SPMX_KERNEL_MOP(mop),`1',`dnl
814dnl
815dnl	BEGIN SPMV KERNEL DEF
816dnl		/* SPMV KERNEL BEGINS HERE */
817dnl
818ifelse(RSB_M4_IS_NOT_UNSYMMETRIC(k_symmetry),1----,`dnl
819ifelse(RSB_M4_IS_SPMX_KERNEL_MOP(mop),1,`dnl
820ifelse(RSB_M4_IS_DIAGONAL_IMPLICIT(k_diagonal),1,`',`dnl
821ifelse(RSB_M4_want_verbose_comments,`1',`dnl
822/*
823		Symmetric kernels should process the first block separately, if it contains `diagonal' elements.
824		FIXME : this is NOT the case for blocked code.
825*/
826')dnl
827		k=fk;
828		if(RSB_UNLIKELY(lk==k)) continue;/* nothing to do here */
829		mi=bindx[k];
830		if(mi==Mi && ((lk-k)>1) && roff==coff)	/* a `diagonal' element, and not the only one, on a diagonally positioned matrix */
831		{
832			const mtype *b = rhs+(tcolsu*bci);
833			mtype *c=out+(trowsu*bri);
834dnl			const mtype *b = rhs+(trowsu*bri);
835dnl			mtype *c=out+(tcolsu*bci);
836dnl
837dnl	/* FIXME : THIS IS AN EXAMPLE : SHOULD INTRODUCE DIAGONAL-SUBTRACTION CODELET */
838dnl
839{RSB_M4_EXTRA_SYMMETRIC_DIAGONAL_FIXING_KERNEL(`row',`rows',b_rows,`column',`columns',b_columns,mtype,,mop,unrolling,transposition,RSB_M4_SYMMETRY_SWITCH(k_symmetry))}
840		}
841')dnl
842')dnl
843')dnl
844dnl
845ifelse(RSB_M4_AND(RSB_M4_IS_SPMX_KERNEL_MOP(mop),RSB_M4_SAME(transposed,1)),1,`dnl
846ifelse(RSB_M4_want_verbose_comments,`1',`dnl
847dnl		/* `Since this is a transposed kernel, we apply a correction to the output vector locations.' */
848')dnl
849dnl		rhs=(rhs-coff*(xstride))+roff*(xstride); out=(out-roff*(ystride))+coff*(ystride);
850')dnl
851dnl
852dnl
853ifelse(RSB_M4_IS_UNSYMMETRIC(k_symmetry),1,`dnl
854ifelse(transposed,`0',`dnl
855dnl
856dnl	RSB_M4_EARLY_EVICT_INSTRUCTION((a+k,bindx+k))`'dnl
857dnl
858dnl RSB_M4_SIMPLE_LOOP_UNROLL_2S_J..
859RSB_M4_SIMPLE_LOOP_UNROLL_5S(`k',`LI',`fk',`lk',`dnl
860',`dnl
861dnl
862			`const rsb_coo_idx_t' `j_'``''LI`'=bindx[k+LI];
863			`const mtype b_'``''LI`'=rhs[tcolsu*(`j_'``''LI`')*xstride];
864			`const mtype a_'``''LI`'=a[k+LI];
865dnl
866',`dnl
867',`dnl
868dnl			cacc+=a[k+LI]*b_``''LI;
869dnl			cacc+=a_``''LI*b_``''LI;
870			``cacc+=a_''``''LI``*b_''``''LI;
871',`dnl RSB_M4_EARLY_EVICT_INSTRUCTION((a+k,bindx+k))`'dnl
872',RSB_M4_SIMPLE_LOOP_UNROLL_DEFAULT_FACTOR_SMALL)
873dnl
874dnl	RSB_M4_EARLY_EVICT_INSTRUCTION((a+k,bindx+k))`'dnl
875dnl	RSB_M4_EARLY_EVICT_INSTRUCTION((outi+k-12))`'dnl
876dnl
877')dnl
878')dnl
879dnl
880dnl
881ifelse(RSB_M4_IS_UNSYMMETRIC(k_symmetry),1,`dnl
882ifelse(transposed,`1',`dnl
883dnl
884RSB_M4_SIMPLE_LOOP_UNROLL_2S_J(`k',`LI',`fk',`lk',`dnl
885dnl
886			`const rsb_coo_idx_t' `j_'``''LI`'=bindx[k+LI];
887			`const mtype a_'``''LI`'=RSB_M4_CONJ(VA[k+LI],mtype,transposition,k_symmetry);
888			`mtype c_'``''LI`'=a_``''LI*bt;
889dnl
890',`dnl
891			tout[(tcolsu)*(`j_'``''LI`')*ystride]+=`c_'``''LI`';
892',RSB_M4_SIMPLE_LOOP_UNROLL_DEFAULT_FACTOR_SMALL)
893dnl
894dnl
895')dnl
896')dnl
897dnl
898ifelse(RSB_M4_IS_NOT_UNSYMMETRIC(k_symmetry),1,`dnl
899ifelse(k_symmetry,RSB_M4_SYMBOL_HERMITIAN,`dnl
900ifelse(RSB_M4_SAME(transposition,RSB_M4_TRANS_C),1,`dnl
901pushdef(`ntransposition',transposition)dnl
902pushdef(`ttransposition',RSB_M4_TRANSPOSE_TRANSPOSITION(transposition))dnl
903')dnl
904ifelse(RSB_M4_SAME(transposition,RSB_M4_TRANS_T),1,`dnl
905pushdef(`ntransposition',transposition)dnl
906pushdef(`ttransposition',RSB_M4_TRANS_C)dnl
907')dnl
908ifelse(RSB_M4_SAME(transposition,RSB_M4_TRANS_N),1,`dnl
909pushdef(`ntransposition',RSB_M4_TRANS_C)dnl
910pushdef(`ttransposition',transposition)dnl
911')dnl
912',`dnl
913ifelse(RSB_M4_SAME(transposition,RSB_M4_TRANS_C),1,`dnl
914pushdef(`ntransposition',transposition)dnl
915pushdef(`ttransposition',transposition)dnl
916',`dnl
917pushdef(`ntransposition',RSB_M4_TRANSPOSE_TRANSPOSITION(transposition))dnl
918pushdef(`ttransposition',RSB_M4_TRANSPOSE_TRANSPOSITION(transposition))dnl
919')dnl
920')dnl
921dnl			// nt: ntransposition ttransposition
922			k=fk;
923			if(k==lk)continue;
924			j=bindx[k];
925			cacc += RSB_M4_CONJ(VA[k],mtype,ntransposition,k_symmetry)*rhs[tcolsu*j*xstride];
926			if(roff!=coff || (j!=i))
927				tout[(tcolsu)*(j)*ystride]+=RSB_M4_CONJ(VA[k],mtype,ttransposition,k_symmetry)*bt;
928			++k;
929dnl RSB_M4_SIMPLE_LOOP_UNROLL_2S..
930RSB_M4_SIMPLE_LOOP_UNROLL_2S_J(`k',`LI',`fk+1',`lk-1',`dnl
931dnl
932			`const rsb_coo_idx_t' `j_'``''LI`'=bindx[k+LI];
933			`const mtype b_'``''LI`'=rhs[tcolsu*(`j_'``''LI`')*xstride];
934			`const mtype a_'``''LI`'=VA[k+LI];
935			`mtype c_'``''LI`'=RSB_M4_CONJ_SYM(mtype,ttransposition,k_symmetry)( `a_'``''LI)*bt;
936dnl			`mtype c_'``''LI`'=RSB_M4_CONJ(( `a_'``''LI),mtype,transposition,k_symmetry) *bt ;
937dnl
938',`dnl
939			cacc += RSB_M4_CONJ_SYM(mtype,ntransposition,k_symmetry)(`a_'``''LI)*b_``''LI;
940			tout[(tcolsu)*(`j_'``''LI`')*ystride]+=`c_'``''LI`';
941',RSB_M4_SIMPLE_LOOP_UNROLL_DEFAULT_FACTOR_SMALL)
942			if(k<lk)
943			{
944				j=bindx[k];
945				cacc += RSB_M4_CONJ(VA[k],mtype,ntransposition,k_symmetry)*rhs[trowsu*j*xstride];
946				if(roff!=coff || (j!=i))
947					tout[(tcolsu)*(j)*ystride]+=RSB_M4_CONJ(VA[k],mtype,ttransposition,k_symmetry)*bt;
948				++k;
949			}
950popdef(`ntransposition')dnl
951popdef(`ttransposition')dnl
952dnl
953')dnl
954dnl
955ifelse(RSB_M4_should_merge_value_after_inner_loop,`1',`dnl
956dnl			outi[0]+=postalphamult`cacc';
957			out[(trowsu*i*ystride)]+=postalphamult`cacc';
958')dnl
959dnl
960dnl		}
961dnl
962dnl
963dnl
964dnl	FIXME : this code is only a quick hack for CSR!
965dnl
966dnl
967dnl		/* SPMV KERNEL ENDS HERE */
968popdef(`postalphamult')dnl
969dnl	END SPMV KERNEL DEF
970')dnl
971dnl
972ifelse(RSB_M4_IS_SPSX_KERNEL_MOP(mop),`1',`dnl
973dnl	BEGIN SPSV KERNEL DEF
974dnl	/* SPSV KERNEL BEGINS HERE */
975dnl
976ifelse(RSB_M4_IS_SPSX_KERNEL_MOP(mop),1,`dnl
977dnl		const mtype bb_0=rhs[(trowsu*bri)];
978ifelse(is_diag_d_spsv_kernel,1,`',`dnl
979ifelse(RSB_M4_OR(RSB_M4_IS_SPSX_OP_SCALING_KERNEL_MOP(mop),RSB_M4_IS_SPSX_OP_SETTING_KERNEL_MOP(mop)),1,`dnl
980		const mtype bb_0=rhs[(trowsu*Mi*extra_xstride)];
981')dnl
982')dnl
983		mtype ax_0;
984dnl
985ifelse(is_diag_d_spsv_kernel,1,`dnl
986dnl
987dnl	FIXME: missing incx, incy support here!
988dnl
989ifelse(RSB_M4_IS_DIAGONAL_IMPLICIT(k_diagonal),1,`dnl
990		const mtype aa=1;
991',`dnl
992		const mtype aa=VA[ifelse(uplo,`u',`fk',`lk-1')];
993ifelse(RSB_M4_WANT_SPSM_DIAG_CHECK(),1,`dnl
994		if(aa == RSB_M4_ZERO(mtype))return RSB_ERR_INVALID_NUMERICAL_DATA;
995')dnl
996')dnl
997dnl
998
999ifelse(RSB_M4_IS_SPSX_OP_SCALING_KERNEL_MOP(mop),1,`dnl
1000dnl
1001dnl		out[tcolsu*bci]/=RSB_M4_CONJ(VA[bpntr[Mi+1]-1],mtype,transposition,k_symmetry);
1002dnl
1003',`dnl
1004dnl		out[tcolsu*bci]/=RSB_M4_CONJ(VA[bpntr[Mi+1]-1],mtype,transposition,k_symmetry);
1005dnl
1006')dnl
1007dnl
1008		out[tcolsu*bci]/=aa;
1009dnl
1010')dnl
1011dnl
1012ifelse(is_zero_acc_spsv_kernel,1,`dnl
1013		ax_0=0;
1014',`dnl
1015		ax_0=out[tcolsu*bci];
1016')dnl
1017dnl
1018dnl
1019')dnl
1020dnl
1021ifelse(RSB_M4_IS_SPSX_KERNEL_MOP(mop),`1',`dnl
1022pushdef(`skip_head_row_elements',ifelse(RSB_M4_IS_DIAGONAL_IMPLICIT(k_diagonal),`1',`0',ifelse(uplo,`u',`1',`0')))dnl
1023pushdef(`skip_tail_row_elements',ifelse(RSB_M4_IS_DIAGONAL_IMPLICIT(k_diagonal),`1',`0',ifelse(uplo,`u',`0',`1')))dnl
1024',`dnl
1025pushdef(`skip_head_row_elements',0)dnl
1026pushdef(`skip_tail_row_elements',0)dnl
1027')dnl
1028dnl
1029ifelse(is_a_backward_kernel,1,`
1030dnl
dnl	FIXME : backward kernels are only used for SPSV, and they start with one element less
1032dnl
1033		for(k=lk-1-skip_tail_row_elements`'dnl
1034,a=VA+k,mi=bindx[k];k+1>=fk+1+skip_head_row_elements  ;--k,block_forward,mi=bindx[k])
1035dnl	/* k is the index of the block */
1036',`dnl
1037		ifelse(skip_head_row_elements,1,block_forward;)
1038		for(k=fk+skip_head_row_elements,mi=bindx[k];k<lk-skip_tail_row_elements  ;++k,block_forward,mi=bindx[k])
1039dnl	/* k is the index of the block */
1040')dnl
1041		{
1042ifelse(RSB_M4_SAME(transposition,RSB_M4_TRANS_N),1,`dnl
1043			const mtype *b=out + (tcolsu*bci);
1044			mtype *c=&ax_0;
1045')dnl
1046dnl
1047dnl	Fixed for Hermitian k_symmetry.
1048dnl
1049ifelse(is_diag_d_spsv_kernel,1,`dnl
1050		out[trowsu*bri]-=RSB_M4_CONJ(*a,mtype,transposition,k_symmetry)*ax_0;
1051',`dnl
1052{RSB_M4_KERNEL_FUNCTION_BODY(`row',`rows',b_rows,`column',`columns',b_columns,mtype,,mop,unrolling,RSB_M4_SYMBOL_UNSYMMETRIC)}
1053')dnl
1054dnl
1055		}
1056dnl
1057ifelse(is_diag_d_spsv_kernel,1,`dnl
1058ifelse(RSB_M4_IS_SPSX_OP_SCALING_KERNEL_MOP(mop),1,`dnl
1059		out[tcolsu*bci]*=alpha;
1060')dnl
1061')dnl
1062dnl
1063ifelse(is_diag_d_spsv_kernel,1,`',`dnl
1064ifelse(RSB_M4_IS_DIAGONAL_IMPLICIT(k_diagonal),1,`',`dnl
1065		if(lk-fk>0)
1066dnl	/* if this row block was not empty */
1067')dnl
1068		{
1069			/* `the last element (which for a lower triangular solve is on the diagonal')*/
1070dnl			block_backward;
1071			/* Lx=y ; x_0=y_0/L_1_1  */
1072			mtype *c_0=out+(trowsu*bri);
1073ifelse(RSB_M4_IS_DIAGONAL_IMPLICIT(k_diagonal),1,`dnl
1074			const mtype aa=1;
1075',`dnl
1076dnl			elements on the diagonal are real, and no conjugation is needed
1077			const mtype aa=VA[ifelse(uplo,`u',`fk',`lk-1')];
1078ifelse(RSB_M4_WANT_SPSM_DIAG_CHECK(),1,`dnl
1079		if(aa == RSB_M4_ZERO(mtype))return RSB_ERR_INVALID_NUMERICAL_DATA;
1080')dnl
1081')dnl
1082dnl
1083dnl
1084ifelse(RSB_M4_IS_SPSX_OP_SCALING_KERNEL_MOP(mop),1,`dnl
1085			*c_0 =(alpha*bb_0 - ax_0)/aa;	/* ax_0 + *a * *c_0=bb_0 -> (*c_0)=(bb_0 - ax_0 )/(*a) */
1086')dnl
1087ifelse(RSB_M4_IS_SPSX_OP_SETTING_KERNEL_MOP(mop),1,`dnl
1088			*c_0=(bb_0 - ax_0)/aa;	/* ax_0 + *a * *c_0=bb_0 -> (*c_0)=(bb_0 - ax_0 )/(*a) */
1089')dnl
1090dnl
1091ifelse(RSB_M4_IS_DIAGONAL_IMPLICIT(k_diagonal),1,`',`dnl
1092			block_forward;
1093')dnl
1094		}
1095')dnl
1096dnl
1097popdef(`skip_head_row_elements')dnl
1098popdef(`skip_tail_row_elements')dnl
1099dnl
1100dnl
1101dnl		/* SPSV KERNEL ENDS HERE */
1102dnl	END SPSV KERNEL DEF
1103')dnl
1104dnl
1105ifelse(RSB_M4_NOT(RSB_M4_IS_SPXX_KERNEL_MOP(mop)),`1',`dnl
1106dnl	BEGIN MISC KERNEL DEF
1107dnl
1108 		/* touppercase(mop) KERNEL HERE */
1109dnl		for(k=fk,mi=bindx[k];k<lk;++k,block_forward,mi=bindx[k]) 20120915 /*buggy loop */
1110		for(k=fk;k<lk;++k,block_forward)
1111		{
1112		mi=bindx[k];
1113		{
1114ifelse(mop,`scale',`dnl
1115			/*a=VA+indptr[(k)];*/
1116			const mtype *d=scale_factors+(trowsu*bri);
1117')dnl
1118ifelse(mop,`negation',`dnl
1119			/*a=VA+indptr[k];*/
1120')dnl
1121ifelse(RSB_M4_IS_ACC_WRITING_KERNEL_MOP(mop),`1',`dnl
1122			/*a=VA+indptr[k];*/
1123			mtype *local_row_sums = row_sums+(trowsu*bri);
1124')dnl
1125dnl {RSB_M4_KERNEL_FUNCTION_BODY(`row',`rows',b_rows,`column',`columns',b_columns,mtype,,mop,unrolling,RSB_M4_SYMBOL_UNSYMMETRIC)}
1126{RSB_M4_KERNEL_FUNCTION_BODY(`row',`rows',b_rows,`column',`columns',b_columns,mtype,,mop,unrolling,k_symmetry)}
1127		}
1128		}
1129dnl
1130dnl	END MISC KERNEL DEF
1131')dnl
1132dnl
1133dnl	END KERNELS DEFINITION
1134dnl
1135dnl	BEGIN COMMON EXTERNAL LOOP CLOSING
1136	}
1137dnl	END COMMON EXTERNAL LOOP CLOSING
1138dnl
1139dnl ifelse(RSB_M4_IS_STRIDED_KERNEL_MOP(mop),`1',dnl
dnl	`incx--;incy--;/* we are interested in the increment of 1 */
1141dnl ')dnl
1142dnl
1143dnl
1144ifelse(RSB_M4_IS_FORMAT_BCSS(matrix_storage),1,`dnl
1145dnl	this check would be good for no-looped functions only!
1146dnl	if(columns != b_columns || rows != b_rows)return RSB_ERR_BADARGS; /* a non comprehensive check of course*/
1147
1148dnl	FIXME : ONLY EXPERIMENTAL OPENMP SUPPORT
1149dnl
1150dnl
1151ifelse(RSB_M4_WANT_OMP_IN_KERNELS,`1',`dnl
1152	size_t tn;
1153	size_t nt;
1154`#'dnl
1155       pragma omp parallel num_threads(rsb_global_session_handle.rsb_g_threads) private(mi,Mi,k,tn,nt)
1156	{
1157	tn = omp_get_thread_num();
1158	nt = omp_get_num_threads();
1159	/*RSB_INFO("working on %d / %d threads\n",tn,nt);*/
1160	//for(Mi=tn;Mi<Mdim;Mi+=nt)
1161	size_t ui=((Mdim/nt)*(tn+1));
1162	size_t li=(Mdim/nt)*tn;
1163	if(ui>Mdim)ui=Mdim;
1164dnl	#pragma omp for schedule(static,1)		/* shared L1 cache */
1165#pragma omp for schedule(static,(Mdim+1)/2)		/* separate L1 caches */
1166	for(Mi=li;RSB_LIKELY(Mi<ui);++Mi)
1167	{
1168	//RSB_INFO("row %d working on %d / %d threads\n",mi,tn,nt);
1169',`dnl
1170dnl
1171')dnl
1172dnl ifelse(RSB_M4_IS_FORMAT_ROW_MAJOR(matrix_storage),1,`dnl
1173dnl 		/* should zero output block here (for efficiency) instead of function top */
1174dnl ')dnl
1175dnl
1176dnl		FIXME: the following is NEW, and useful also for SYMMETRIC
1177dnl		/* transpose.. is transposed */
1178dnl		/* useless for storage matrix_storage */
1179dnl		/*if(bpntr[Mi]==bpntr[Mi+1])continue;*/ /* empty  */
1180ifelse(mop,`spmv_uauz',`dnl
1181dnl		mtype *c=out+(rowsu*mi); /* declaration of c put here for experimental purposes */
1182')dnl
1183dnl
1184dnl
1185dnl	FIXME : blocked TRS kernels are broken, in this way
1186dnl
1187dnl			mi=bindx[k];
1188dnl			/* `mop' is mop */
1189dnl
1190dnl
1191dnl
1192dnl
1193dnl
1194popdef(`is_diag_d_spsv_kernel')dnl
1195popdef(`tcolsu')dnl
1196popdef(`trowsu')dnl
1197popdef(`colsu')dnl
1198popdef(`rowsu')dnl
1199popdef(`transposed')dnl 1/2
1200dnl popdef(`transposed')dnl 2/2
1201popdef(`should_init_out_vector_before_outer_loop')dnl
1202popdef(`total_block_columns')dnl
1203popdef(`total_block_rows')dnl
1204popdef(`total_rows')dnl
1205popdef(`total_columns')dnl
1206dnl
1207dnl
1208ifelse(RSB_M4_WANT_OMP_IN_KERNELS,`1',`dnl
1209	}
1210')dnl
1211popdef(`mi')dnl
1212popdef(`Mi')dnl
1213popdef(`brit')dnl
1214popdef(`bcit')dnl
1215popdef(`brin')dnl
1216popdef(`bcin')dnl
1217popdef(`bri')dnl
1218popdef(`bci')dnl
1219')dnl
1220dnl
1221	return RSB_ERR_NO_ERROR;
1222dnl
1223')dnl
1224')dnl
1225dnl
1226')')dnl
1227dnl
1228dnl
1229popdef(`skip_implementation')dnl
1230popdef(`out_dim')dnl
1231popdef(`is_a_backward_kernel')dnl
1232popdef(`is_an_externally_backward_kernel')dnl
1233popdef(`is_zero_acc_spsv_kernel')dnl
1234popdef(`block_forward')dnl
1235popdef(`block_backward')dnl
1236popdef(`extra_xstride')dnl
1237popdef(`extra_ystride')dnl
1238}
1239dnl
1240')dnl
1241dnl
1242')dnl
1243dnl
1244popdef(`uplo')dnl
1245popdef(`want_what')dnl
1246popdef(`k_diagonal')dnl
1247popdef(`citype')dnl
1248popdef(`mop')dnl
1249popdef(`matrix_storage')dnl
1250popdef(`k_symmetry')dnl
1251popdef(`transposition')dnl
1252popdef(`mtype')dnl
1253popdef(`itype')dnl
1254popdef(`unrolling')dnl
1255')dnl
1256dnl
1257dnl
define(`RSB_M4_BCSS_MISC_KERNELS',`dnl
dnl
dnl	RSB_M4_BCSS_MISC_KERNELS(unrollings)
dnl	------------------------------------
dnl	Emits the fixed block size BCSS kernel functions and their size
dnl	dispatchers for all matrix operations which are neither SPMV nor
dnl	SPSV, over every combination of numerical type, storage format,
dnl	unrolling, block size, symmetry, transposition, coordinate index
dnl	type, diagonal flavour, and triangle (uplo).
dnl
dnl	$1: the list of unrolling flavours to instantiate.
dnl
pushdef(`unrollings',$1)dnl
dnl
dnl	FIXED BLOCK SIZE KERNELS :
dnl
foreach(`type',RSB_M4_MATRIX_TYPES,`dnl
foreach(`mop',RSB_M4_MATRIX_OPS,`dnl
foreach(`matrix_storage',RSB_M4_BCSS_FORMATS,`dnl
foreach(`unrolling',unrollings,`dnl
dnl	The concatenation of the two predicates below expands to 00 only
dnl	when mop is neither an SPSV nor an SPMV operation; only such
dnl	miscellaneous operations are generated by this macro.
ifelse(RSB_M4_IS_SPSV_KERNEL_MOP(mop)RSB_M4_IS_SPMV_KERNEL_MOP(mop),00,`dnl
foreach(`rowsu',RSB_M4_ROWS_UNROLL,`dnl
foreach(`colsu',RSB_M4_COLUMNS_UNROLL,`dnl
foreach(`k_symmetry',RSB_M4_MATRIX_SYMMETRY,`dnl
foreach(`transposition',RSB_M4_MATRIX_TRANSPOSITIONS,`dnl
foreach(`citype',RSB_M4_MATRIX_COORDINATE_TYPES,`dnl
foreach(`k_diagonal',RSB_M4_MATRIX_DIAGONAL_TYPES,`dnl
foreach(`uplo',RSB_M4_MATRIX_UPLO_TYPES,`dnl
RSB_M4_BCSS_KERNEL_FUNCTION(`all',type,matrix_storage,transposition,k_symmetry,rowsu,colsu,unrolling,mop,citype,k_diagonal,uplo)
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
dnl
dnl	FIXED BLOCK SIZE DISPATCHERS :
dnl
dnl	Dispatchers are emitted once per (type,mop,format,...) combination,
dnl	without looping over the individual block sizes (rowsu/colsu).
foreach(`type',RSB_M4_MATRIX_TYPES,`dnl
foreach(`mop',RSB_M4_MATRIX_OPS,`dnl
ifelse(RSB_M4_IS_SPSV_KERNEL_MOP(mop)RSB_M4_IS_SPMV_KERNEL_MOP(mop),00,`dnl
foreach(`matrix_storage',RSB_M4_BCSS_FORMATS,`dnl
foreach(`unrolling',unrollings,`dnl
foreach(`k_symmetry',RSB_M4_MATRIX_SYMMETRY,`dnl
foreach(`transposition',RSB_M4_MATRIX_TRANSPOSITIONS,`dnl
foreach(`citype',RSB_M4_MATRIX_COORDINATE_TYPES,`dnl
foreach(`k_diagonal',RSB_M4_MATRIX_DIAGONAL_TYPES,`dnl
foreach(`uplo',RSB_M4_MATRIX_UPLO_TYPES,`dnl
RSB_M4_BCSS_KERNEL_SIZE_DISPATCH_FUNCTION(`all',type,matrix_storage,transposition,k_symmetry,unrolling,,,mop,citype,k_diagonal,uplo)
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
dnl
dnl
popdef(`unrollings')dnl
dnl
')dnl
1318dnl
1319dnl
1320dnl
1321dnl
define(`RSB_M4_BCSS_SPMV_KERNELS',`dnl
dnl
dnl	RSB_M4_BCSS_SPMV_KERNELS(unrollings)
dnl	------------------------------------
dnl	Emits the fixed block size BCSS SPMV (sparse matrix by vector
dnl	multiply) kernel functions and their size dispatchers, over every
dnl	combination of numerical type, storage format, unrolling, block
dnl	size, symmetry, transposition, coordinate index type, diagonal
dnl	flavour, and triangle (uplo).
dnl
dnl	$1: the list of unrolling flavours to instantiate.
dnl
pushdef(`unrollings',$1)dnl
dnl
dnl	FIXED BLOCK SIZE KERNELS :
dnl
foreach(`type',RSB_M4_MATRIX_TYPES,`dnl
foreach(`mop',RSB_M4_MATRIX_OPS,`dnl
foreach(`matrix_storage',RSB_M4_BCSS_FORMATS,`dnl
foreach(`unrolling',unrollings,`dnl
dnl	Only SPMV operations are generated by this macro.
ifelse(RSB_M4_IS_SPMV_KERNEL_MOP(mop),1,`dnl
foreach(`k_diagonal',RSB_M4_MATRIX_DIAGONAL_TYPES,`dnl
foreach(`rowsu',RSB_M4_ROWS_UNROLL,`dnl
foreach(`colsu',RSB_M4_COLUMNS_UNROLL,`dnl
foreach(`k_symmetry',RSB_M4_MATRIX_SYMMETRY,`dnl
foreach(`transposition',RSB_M4_MATRIX_TRANSPOSITIONS,`dnl
foreach(`citype',RSB_M4_MATRIX_COORDINATE_TYPES,`dnl
foreach(`uplo',RSB_M4_MATRIX_UPLO_TYPES,`dnl
RSB_M4_BCSS_KERNEL_FUNCTION(`all',type,matrix_storage,transposition,k_symmetry,rowsu,colsu,unrolling,mop,citype,k_diagonal,uplo)
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
dnl
dnl	FIXED BLOCK SIZE DISPATCHERS :
dnl
dnl	Dispatchers are emitted once per (type,mop,format,...) combination,
dnl	without looping over the individual block sizes (rowsu/colsu).
foreach(`type',RSB_M4_MATRIX_TYPES,`dnl
foreach(`mop',RSB_M4_MATRIX_OPS,`dnl
ifelse(RSB_M4_IS_SPMV_KERNEL_MOP(mop),1,`dnl
foreach(`matrix_storage',RSB_M4_BCSS_FORMATS,`dnl
foreach(`unrolling',unrollings,`dnl
foreach(`k_symmetry',RSB_M4_MATRIX_SYMMETRY,`dnl
foreach(`transposition',RSB_M4_MATRIX_TRANSPOSITIONS,`dnl
foreach(`citype',RSB_M4_MATRIX_COORDINATE_TYPES,`dnl
foreach(`k_diagonal',RSB_M4_MATRIX_DIAGONAL_TYPES,`dnl
foreach(`uplo',RSB_M4_MATRIX_UPLO_TYPES,`dnl
RSB_M4_BCSS_KERNEL_SIZE_DISPATCH_FUNCTION(`all',type,matrix_storage,transposition,k_symmetry,unrolling,,,mop,citype,k_diagonal,uplo)
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
dnl
dnl
popdef(`unrollings')dnl
dnl
')dnl
1382dnl
1383dnl
1384dnl
1385dnl
define(`RSB_M4_BCSS_SPSV_KERNELS',`dnl
dnl
dnl	RSB_M4_BCSS_SPSV_KERNELS(unrollings)
dnl	------------------------------------
dnl	Emits the fixed block size BCSS SPSV (sparse triangular solve)
dnl	kernel functions and their size dispatchers, over every combination
dnl	of numerical type, storage format, unrolling, block size, symmetry,
dnl	transposition, coordinate index type, diagonal flavour, and
dnl	triangle (uplo).
dnl
dnl	$1: the list of unrolling flavours to instantiate.
dnl
pushdef(`unrollings',$1)dnl
dnl
dnl	FIXED BLOCK SIZE KERNELS :
dnl
foreach(`type',RSB_M4_MATRIX_TYPES,`dnl
foreach(`mop',RSB_M4_MATRIX_OPS,`dnl
dnl	Only SPSV operations are generated by this macro.
ifelse(RSB_M4_IS_SPSV_KERNEL_MOP(mop),1,`dnl
foreach(`k_diagonal',RSB_M4_MATRIX_DIAGONAL_TYPES,`dnl
foreach(`matrix_storage',RSB_M4_BCSS_FORMATS,`dnl
foreach(`unrolling',unrollings,`dnl
foreach(`rowsu',RSB_M4_ROWS_UNROLL,`dnl
foreach(`colsu',RSB_M4_COLUMNS_UNROLL,`dnl
foreach(`k_symmetry',RSB_M4_MATRIX_SYMMETRY,`dnl
foreach(`transposition',RSB_M4_MATRIX_TRANSPOSITIONS,`dnl
foreach(`citype',RSB_M4_MATRIX_COORDINATE_TYPES,`dnl
foreach(`uplo',RSB_M4_MATRIX_UPLO_TYPES,`dnl
RSB_M4_BCSS_KERNEL_FUNCTION(`all',type,matrix_storage,transposition,k_symmetry,rowsu,colsu,unrolling,mop,citype,k_diagonal,uplo)
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
dnl
dnl	FIXED BLOCK SIZE DISPATCHERS :
dnl
dnl	Dispatchers are emitted once per (type,mop,format,...) combination,
dnl	without looping over the individual block sizes (rowsu/colsu).
foreach(`type',RSB_M4_MATRIX_TYPES,`dnl
foreach(`mop',RSB_M4_MATRIX_OPS,`dnl
ifelse(RSB_M4_IS_SPSV_KERNEL_MOP(mop),1,`dnl
foreach(`matrix_storage',RSB_M4_BCSS_FORMATS,`dnl
foreach(`unrolling',unrollings,`dnl
foreach(`k_symmetry',RSB_M4_MATRIX_SYMMETRY,`dnl
foreach(`transposition',RSB_M4_MATRIX_TRANSPOSITIONS,`dnl
foreach(`citype',RSB_M4_MATRIX_COORDINATE_TYPES,`dnl
foreach(`k_diagonal',RSB_M4_MATRIX_DIAGONAL_TYPES,`dnl
foreach(`uplo',RSB_M4_MATRIX_UPLO_TYPES,`dnl
RSB_M4_BCSS_KERNEL_SIZE_DISPATCH_FUNCTION(`all',type,matrix_storage,transposition,k_symmetry,unrolling,,,mop,citype,k_diagonal,uplo)
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
dnl
dnl
popdef(`unrollings')dnl
dnl
')dnl
1446dnl
1447dnl
1448dnl
1449dnl
1450dnl
define(`RSB_M4_BCSS_KERNELS',`dnl
dnl
dnl	RSB_M4_BCSS_KERNELS(unrollings)
dnl	-------------------------------
dnl	Emits the fixed block size BCSS kernel functions and their size
dnl	dispatchers for ALL matrix operations (no operation filter, unlike
dnl	the MISC/SPMV/SPSV variants above), over every combination of
dnl	numerical type, storage format, unrolling, block size, symmetry,
dnl	transposition, coordinate index type, diagonal flavour, and
dnl	triangle (uplo).
dnl
dnl	$1: the list of unrolling flavours to instantiate.
dnl
pushdef(`unrollings',$1)dnl
dnl
dnl	FIXED BLOCK SIZE KERNELS :
dnl
foreach(`type',RSB_M4_MATRIX_TYPES,`dnl
foreach(`mop',RSB_M4_MATRIX_OPS,`dnl
foreach(`matrix_storage',RSB_M4_BCSS_FORMATS,`dnl
foreach(`k_diagonal',RSB_M4_MATRIX_DIAGONAL_TYPES,`dnl
foreach(`unrolling',unrollings,`dnl
foreach(`rowsu',RSB_M4_ROWS_UNROLL,`dnl
foreach(`colsu',RSB_M4_COLUMNS_UNROLL,`dnl
foreach(`k_symmetry',RSB_M4_MATRIX_SYMMETRY,`dnl
foreach(`transposition',RSB_M4_MATRIX_TRANSPOSITIONS,`dnl
foreach(`citype',RSB_M4_MATRIX_COORDINATE_TYPES,`dnl
foreach(`uplo',RSB_M4_MATRIX_UPLO_TYPES,`dnl
RSB_M4_BCSS_KERNEL_FUNCTION(`all',type,matrix_storage,transposition,k_symmetry,rowsu,colsu,unrolling,mop,citype,k_diagonal,uplo)
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
dnl
dnl	FIXED BLOCK SIZE DISPATCHERS :
dnl
dnl	Dispatchers are emitted once per (type,mop,format,...) combination,
dnl	without looping over the individual block sizes (rowsu/colsu).
foreach(`type',RSB_M4_MATRIX_TYPES,`dnl
foreach(`mop',RSB_M4_MATRIX_OPS,`dnl
foreach(`k_diagonal',RSB_M4_MATRIX_DIAGONAL_TYPES,`dnl
foreach(`matrix_storage',RSB_M4_BCSS_FORMATS,`dnl
foreach(`unrolling',unrollings,`dnl
foreach(`k_symmetry',RSB_M4_MATRIX_SYMMETRY,`dnl
foreach(`transposition',RSB_M4_MATRIX_TRANSPOSITIONS,`dnl
foreach(`citype',RSB_M4_MATRIX_COORDINATE_TYPES,`dnl
foreach(`uplo',RSB_M4_MATRIX_UPLO_TYPES,`dnl
RSB_M4_BCSS_KERNEL_SIZE_DISPATCH_FUNCTION(`all',type,matrix_storage,transposition,k_symmetry,unrolling,,,mop,citype,k_diagonal,uplo)
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
')dnl
dnl
dnl
popdef(`unrollings')dnl
dnl
')dnl
1507dnl
1508dnl
1509dnl
1510