1 /*
2 
3    BLIS
4    An object-based framework for developing high-performance BLAS-like
5    libraries.
6 
7    Copyright (C) 2019, Advanced Micro Devices, Inc.
8 
9    Redistribution and use in source and binary forms, with or without
10    modification, are permitted provided that the following conditions are
11    met:
12     - Redistributions of source code must retain the above copyright
13       notice, this list of conditions and the following disclaimer.
14     - Redistributions in binary form must reproduce the above copyright
15       notice, this list of conditions and the following disclaimer in the
16       documentation and/or other materials provided with the distribution.
17     - Neither the name(s) of the copyright holder(s) nor the names of its
18       contributors may be used to endorse or promote products derived
19       from this software without specific prior written permission.
20 
21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 
33 */
34 
35 #include "blis.h"
36 
37 //
38 // -- Row storage case ---------------------------------------------------------
39 //
40 
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname, arch, suf ) \
\
void PASTEMAC3(ch,opname,arch,suf) \
     ( \
       conj_t              conja, \
       conj_t              conjb, \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype*     restrict alpha, \
       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
       ctype*     restrict beta, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     ) \
{ \
	/* Reference kernel that walks c one row at a time. For each element
	   (i,j) of c it accumulates, over l = 0..k-1, the product of
	   a[ i*rs_a + l*cs_a ] and b[ l*rs_b + j*cs_b ], using the scalar
	   macro selected by conja/conjb, and then merges the alpha-scaled
	   accumulator into c[ i*rs_c + j*cs_c ] according to beta.
	   The data and cntx arguments are not referenced.
	   NOTE: m, n, and k are ordinary loop bounds, so this microkernel can
	   handle arbitrarily large values of them. */ \
\
	/* Classify the requested conjugation once. The combinations are tested
	   in a fixed order and anything not matched by the first three tests
	   is treated as the conj(a), conj(b) case.
	     0: noconj(a), noconj(b)     1: noconj(a), conj(b)
	     2: conj(a),   noconj(b)     3: conj(a),   conj(b) */ \
	int conj_case; \
\
	if      ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) conj_case = 0; \
	else if ( bli_is_noconj( conja ) && bli_is_conj( conjb )   ) conj_case = 1; \
	else if ( bli_is_conj( conja )   && bli_is_noconj( conjb ) ) conj_case = 2; \
	else                                                         conj_case = 3; \
\
	/* Outer loop over the rows of c, inner loop over the columns. */ \
	for ( dim_t ir = 0; ir < m; ++ir ) \
	{ \
		ctype* restrict c_row = c + ir*rs_c; \
		ctype* restrict a_row = a + ir*rs_a; \
\
		for ( dim_t jc = 0; jc < n; ++jc ) \
		{ \
			ctype* restrict c_ij  = c_row + jc*cs_c; \
			ctype* restrict b_col = b     + jc*cs_b; \
			ctype           acc; \
\
			PASTEMAC(ch,set0s)( acc ); \
\
			/* Dot product of row ir of a with column jc of b. */ \
			switch ( conj_case ) \
			{ \
				case 1: \
					/* noconj(a), conj(b): accumulate with axpyjs. */ \
					for ( dim_t p = 0; p < k; ++p ) \
					{ \
						ctype* restrict a_ip = a_row + p*cs_a; \
						ctype* restrict b_pj = b_col + p*rs_b; \
\
						PASTEMAC(ch,axpyjs)( *a_ip, *b_pj, acc ); \
					} \
					break; \
\
				case 2: \
					/* conj(a), noconj(b): accumulate with dotjs. */ \
					for ( dim_t p = 0; p < k; ++p ) \
					{ \
						ctype* restrict a_ip = a_row + p*cs_a; \
						ctype* restrict b_pj = b_col + p*rs_b; \
\
						PASTEMAC(ch,dotjs)( *a_ip, *b_pj, acc ); \
					} \
					break; \
\
				default: \
					/* Cases 0 and 3 both use the unconjugated dots. */ \
					for ( dim_t p = 0; p < k; ++p ) \
					{ \
						ctype* restrict a_ip = a_row + p*cs_a; \
						ctype* restrict b_pj = b_col + p*rs_b; \
\
						PASTEMAC(ch,dots)( *a_ip, *b_pj, acc ); \
					} \
\
					/* conj(a), conj(b): conjugate the finished result
					   instead of conjugating each operand. */ \
					if ( conj_case == 3 ) \
					{ \
						PASTEMAC(ch,conjs)( acc ); \
					} \
					break; \
			} \
\
			/* Merge into c: beta == 0 overwrites c (scal2s), beta == 1
			   accumulates into c (axpys), any other beta takes the
			   general axpbys path. */ \
			if ( PASTEMAC(ch,eq0)( *beta ) ) \
			{ \
				PASTEMAC(ch,scal2s)( *alpha, acc, *c_ij ); \
			} \
			else if ( PASTEMAC(ch,eq1)( *beta ) ) \
			{ \
				PASTEMAC(ch,axpys)( *alpha, acc, *c_ij ); \
			} \
			else \
			{ \
				PASTEMAC(ch,axpbys)( *alpha, acc, *beta, *c_ij ); \
			} \
		} \
	} \
}
239 
240 INSERT_GENTFUNC_BASIC2( gemmsup_r, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
241 
242 //
243 // -- Column storage case ------------------------------------------------------
244 //
245 
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname, arch, suf ) \
\
void PASTEMAC3(ch,opname,arch,suf) \
     ( \
       conj_t              conja, \
       conj_t              conjb, \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype*     restrict alpha, \
       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
       ctype*     restrict beta, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     ) \
{ \
	/* Reference kernel that walks c one column at a time. For each element
	   (i,j) of c it accumulates, over l = 0..k-1, the product of
	   a[ i*rs_a + l*cs_a ] and b[ l*rs_b + j*cs_b ], using the scalar
	   macro selected by conja/conjb, and then merges the alpha-scaled
	   accumulator into c[ i*rs_c + j*cs_c ] according to beta.
	   The data and cntx arguments are not referenced.
	   NOTE: m, n, and k are ordinary loop bounds, so this microkernel can
	   handle arbitrarily large values of them. */ \
\
	/* Classify the requested conjugation once. The combinations are tested
	   in a fixed order and anything not matched by the first three tests
	   is treated as the conj(a), conj(b) case.
	     0: noconj(a), noconj(b)     1: noconj(a), conj(b)
	     2: conj(a),   noconj(b)     3: conj(a),   conj(b) */ \
	int conj_case; \
\
	if      ( bli_is_noconj( conja ) && bli_is_noconj( conjb ) ) conj_case = 0; \
	else if ( bli_is_noconj( conja ) && bli_is_conj( conjb )   ) conj_case = 1; \
	else if ( bli_is_conj( conja )   && bli_is_noconj( conjb ) ) conj_case = 2; \
	else                                                         conj_case = 3; \
\
	/* Outer loop over the columns of c, inner loop over the rows. */ \
	for ( dim_t jc = 0; jc < n; ++jc ) \
	{ \
		ctype* restrict c_col = c + jc*cs_c; \
		ctype* restrict b_col = b + jc*cs_b; \
\
		for ( dim_t ir = 0; ir < m; ++ir ) \
		{ \
			ctype* restrict c_ij  = c_col + ir*rs_c; \
			ctype* restrict a_row = a     + ir*rs_a; \
			ctype           acc; \
\
			PASTEMAC(ch,set0s)( acc ); \
\
			/* Dot product of row ir of a with column jc of b. */ \
			switch ( conj_case ) \
			{ \
				case 1: \
					/* noconj(a), conj(b): accumulate with axpyjs. */ \
					for ( dim_t p = 0; p < k; ++p ) \
					{ \
						ctype* restrict a_ip = a_row + p*cs_a; \
						ctype* restrict b_pj = b_col + p*rs_b; \
\
						PASTEMAC(ch,axpyjs)( *a_ip, *b_pj, acc ); \
					} \
					break; \
\
				case 2: \
					/* conj(a), noconj(b): accumulate with dotjs. */ \
					for ( dim_t p = 0; p < k; ++p ) \
					{ \
						ctype* restrict a_ip = a_row + p*cs_a; \
						ctype* restrict b_pj = b_col + p*rs_b; \
\
						PASTEMAC(ch,dotjs)( *a_ip, *b_pj, acc ); \
					} \
					break; \
\
				default: \
					/* Cases 0 and 3 both use the unconjugated dots. */ \
					for ( dim_t p = 0; p < k; ++p ) \
					{ \
						ctype* restrict a_ip = a_row + p*cs_a; \
						ctype* restrict b_pj = b_col + p*rs_b; \
\
						PASTEMAC(ch,dots)( *a_ip, *b_pj, acc ); \
					} \
\
					/* conj(a), conj(b): conjugate the finished result
					   instead of conjugating each operand. */ \
					if ( conj_case == 3 ) \
					{ \
						PASTEMAC(ch,conjs)( acc ); \
					} \
					break; \
			} \
\
			/* Merge into c: beta == 0 overwrites c (scal2s), beta == 1
			   accumulates into c (axpys), any other beta takes the
			   general axpbys path. */ \
			if ( PASTEMAC(ch,eq0)( *beta ) ) \
			{ \
				PASTEMAC(ch,scal2s)( *alpha, acc, *c_ij ); \
			} \
			else if ( PASTEMAC(ch,eq1)( *beta ) ) \
			{ \
				PASTEMAC(ch,axpys)( *alpha, acc, *c_ij ); \
			} \
			else \
			{ \
				PASTEMAC(ch,axpbys)( *alpha, acc, *beta, *c_ij ); \
			} \
		} \
	} \
}
444 
445 INSERT_GENTFUNC_BASIC2( gemmsup_c, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
446 
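//
// -- General storage case -----------------------------------------------------
//
// NOTE: No separate GENTFUNC template is defined for general storage. The
// instantiation below reuses the column-storage template defined above, which
// addresses c through both rs_c and cs_c.
//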
450 
451 INSERT_GENTFUNC_BASIC2( gemmsup_g, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
452 
453 
454 
455 
456 
457 
458 
459 
460 #if 0
461 
462 //
463 // -- Row storage case ---------------------------------------------------------
464 //
465 
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname, arch, suf ) \
\
void PASTEMAC3(ch,opname,arch,suf) \
     ( \
       conj_t              conja, \
       conj_t              conjb, \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype*     restrict alpha, \
       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
       ctype*     restrict beta, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     ) \
{ \
	/* Row-storage variant built from k rank-1 updates into a temporary
	   m x n tile (ab), which is then scaled by alpha and merged into c
	   according to beta.

	   NOTE(review): conja, conjb, data, and cntx are never referenced in
	   this body, so no conjugation is applied to a or b. Confirm that is
	   acceptable before enabling this code. */ \
\
	const dim_t     mn     = m * n; \
\
	ctype           ab[ BLIS_STACK_BUF_MAX_SIZE \
	                    / sizeof( ctype ) ] \
	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	const inc_t     rs_ab  = n; \
	const inc_t     cs_ab  = 1; \
\
\
	/* Assumptions: m <= mr, n <= nr so that the temporary array ab is
	   sufficiently large enough to hold the m x n microtile. Nothing in
	   this function checks that m * n fits in ab.

	   The ability to handle m < mr and n < nr is being provided so that
	   optimized ukernels can call one of these reference implementations
	   for their edge cases, if they choose. When they do so, they will
	   need to call the function directly, by its configuration-mangled
	   name, since it will have been overwritten in the context when
	   the optimized ukernel functions are registered. */ \
\
\
	/* Initialize the accumulator elements in ab to zero. */ \
	for ( dim_t i = 0; i < mn; ++i ) \
	{ \
		PASTEMAC(ch,set0s)( ab[i] ); \
	} \
\
	/* Perform a series of k rank-1 updates into ab. The a and b pointers
	   are advanced by cs_a and rs_b after every update, so a[ i*rs_a ]
	   and b[ j*cs_b ] always refer to the current column of a and row
	   of b. */ \
	for ( dim_t l = 0; l < k; ++l ) \
	{ \
		/* Traverse ab by rows; assume cs_ab = 1. */ \
		for ( dim_t i = 0; i < m; ++i ) \
		{ \
			for ( dim_t j = 0; j < n; ++j ) \
			{ \
				PASTEMAC(ch,dots) \
				( \
				  a[ i*rs_a ], \
				  b[ j*cs_b ], \
				  ab[ i*rs_ab + j*cs_ab ]  \
				); \
			} \
		} \
\
		a += cs_a; \
		b += rs_b; \
	} \
\
	/* Scale the result in ab by alpha. */ \
	for ( dim_t i = 0; i < mn; ++i ) \
	{ \
		PASTEMAC(ch,scals)( *alpha, ab[i] ); \
	} \
\
\
	/* If beta is one, add ab into c. If beta is zero, overwrite c with the
	   result in ab. Otherwise, scale by beta and accumulate ab to c. */ \
	if ( PASTEMAC(ch,eq1)( *beta ) ) \
	{ \
		/* Traverse ab and c by rows; assume cs_ab = cs_c = 1. */ \
		for ( dim_t i = 0; i < m; ++i ) \
		for ( dim_t j = 0; j < n; ++j ) \
		{ \
			PASTEMAC(ch,adds) \
			( \
			  ab[ i*rs_ab + j*1 ], \
			  c[  i*rs_c  + j*1 ]  \
			); \
		} \
	} \
	else if ( PASTEMAC(ch,eq0)( *beta ) ) \
	{ \
		/* Traverse ab and c by rows; assume cs_ab = cs_c = 1. */ \
		for ( dim_t i = 0; i < m; ++i ) \
		for ( dim_t j = 0; j < n; ++j ) \
		{ \
			PASTEMAC(ch,copys) \
			( \
			  ab[ i*rs_ab + j*1 ], \
			  c[  i*rs_c  + j*1 ]  \
			); \
		} \
	} \
	else /* beta != 0 && beta != 1 */ \
	{ \
		/* Traverse ab and c by rows; assume cs_ab = cs_c = 1. */ \
		for ( dim_t i = 0; i < m; ++i ) \
		for ( dim_t j = 0; j < n; ++j ) \
		{ \
			PASTEMAC(ch,xpbys) \
			( \
			  ab[ i*rs_ab + j*1 ], \
			  *beta, \
			  c[  i*rs_c  + j*1 ]  \
			); \
		} \
	} \
}
583 
584 INSERT_GENTFUNC_BASIC2( gemmsup_r, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
585 
586 //
587 // -- Column storage case ------------------------------------------------------
588 //
589 
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname, arch, suf ) \
\
void PASTEMAC3(ch,opname,arch,suf) \
     ( \
       conj_t              conja, \
       conj_t              conjb, \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype*     restrict alpha, \
       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
       ctype*     restrict beta, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     ) \
{ \
	/* Column-storage variant built from k rank-1 updates into a temporary
	   m x n tile (ab), which is then scaled by alpha and merged into c
	   according to beta.

	   NOTE(review): conja, conjb, data, and cntx are never referenced in
	   this body, so no conjugation is applied to a or b. Confirm that is
	   acceptable before enabling this code. */ \
\
	const dim_t     mn     = m * n; \
\
	ctype           ab[ BLIS_STACK_BUF_MAX_SIZE \
	                    / sizeof( ctype ) ] \
	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	const inc_t     rs_ab  = 1; \
	const inc_t     cs_ab  = m; \
\
\
	/* Assumptions: m <= mr, n <= nr so that the temporary array ab is
	   sufficiently large enough to hold the m x n microtile. Nothing in
	   this function checks that m * n fits in ab.

	   The ability to handle m < mr and n < nr is being provided so that
	   optimized ukernels can call one of these reference implementations
	   for their edge cases, if they choose. When they do so, they will
	   need to call the function directly, by its configuration-mangled
	   name, since it will have been overwritten in the context when
	   the optimized ukernel functions are registered. */ \
\
\
	/* Initialize the accumulator elements in ab to zero. */ \
	for ( dim_t i = 0; i < mn; ++i ) \
	{ \
		PASTEMAC(ch,set0s)( ab[i] ); \
	} \
\
	/* Perform a series of k rank-1 updates into ab. The a and b pointers
	   are advanced by cs_a and rs_b after every update, so a[ i*rs_a ]
	   and b[ j*cs_b ] always refer to the current column of a and row
	   of b. */ \
	for ( dim_t l = 0; l < k; ++l ) \
	{ \
		/* Traverse ab by columns; assume rs_ab = 1. */ \
		for ( dim_t j = 0; j < n; ++j ) \
		{ \
			for ( dim_t i = 0; i < m; ++i ) \
			{ \
				PASTEMAC(ch,dots) \
				( \
				  a[ i*rs_a ], \
				  b[ j*cs_b ], \
				  ab[ i*rs_ab + j*cs_ab ]  \
				); \
			} \
		} \
\
		a += cs_a; \
		b += rs_b; \
	} \
\
	/* Scale the result in ab by alpha. */ \
	for ( dim_t i = 0; i < mn; ++i ) \
	{ \
		PASTEMAC(ch,scals)( *alpha, ab[i] ); \
	} \
\
\
	/* If beta is one, add ab into c. If beta is zero, overwrite c with the
	   result in ab. Otherwise, scale by beta and accumulate ab to c. */ \
	if ( PASTEMAC(ch,eq1)( *beta ) ) \
	{ \
		/* Traverse ab and c by columns; assume rs_ab = rs_c = 1. */ \
		for ( dim_t j = 0; j < n; ++j ) \
		for ( dim_t i = 0; i < m; ++i ) \
		{ \
			PASTEMAC(ch,adds) \
			( \
			  ab[ i*1 + j*cs_ab ], \
			  c[  i*1 + j*cs_c  ]  \
			); \
		} \
	} \
	else if ( PASTEMAC(ch,eq0)( *beta ) ) \
	{ \
		/* Traverse ab and c by columns; assume rs_ab = rs_c = 1. */ \
		for ( dim_t j = 0; j < n; ++j ) \
		for ( dim_t i = 0; i < m; ++i ) \
		{ \
			PASTEMAC(ch,copys) \
			( \
			  ab[ i*1 + j*cs_ab ], \
			  c[  i*1 + j*cs_c  ]  \
			); \
		} \
	} \
	else /* beta != 0 && beta != 1 */ \
	{ \
		/* Traverse ab and c by columns; assume rs_ab = rs_c = 1. */ \
		for ( dim_t j = 0; j < n; ++j ) \
		for ( dim_t i = 0; i < m; ++i ) \
		{ \
			PASTEMAC(ch,xpbys) \
			( \
			  ab[ i*1 + j*cs_ab ], \
			  *beta, \
			  c[  i*1 + j*cs_c  ]  \
			); \
		} \
	} \
}
706 
707 INSERT_GENTFUNC_BASIC2( gemmsup_c, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
708 
709 //
710 // -- General storage case -----------------------------------------------------
711 //
712 
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname, arch, suf ) \
\
void PASTEMAC3(ch,opname,arch,suf) \
     ( \
       conj_t              conja, \
       conj_t              conjb, \
       dim_t               m, \
       dim_t               n, \
       dim_t               k, \
       ctype*     restrict alpha, \
       ctype*     restrict a, inc_t rs_a, inc_t cs_a, \
       ctype*     restrict b, inc_t rs_b, inc_t cs_b, \
       ctype*     restrict beta, \
       ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
       auxinfo_t* restrict data, \
       cntx_t*    restrict cntx  \
     ) \
{ \
	/* General-storage variant built from k rank-1 updates into a temporary
	   m x n tile (ab, kept column-major: rs_ab = 1, cs_ab = m), which is
	   then scaled by alpha and merged into c according to beta. c is
	   addressed with both rs_c and cs_c.

	   NOTE(review): conja, conjb, data, and cntx are never referenced in
	   this body, so no conjugation is applied to a or b. Confirm that is
	   acceptable before enabling this code. */ \
\
	const dim_t     mn     = m * n; \
\
	ctype           ab[ BLIS_STACK_BUF_MAX_SIZE \
	                    / sizeof( ctype ) ] \
	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
	const inc_t     rs_ab  = 1; \
	const inc_t     cs_ab  = m; \
\
\
	/* Assumptions: m <= mr, n <= nr so that the temporary array ab is
	   sufficiently large enough to hold the m x n microtile. Nothing in
	   this function checks that m * n fits in ab.

	   The ability to handle m < mr and n < nr is being provided so that
	   optimized ukernels can call one of these reference implementations
	   for their edge cases, if they choose. When they do so, they will
	   need to call the function directly, by its configuration-mangled
	   name, since it will have been overwritten in the context when
	   the optimized ukernel functions are registered. */ \
\
\
	/* Initialize the accumulator elements in ab to zero. */ \
	for ( dim_t i = 0; i < mn; ++i ) \
	{ \
		PASTEMAC(ch,set0s)( ab[i] ); \
	} \
\
	/* Perform a series of k rank-1 updates into ab. The a and b pointers
	   are advanced by cs_a and rs_b after every update, so a[ i*rs_a ]
	   and b[ j*cs_b ] always refer to the current column of a and row
	   of b. */ \
	for ( dim_t l = 0; l < k; ++l ) \
	{ \
		/* General storage: doesn't matter how we traverse ab. */ \
		for ( dim_t j = 0; j < n; ++j ) \
		{ \
			for ( dim_t i = 0; i < m; ++i ) \
			{ \
				PASTEMAC(ch,dots) \
				( \
				  a[ i*rs_a ], \
				  b[ j*cs_b ], \
				  ab[ i*rs_ab + j*cs_ab ]  \
				); \
			} \
		} \
\
		a += cs_a; \
		b += rs_b; \
	} \
\
	/* Scale the result in ab by alpha. */ \
	for ( dim_t i = 0; i < mn; ++i ) \
	{ \
		PASTEMAC(ch,scals)( *alpha, ab[i] ); \
	} \
\
\
	/* If beta is one, add ab into c. If beta is zero, overwrite c with the
	   result in ab. Otherwise, scale by beta and accumulate ab to c. */ \
	if ( PASTEMAC(ch,eq1)( *beta ) ) \
	{ \
		/* General storage: doesn't matter how we traverse ab and c. */ \
		for ( dim_t j = 0; j < n; ++j ) \
		for ( dim_t i = 0; i < m; ++i ) \
		{ \
			PASTEMAC(ch,adds) \
			( \
			  ab[ i*rs_ab + j*cs_ab ], \
			  c[  i*rs_c  + j*cs_c  ]  \
			); \
		} \
	} \
	else if ( PASTEMAC(ch,eq0)( *beta ) ) \
	{ \
		/* General storage: doesn't matter how we traverse ab and c. */ \
		for ( dim_t j = 0; j < n; ++j ) \
		for ( dim_t i = 0; i < m; ++i ) \
		{ \
			PASTEMAC(ch,copys) \
			( \
			  ab[ i*rs_ab + j*cs_ab ], \
			  c[  i*rs_c  + j*cs_c  ]  \
			); \
		} \
	} \
	else /* beta != 0 && beta != 1 */ \
	{ \
		/* General storage: doesn't matter how we traverse ab and c. */ \
		for ( dim_t j = 0; j < n; ++j ) \
		for ( dim_t i = 0; i < m; ++i ) \
		{ \
			PASTEMAC(ch,xpbys) \
			( \
			  ab[ i*rs_ab + j*cs_ab ], \
			  *beta, \
			  c[  i*rs_c  + j*cs_c  ]  \
			); \
		} \
	} \
}
829 
830 INSERT_GENTFUNC_BASIC2( gemmsup_g, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
831 
832 #endif
833