1 /*
2 
3    BLIS
4    An object-based framework for developing high-performance BLAS-like
5    libraries.
6 
7    Copyright (C) 2014, The University of Texas at Austin
8 
9    Redistribution and use in source and binary forms, with or without
10    modification, are permitted provided that the following conditions are
11    met:
12     - Redistributions of source code must retain the above copyright
13       notice, this list of conditions and the following disclaimer.
14     - Redistributions in binary form must reproduce the above copyright
15       notice, this list of conditions and the following disclaimer in the
16       documentation and/or other materials provided with the distribution.
17     - Neither the name(s) of the copyright holder(s) nor the names of its
18       contributors may be used to endorse or promote products derived
19       from this software without specific prior written permission.
20 
21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 
33 */
34 
35 #include "blis.h"
36 
37 //
38 // Define BLAS-like interfaces with typed operands.
39 //
40 
41 #undef  GENTFUNC
42 #define GENTFUNC( ctype, ch, opname, kername, kerid ) \
43 \
44 void PASTEMAC(ch,opname) \
45      ( \
46        doff_t  diagoffx, \
47        diag_t  diagx, \
48        uplo_t  uplox, \
49        trans_t transx, \
50        dim_t   m, \
51        dim_t   n, \
52        ctype*  x, inc_t rs_x, inc_t cs_x, \
53        ctype*  y, inc_t rs_y, inc_t cs_y, \
54        cntx_t* cntx, \
55        rntm_t* rntm  \
56      ) \
57 { \
58 	const num_t dt = PASTEMAC(ch,type); \
59 \
60 	ctype*   x1; \
61 	ctype*   y1; \
62 	uplo_t   uplox_eff; \
63 	conj_t   conjx; \
64 	dim_t    n_iter; \
65 	dim_t    n_elem, n_elem_max; \
66 	inc_t    ldx, incx; \
67 	inc_t    ldy, incy; \
68 	dim_t    j, i; \
69 	dim_t    ij0, n_shift; \
70 \
71 	/* Set various loop parameters. */ \
72 	bli_set_dims_incs_uplo_2m \
73 	( \
74 	  diagoffx, diagx, transx, \
75 	  uplox, m, n, rs_x, cs_x, rs_y, cs_y, \
76 	  &uplox_eff, &n_elem_max, &n_iter, &incx, &ldx, &incy, &ldy, \
77 	  &ij0, &n_shift \
78 	); \
79 \
80 	if ( bli_is_zeros( uplox_eff ) ) return; \
81 \
82 	/* Extract the conjugation component from the transx parameter. */ \
83 	conjx = bli_extract_conj( transx ); \
84 \
85 	/* Query the kernel needed for this operation. */ \
86 	PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
87 \
88 	/* Handle dense and upper/lower storage cases separately. */ \
89 	if ( bli_is_dense( uplox_eff ) ) \
90 	{ \
91 		for ( j = 0; j < n_iter; ++j ) \
92 		{ \
93 			n_elem = n_elem_max; \
94 \
95 			x1     = x + (j  )*ldx + (0  )*incx; \
96 			y1     = y + (j  )*ldy + (0  )*incy; \
97 \
98 			/* Invoke the kernel with the appropriate parameters. */ \
99 			f( \
100 			   conjx, \
101 			   n_elem, \
102 			   x1, incx, \
103 			   y1, incy, \
104 			   cntx  \
105 			 ); \
106 		} \
107 	} \
108 	else \
109 	{ \
110 		if ( bli_is_upper( uplox_eff ) ) \
111 		{ \
112 			for ( j = 0; j < n_iter; ++j ) \
113 			{ \
114 				n_elem = bli_min( n_shift + j + 1, n_elem_max ); \
115 \
116 				x1     = x + (ij0+j  )*ldx + (0  )*incx; \
117 				y1     = y + (ij0+j  )*ldy + (0  )*incy; \
118 \
119 				/* Invoke the kernel with the appropriate parameters. */ \
120 				f( \
121 				   conjx, \
122 				   n_elem, \
123 				   x1, incx, \
124 				   y1, incy, \
125 				   cntx  \
126 				 ); \
127 			} \
128 		} \
129 		else if ( bli_is_lower( uplox_eff ) ) \
130 		{ \
131 			for ( j = 0; j < n_iter; ++j ) \
132 			{ \
133 				i      = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); \
134 				n_elem = n_elem_max - i; \
135 \
136 				x1     = x + (j  )*ldx + (ij0+i  )*incx; \
137 				y1     = y + (j  )*ldy + (ij0+i  )*incy; \
138 \
139 				/* Invoke the kernel with the appropriate parameters. */ \
140 				f( \
141 				   conjx, \
142 				   n_elem, \
143 				   x1, incx, \
144 				   y1, incy, \
145 				   cntx  \
146 				 ); \
147 			} \
148 		} \
149 	} \
150 }
151 
152 INSERT_GENTFUNC_BASIC2( addm_unb_var1,  addv,  BLIS_ADDV_KER )
153 INSERT_GENTFUNC_BASIC2( copym_unb_var1, copyv, BLIS_COPYV_KER )
154 INSERT_GENTFUNC_BASIC2( subm_unb_var1,  subv,  BLIS_SUBV_KER )
155 
156 
157 #undef  GENTFUNC
158 #define GENTFUNC( ctype, ch, opname, kername, kerid ) \
159 \
160 void PASTEMAC(ch,opname) \
161      ( \
162        doff_t  diagoffx, \
163        diag_t  diagx, \
164        uplo_t  uplox, \
165        trans_t transx, \
166        dim_t   m, \
167        dim_t   n, \
168        ctype*  alpha, \
169        ctype*  x, inc_t rs_x, inc_t cs_x, \
170        ctype*  y, inc_t rs_y, inc_t cs_y, \
171        cntx_t* cntx, \
172        rntm_t* rntm  \
173      ) \
174 { \
175 	const num_t dt = PASTEMAC(ch,type); \
176 \
177 	ctype*   x1; \
178 	ctype*   y1; \
179 	uplo_t   uplox_eff; \
180 	conj_t   conjx; \
181 	dim_t    n_iter; \
182 	dim_t    n_elem, n_elem_max; \
183 	inc_t    ldx, incx; \
184 	inc_t    ldy, incy; \
185 	dim_t    j, i; \
186 	dim_t    ij0, n_shift; \
187 \
188 	/* Set various loop parameters. */ \
189 	bli_set_dims_incs_uplo_2m \
190 	( \
191 	  diagoffx, diagx, transx, \
192 	  uplox, m, n, rs_x, cs_x, rs_y, cs_y, \
193 	  &uplox_eff, &n_elem_max, &n_iter, &incx, &ldx, &incy, &ldy, \
194 	  &ij0, &n_shift \
195 	); \
196 \
197 	if ( bli_is_zeros( uplox_eff ) ) return; \
198 \
199 	/* Extract the conjugation component from the transx parameter. */ \
200 	conjx = bli_extract_conj( transx ); \
201 \
202 	/* Query the kernel needed for this operation. */ \
203 	PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
204 \
205 	/* Handle dense and upper/lower storage cases separately. */ \
206 	if ( bli_is_dense( uplox_eff ) ) \
207 	{ \
208 		for ( j = 0; j < n_iter; ++j ) \
209 		{ \
210 			n_elem = n_elem_max; \
211 \
212 			x1     = x + (j  )*ldx + (0  )*incx; \
213 			y1     = y + (j  )*ldy + (0  )*incy; \
214 \
215 			/* Invoke the kernel with the appropriate parameters. */ \
216 			f( \
217 			   conjx, \
218 			   n_elem, \
219 			   alpha, \
220 			   x1, incx, \
221 			   y1, incy, \
222 			   cntx  \
223 			 ); \
224 		} \
225 	} \
226 	else \
227 	{ \
228 		if ( bli_is_upper( uplox_eff ) ) \
229 		{ \
230 			for ( j = 0; j < n_iter; ++j ) \
231 			{ \
232 				n_elem = bli_min( n_shift + j + 1, n_elem_max ); \
233 \
234 				x1     = x + (ij0+j  )*ldx + (0  )*incx; \
235 				y1     = y + (ij0+j  )*ldy + (0  )*incy; \
236 \
237 				/* Invoke the kernel with the appropriate parameters. */ \
238 				f( \
239 				   conjx, \
240 				   n_elem, \
241 				   alpha, \
242 				   x1, incx, \
243 				   y1, incy, \
244 				   cntx  \
245 				 ); \
246 			} \
247 		} \
248 		else if ( bli_is_lower( uplox_eff ) ) \
249 		{ \
250 			for ( j = 0; j < n_iter; ++j ) \
251 			{ \
252 				i      = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); \
253 				n_elem = n_elem_max - i; \
254 \
255 				x1     = x + (j  )*ldx + (ij0+i  )*incx; \
256 				y1     = y + (j  )*ldy + (ij0+i  )*incy; \
257 \
258 				/* Invoke the kernel with the appropriate parameters. */ \
259 				f( \
260 				   conjx, \
261 				   n_elem, \
262 				   alpha, \
263 				   x1, incx, \
264 				   y1, incy, \
265 				   cntx  \
266 				 ); \
267 			} \
268 		} \
269 	} \
270 }
271 
272 INSERT_GENTFUNC_BASIC2( axpym_unb_var1,  axpyv,  BLIS_AXPYV_KER )
273 INSERT_GENTFUNC_BASIC2( scal2m_unb_var1, scal2v, BLIS_SCAL2V_KER )
274 
275 
276 #undef  GENTFUNC
277 #define GENTFUNC( ctype, ch, opname, kername, kerid ) \
278 \
279 void PASTEMAC(ch,opname) \
280      ( \
281        conj_t  conjalpha, \
282        doff_t  diagoffx, \
283        diag_t  diagx, \
284        uplo_t  uplox, \
285        dim_t   m, \
286        dim_t   n, \
287        ctype*  alpha, \
288        ctype*  x, inc_t rs_x, inc_t cs_x, \
289        cntx_t* cntx, \
290        rntm_t* rntm  \
291      ) \
292 { \
293 	const num_t dt = PASTEMAC(ch,type); \
294 \
295 	ctype*   x1; \
296 	uplo_t   uplox_eff; \
297 	dim_t    n_iter; \
298 	dim_t    n_elem, n_elem_max; \
299 	inc_t    ldx, incx; \
300 	dim_t    j, i; \
301 	dim_t    ij0, n_shift; \
302 \
303 	/* Set various loop parameters. */ \
304 	bli_set_dims_incs_uplo_1m \
305 	( \
306 	  diagoffx, diagx, \
307 	  uplox, m, n, rs_x, cs_x, \
308 	  &uplox_eff, &n_elem_max, &n_iter, &incx, &ldx, \
309 	  &ij0, &n_shift \
310 	); \
311 \
312 	if ( bli_is_zeros( uplox_eff ) ) return; \
313 \
314 	/* Query the kernel needed for this operation. */ \
315 	PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
316 \
317 	/* Handle dense and upper/lower storage cases separately. */ \
318 	if ( bli_is_dense( uplox_eff ) ) \
319 	{ \
320 		for ( j = 0; j < n_iter; ++j ) \
321 		{ \
322 			n_elem = n_elem_max; \
323 \
324 			x1     = x + (j  )*ldx + (0  )*incx; \
325 \
326 			/* Invoke the kernel with the appropriate parameters. */ \
327 			f( \
328 			   conjalpha, \
329 			   n_elem, \
330 			   alpha, \
331 			   x1, incx, \
332 			   cntx  \
333 			 ); \
334 		} \
335 	} \
336 	else \
337 	{ \
338 		if ( bli_is_upper( uplox_eff ) ) \
339 		{ \
340 			for ( j = 0; j < n_iter; ++j ) \
341 			{ \
342 				n_elem = bli_min( n_shift + j + 1, n_elem_max ); \
343 \
344 				x1     = x + (ij0+j  )*ldx + (0  )*incx; \
345 \
346 				/* Invoke the kernel with the appropriate parameters. */ \
347 				f( \
348 				   conjalpha, \
349 				   n_elem, \
350 				   alpha, \
351 				   x1, incx, \
352 				   cntx  \
353 				 ); \
354 			} \
355 		} \
356 		else if ( bli_is_lower( uplox_eff ) ) \
357 		{ \
358 			for ( j = 0; j < n_iter; ++j ) \
359 			{ \
360 				i      = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); \
361 				n_elem = n_elem_max - i; \
362 \
363 				x1     = x + (j  )*ldx + (ij0+i  )*incx; \
364 \
365 				/* Invoke the kernel with the appropriate parameters. */ \
366 				f( \
367 				   conjalpha, \
368 				   n_elem, \
369 				   alpha, \
370 				   x1, incx, \
371 				   cntx  \
372 				 ); \
373 			} \
374 		} \
375 	} \
376 }
377 
378 INSERT_GENTFUNC_BASIC2( scalm_unb_var1, scalv, BLIS_SCALV_KER )
379 INSERT_GENTFUNC_BASIC2( setm_unb_var1,  setv,  BLIS_SETV_KER )
380 
381 
382 #undef  GENTFUNC
383 #define GENTFUNC( ctype, ch, opname, kername, kerid ) \
384 \
385 void PASTEMAC(ch,opname) \
386      ( \
387        doff_t  diagoffx, \
388        diag_t  diagx, \
389        uplo_t  uplox, \
390        trans_t transx, \
391        dim_t   m, \
392        dim_t   n, \
393        ctype*  x, inc_t rs_x, inc_t cs_x, \
394        ctype*  beta, \
395        ctype*  y, inc_t rs_y, inc_t cs_y, \
396        cntx_t* cntx, \
397        rntm_t* rntm  \
398      ) \
399 { \
400 	const num_t dt = PASTEMAC(ch,type); \
401 \
402 	ctype*   x1; \
403 	ctype*   y1; \
404 	uplo_t   uplox_eff; \
405 	conj_t   conjx; \
406 	dim_t    n_iter; \
407 	dim_t    n_elem, n_elem_max; \
408 	inc_t    ldx, incx; \
409 	inc_t    ldy, incy; \
410 	dim_t    j, i; \
411 	dim_t    ij0, n_shift; \
412 \
413 	/* Set various loop parameters. */ \
414 	bli_set_dims_incs_uplo_2m \
415 	( \
416 	  diagoffx, diagx, transx, \
417 	  uplox, m, n, rs_x, cs_x, rs_y, cs_y, \
418 	  &uplox_eff, &n_elem_max, &n_iter, &incx, &ldx, &incy, &ldy, \
419 	  &ij0, &n_shift \
420 	); \
421 \
422 	if ( bli_is_zeros( uplox_eff ) ) return; \
423 \
424 	/* Extract the conjugation component from the transx parameter. */ \
425 	conjx = bli_extract_conj( transx ); \
426 \
427 	/* Query the kernel needed for this operation. */ \
428 	PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \
429 \
430 	/* Handle dense and upper/lower storage cases separately. */ \
431 	if ( bli_is_dense( uplox_eff ) ) \
432 	{ \
433 		for ( j = 0; j < n_iter; ++j ) \
434 		{ \
435 			n_elem = n_elem_max; \
436 \
437 			x1     = x + (j  )*ldx + (0  )*incx; \
438 			y1     = y + (j  )*ldy + (0  )*incy; \
439 \
440 			/* Invoke the kernel with the appropriate parameters. */ \
441 			f( \
442 			   conjx, \
443 			   n_elem, \
444 			   x1, incx, \
445 			   beta, \
446 			   y1, incy, \
447 			   cntx  \
448 			 ); \
449 		} \
450 	} \
451 	else \
452 	{ \
453 		if ( bli_is_upper( uplox_eff ) ) \
454 		{ \
455 			for ( j = 0; j < n_iter; ++j ) \
456 			{ \
457 				n_elem = bli_min( n_shift + j + 1, n_elem_max ); \
458 \
459 				x1     = x + (ij0+j  )*ldx + (0  )*incx; \
460 				y1     = y + (ij0+j  )*ldy + (0  )*incy; \
461 \
462 				/* Invoke the kernel with the appropriate parameters. */ \
463 				f( \
464 				   conjx, \
465 				   n_elem, \
466 				   x1, incx, \
467 				   beta, \
468 				   y1, incy, \
469 				   cntx  \
470 				 ); \
471 			} \
472 		} \
473 		else if ( bli_is_lower( uplox_eff ) ) \
474 		{ \
475 			for ( j = 0; j < n_iter; ++j ) \
476 			{ \
477 				i      = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); \
478 				n_elem = n_elem_max - i; \
479 \
480 				x1     = x + (j  )*ldx + (ij0+i  )*incx; \
481 				y1     = y + (j  )*ldy + (ij0+i  )*incy; \
482 \
483 				/* Invoke the kernel with the appropriate parameters. */ \
484 				f( \
485 				   conjx, \
486 				   n_elem, \
487 				   x1, incx, \
488 				   beta, \
489 				   y1, incy, \
490 				   cntx  \
491 				 ); \
492 			} \
493 		} \
494 	} \
495 }
496 
497 INSERT_GENTFUNC_BASIC2( xpbym_unb_var1,  xpbyv,  BLIS_XPBYV_KER )
498 
499 
500 #undef  GENTFUNC2
501 #define GENTFUNC2( ctype_x, ctype_y, chx, chy, opname ) \
502 \
503 void PASTEMAC2(chx,chy,opname) \
504      ( \
505        doff_t   diagoffx, \
506        diag_t   diagx, \
507        uplo_t   uplox, \
508        trans_t  transx, \
509        dim_t    m, \
510        dim_t    n, \
511        ctype_x* x, inc_t rs_x, inc_t cs_x, \
512        ctype_y* beta, \
513        ctype_y* y, inc_t rs_y, inc_t cs_y, \
514        cntx_t*  cntx, \
515        rntm_t*  rntm  \
516      ) \
517 { \
518 	ctype_x* restrict x1; \
519 	ctype_y* restrict y1; \
520 	uplo_t            uplox_eff; \
521 	dim_t             n_iter; \
522 	dim_t             n_elem, n_elem_max; \
523 	inc_t             ldx, incx; \
524 	inc_t             ldy, incy; \
525 	dim_t             j, i; \
526 	dim_t             ij0, n_shift; \
527 \
528 	/* Set various loop parameters. */ \
529 	bli_set_dims_incs_uplo_2m \
530 	( \
531 	  diagoffx, diagx, transx, \
532 	  uplox, m, n, rs_x, cs_x, rs_y, cs_y, \
533 	  &uplox_eff, &n_elem_max, &n_iter, &incx, &ldx, &incy, &ldy, \
534 	  &ij0, &n_shift \
535 	); \
536 \
537 	/* Extract the conjugation component from the transx parameter. */ \
538 	/*conjx = bli_extract_conj( transx );*/ \
539 \
540 	/* Handle dense and upper/lower storage cases separately. */ \
541 	if ( PASTEMAC(chy,eq1)( *beta ) ) \
542 	{ \
543 		if ( incx == 1 && incy == 1 ) \
544 		{ \
545 			n_elem = n_elem_max; \
546 \
547 			for ( j = 0; j < n_iter; ++j ) \
548 			{ \
549 				x1     = x + (j  )*ldx + (0  )*incx; \
550 				y1     = y + (j  )*ldy + (0  )*incy; \
551 \
552 				ctype_x* restrict chi1 = x1; \
553 				ctype_y* restrict psi1 = y1; \
554 \
555 				for ( i = 0; i < n_elem; ++i ) \
556 				{ \
557 					PASTEMAC2(chx,chy,adds)( chi1[i], psi1[i] ); \
558 				} \
559 			} \
560 		} \
561 		else \
562 		{ \
563 			n_elem = n_elem_max; \
564 \
565 			for ( j = 0; j < n_iter; ++j ) \
566 			{ \
567 				x1     = x + (j  )*ldx + (0  )*incx; \
568 				y1     = y + (j  )*ldy + (0  )*incy; \
569 \
570 				ctype_x* restrict chi1 = x1; \
571 				ctype_y* restrict psi1 = y1; \
572 \
573 				for ( i = 0; i < n_elem; ++i ) \
574 				{ \
575 					PASTEMAC2(chx,chy,adds)( *chi1, *psi1 ); \
576 \
577 					chi1 += incx; \
578 					psi1 += incy; \
579 				} \
580 			} \
581 		} \
582 	} \
583 	else /* ( !PASTEMAC(chy,eq1)( *beta ) ) */ \
584 	{ \
585 		if ( incx == 1 && incy == 1 ) \
586 		{ \
587 			n_elem = n_elem_max; \
588 \
589 			for ( j = 0; j < n_iter; ++j ) \
590 			{ \
591 				x1     = x + (j  )*ldx + (0  )*incx; \
592 				y1     = y + (j  )*ldy + (0  )*incy; \
593 \
594 				ctype_x* restrict chi1 = x1; \
595 				ctype_y* restrict psi1 = y1; \
596 \
597 				for ( i = 0; i < n_elem; ++i ) \
598 				{ \
599 					PASTEMAC3(chx,chy,chy,xpbys)( chi1[i], *beta, psi1[i] ); \
600 				} \
601 			} \
602 		} \
603 		else \
604 		{ \
605 			n_elem = n_elem_max; \
606 \
607 			for ( j = 0; j < n_iter; ++j ) \
608 			{ \
609 				x1     = x + (j  )*ldx + (0  )*incx; \
610 				y1     = y + (j  )*ldy + (0  )*incy; \
611 \
612 				ctype_x* restrict chi1 = x1; \
613 				ctype_y* restrict psi1 = y1; \
614 \
615 				for ( i = 0; i < n_elem; ++i ) \
616 				{ \
617 					PASTEMAC3(chx,chy,chy,xpbys)( *chi1, *beta, *psi1 ); \
618 \
619 					chi1 += incx; \
620 					psi1 += incy; \
621 				} \
622 			} \
623 		} \
624 	} \
625 }
626 
627 INSERT_GENTFUNC2_BASIC0( xpbym_md_unb_var1 )
628 INSERT_GENTFUNC2_MIXDP0( xpbym_md_unb_var1 )
629 
630