1 /*
2 
3    BLIS
4    An object-based framework for developing high-performance BLAS-like
5    libraries.
6 
7    Copyright (C) 2014, The University of Texas at Austin
8 
9    Redistribution and use in source and binary forms, with or without
10    modification, are permitted provided that the following conditions are
11    met:
12     - Redistributions of source code must retain the above copyright
13       notice, this list of conditions and the following disclaimer.
14     - Redistributions in binary form must reproduce the above copyright
15       notice, this list of conditions and the following disclaimer in the
16       documentation and/or other materials provided with the distribution.
17     - Neither the name(s) of the copyright holder(s) nor the names of its
18       contributors may be used to endorse or promote products derived
19       from this software without specific prior written permission.
20 
21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 
33 */
34 
35 #include "blis.h"
36 
37 
/*
   Reference dotxaxpyf kernel, generated per datatype by GENTFUNC.

   With A viewed as an m x b_n matrix, the kernel performs two updates:

     y := beta * y + alpha * A^T w
     z :=        z + alpha * A   x

   conjat and conjw are the conjugation flags used for A and w in the first
   update; conja and conjx are the flags used for A and x in the second.

   Macro arguments:
     ctype   element type of alpha, a, w, x, beta, y and z.
     ch      datatype character pasted into the scalar macro names.
     opname, arch, suf
             pieces pasted together (with ch) to form the function name.
     ff      number of columns handled by the fused code path; also the
             length of the two on-stack temporaries.

   When every vector/row stride is 1 and b_n == ff, both updates are computed
   in a single sweep over A. Otherwise the dotxf and axpyf kernels registered
   in cntx are called one after the other.

   NOTE(review): the conjugating scalar macros (scal2js, axpyjs, conjs) are
   defined elsewhere (presumably via blis.h); their exact operand conventions
   are not visible in this file.
*/
#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname, arch, suf, ff ) \
\
void PASTEMAC3(ch,opname,arch,suf) \
     ( \
       conj_t           conjat, \
       conj_t           conja, \
       conj_t           conjw, \
       conj_t           conjx, \
       dim_t            m, \
       dim_t            b_n, \
       ctype*  restrict alpha, \
       ctype*  restrict a, inc_t inca, inc_t lda, \
       ctype*  restrict w, inc_t incw, \
       ctype*  restrict x, inc_t incx, \
       ctype*  restrict beta, \
       ctype*  restrict y, inc_t incy, \
       ctype*  restrict z, inc_t incz, \
       cntx_t* restrict cntx  \
     ) \
{ \
	/* General case: any non-unit stride, or a column count other than ff. */ \
	/* Look up the dotxf and axpyf kernels in the context and run them in  */ \
	/* sequence on the same operands.                                      */ \
	if ( inca != 1 || incw != 1 || incx != 1 || \
	     incy != 1 || incz != 1 || b_n != ff ) \
	{ \
		const num_t dt = PASTEMAC(ch,type); \
\
		PASTECH(ch,dotxf_ker_ft) dotxf_fp = \
			bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \
		PASTECH(ch,axpyf_ker_ft) axpyf_fp = \
			bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \
\
		/* y = beta * y + alpha * A^T w; */ \
		dotxf_fp( conjat, conjw, \
		          m, b_n, \
		          alpha, \
		          a, inca, lda, \
		          w, incw, \
		          beta, \
		          y, incy, \
		          cntx ); \
\
		/* z = z + alpha * A x; */ \
		axpyf_fp( conja, conjx, \
		          m, b_n, \
		          alpha, \
		          a, inca, lda, \
		          x, incx, \
		          z, incz, \
		          cntx ); \
\
		return; \
	} \
\
	/* Fused case from here on: unit strides and exactly ff columns, so    */ \
	/* element (i, j) of A is a[i + j*lda].                                */ \
\
	/* Apply beta to y first. A zero beta overwrites y with zeros instead  */ \
	/* of multiplying by it.                                               */ \
	if ( PASTEMAC(ch,eq0)( *beta ) ) \
	{ \
		for ( dim_t j = 0; j < ff; ++j ) { PASTEMAC(ch,set0s)( y[j] ); } \
	} \
	else \
	{ \
		for ( dim_t j = 0; j < ff; ++j ) { PASTEMAC(ch,scals)( *beta, y[j] ); } \
	} \
\
	/* With no rows, or with alpha equal to zero, y is already final and   */ \
	/* z receives no contribution.                                         */ \
	if ( bli_zero_dim1( m ) ) return; \
	if ( PASTEMAC(ch,eq0)( *alpha ) ) return; \
\
	/* One dot-product accumulator per column of A, starting at zero. */ \
	ctype atw[ ff ]; \
	for ( dim_t j = 0; j < ff; ++j ) { PASTEMAC(ch,set0s)( atw[j] ); } \
\
	/* Pre-scale x by alpha into a temporary so the sweep over A does not  */ \
	/* have to; the conjugating variant is used when conjx requests it.    */ \
	ctype alpha_x[ ff ]; \
	if ( bli_is_conj( conjx ) ) \
	{ \
		PRAGMA_SIMD \
		for ( dim_t j = 0; j < ff; ++j ) \
		{ \
			PASTEMAC(ch,scal2js)( *alpha, x[j], alpha_x[j] ); \
		} \
	} \
	else \
	{ \
		PRAGMA_SIMD \
		for ( dim_t j = 0; j < ff; ++j ) \
		{ \
			PASTEMAC(ch,scal2s)( *alpha, x[j], alpha_x[j] ); \
		} \
	} \
\
	/* Conjugation of A in the dot products is applied indirectly: flip    */ \
	/* the effective conjugation of w here, then conjugate the finished    */ \
	/* dot products after the sweep.                                       */ \
	conj_t conjw_eff = conjw; \
	if ( bli_is_conj( conjat ) ) { bli_toggle_conj( &conjw_eff ); } \
\
	/* Single sweep over A, specialized on (conja, conjw_eff). Each inner  */ \
	/* iteration feeds a(i,j) into both the dot product for column j and   */ \
	/* the update of z[i].                                                 */ \
	if ( bli_is_noconj( conja ) ) \
	{ \
		if ( bli_is_noconj( conjw_eff ) ) \
		{ \
			PRAGMA_SIMD \
			for ( dim_t i = 0; i < m; ++i ) \
			for ( dim_t j = 0; j < ff; ++j ) \
			{ \
				PASTEMAC(ch,axpys)( a[i + j*lda], w[i], atw[j] ); \
				PASTEMAC(ch,axpys)( alpha_x[j], a[i + j*lda], z[i] ); \
			} \
		} \
		else \
		{ \
			PRAGMA_SIMD \
			for ( dim_t i = 0; i < m; ++i ) \
			for ( dim_t j = 0; j < ff; ++j ) \
			{ \
				PASTEMAC(ch,axpyjs)( a[i + j*lda], w[i], atw[j] ); \
				PASTEMAC(ch,axpys)( alpha_x[j], a[i + j*lda], z[i] ); \
			} \
		} \
	} \
	else \
	{ \
		if ( bli_is_noconj( conjw_eff ) ) \
		{ \
			PRAGMA_SIMD \
			for ( dim_t i = 0; i < m; ++i ) \
			for ( dim_t j = 0; j < ff; ++j ) \
			{ \
				PASTEMAC(ch,axpys)( a[i + j*lda], w[i], atw[j] ); \
				PASTEMAC(ch,axpyjs)( alpha_x[j], a[i + j*lda], z[i] ); \
			} \
		} \
		else \
		{ \
			PRAGMA_SIMD \
			for ( dim_t i = 0; i < m; ++i ) \
			for ( dim_t j = 0; j < ff; ++j ) \
			{ \
				PASTEMAC(ch,axpyjs)( a[i + j*lda], w[i], atw[j] ); \
				PASTEMAC(ch,axpyjs)( alpha_x[j], a[i + j*lda], z[i] ); \
			} \
		} \
	} \
\
	/* Finish the dot products (undoing the conjugation trick above when   */ \
	/* conjat is set) and accumulate alpha times each one into y.          */ \
	for ( dim_t j = 0; j < ff; ++j ) \
	{ \
		if ( bli_is_conj( conjat ) ) { PASTEMAC(ch,conjs)( atw[j] ); } \
\
		PASTEMAC(ch,axpys)( *alpha, atw[j], y[j] ); \
	} \
}
201 
// Instantiate the kernel once per basic datatype (float, double, scomplex,
// dcomplex). Each instantiation passes 4 as the ff macro argument, i.e. the
// column count that the fused code path in GENTFUNC is specialized for.
// The generic insertion macro is left commented out in favor of this explicit
// list. NOTE(review): presumably because it has no way to supply the extra ff
// argument -- confirm against the macro's definition.
//INSERT_GENTFUNC_BASIC2( dotxaxpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
GENTFUNC( float,    s, dotxaxpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4 )
GENTFUNC( double,   d, dotxaxpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4 )
GENTFUNC( scomplex, c, dotxaxpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4 )
GENTFUNC( dcomplex, z, dotxaxpyf, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4 )
207