1 /*
2 
3     Copyright (C) 2014, The University of Texas at Austin
4 
5     This file is part of libflame and is available under the 3-Clause
6     BSD license, which can be found in the LICENSE file at the top-level
7     directory, or at http://opensource.org/licenses/BSD-3-Clause
8 
9 */
10 
#include "blis1.h"

#include <stddef.h> // ptrdiff_t, used for overflow-safe pointer offsets
12 
/*
 * bl1_sscalm
 *
 * Scales the m-by-n float matrix A in place by *alpha, issuing one
 * bl1_sscal call per column (or per row when A is stored by rows), or a
 * single call when A is a vector.
 *
 *   conj   conjugation flag handed to bl1_scopys when alpha is copied into
 *          a local; presumably has no effect on a real scalar.
 *   m, n   number of rows and columns of A.
 *   alpha  pointer to the scalar.
 *   a      pointer to the first element of A.
 *   a_rs   row stride of A (step between consecutive elements of a column).
 *   a_cs   column stride of A (step between consecutive columns).
 *
 * Returns immediately when bl1_zero_dim2( m, n ) holds, or when bl1_seq1
 * reports that alpha is (presumably) equal to one.
 */
void bl1_sscalm( conj1_t conj, int m, int n, float* alpha, float* a, int a_rs, int a_cs )
{
	float     alpha_conj;
	float*    a_begin;
	int       lda, inca;
	int       n_iter;
	int       n_elem;
	int       j;

	// Return early if possible.
	if ( bl1_zero_dim2( m, n ) ) return;
	if ( bl1_seq1( alpha ) ) return;

	// Handle cases where A is a vector to ensure that the underlying scal
	// gets invoked only once.
	if ( bl1_is_vector( m, n ) )
	{
		// Initialize with values appropriate for a vector.
		n_iter = 1;
		n_elem = bl1_vector_dim( m, n );
		lda    = 1; // multiplied by zero when n_iter == 1; not needed.
		inca   = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, a_rs, a_cs );
	}
	else // matrix case
	{
		// Initialize with optimal values for column-major storage.
		n_iter = n;
		n_elem = m;
		lda    = a_cs;
		inca   = a_rs;

		// An optimization: if A is row-major, then let's access the matrix
		// by rows instead of by columns to increase spatial locality.
		if ( bl1_is_row_storage( a_rs, a_cs ) )
		{
			bl1_swap_ints( n_iter, n_elem );
			bl1_swap_ints( lda, inca );
		}
	}

	// Work from a local copy of alpha; bl1_sscal below is handed this copy.
	bl1_scopys( conj, alpha, &alpha_conj );

	for ( j = 0; j < n_iter; j++ )
	{
		// Widen before multiplying: j*lda evaluated in int overflows once
		// the element offset exceeds INT_MAX, which can happen for large
		// matrices that are still addressable on 64-bit platforms.
		a_begin = a + ( ptrdiff_t )j * lda;

		bl1_sscal( n_elem,
		           &alpha_conj,
		           a_begin, inca );
	}
}
64 
/*
 * bl1_dscalm
 *
 * Scales the m-by-n double matrix A in place by *alpha, issuing one
 * bl1_dscal call per column (or per row when A is stored by rows), or a
 * single call when A is a vector.
 *
 *   conj   conjugation flag handed to bl1_dcopys when alpha is copied into
 *          a local; presumably has no effect on a real scalar.
 *   m, n   number of rows and columns of A.
 *   alpha  pointer to the scalar.
 *   a      pointer to the first element of A.
 *   a_rs   row stride of A (step between consecutive elements of a column).
 *   a_cs   column stride of A (step between consecutive columns).
 *
 * Returns immediately when bl1_zero_dim2( m, n ) holds, or when bl1_deq1
 * reports that alpha is (presumably) equal to one.
 */
void bl1_dscalm( conj1_t conj, int m, int n, double* alpha, double* a, int a_rs, int a_cs )
{
	double    alpha_conj;
	double*   a_begin;
	int       lda, inca;
	int       n_iter;
	int       n_elem;
	int       j;

	// Return early if possible.
	if ( bl1_zero_dim2( m, n ) ) return;
	if ( bl1_deq1( alpha ) ) return;

	// Handle cases where A is a vector to ensure that the underlying scal
	// gets invoked only once.
	if ( bl1_is_vector( m, n ) )
	{
		// Initialize with values appropriate for a vector.
		n_iter = 1;
		n_elem = bl1_vector_dim( m, n );
		lda    = 1; // multiplied by zero when n_iter == 1; not needed.
		inca   = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, a_rs, a_cs );
	}
	else // matrix case
	{
		// Initialize with optimal values for column-major storage.
		n_iter = n;
		n_elem = m;
		lda    = a_cs;
		inca   = a_rs;

		// An optimization: if A is row-major, then let's access the matrix
		// by rows instead of by columns to increase spatial locality.
		if ( bl1_is_row_storage( a_rs, a_cs ) )
		{
			bl1_swap_ints( n_iter, n_elem );
			bl1_swap_ints( lda, inca );
		}
	}

	// Work from a local copy of alpha; bl1_dscal below is handed this copy.
	bl1_dcopys( conj, alpha, &alpha_conj );

	for ( j = 0; j < n_iter; j++ )
	{
		// Widen before multiplying: j*lda evaluated in int overflows once
		// the element offset exceeds INT_MAX, which can happen for large
		// matrices that are still addressable on 64-bit platforms.
		a_begin = a + ( ptrdiff_t )j * lda;

		bl1_dscal( n_elem,
		           &alpha_conj,
		           a_begin, inca );
	}
}
116 
/*
 * bl1_csscalm
 *
 * Scales the m-by-n scomplex matrix A in place by the float scalar *alpha,
 * issuing one bl1_csscal call per column (or per row when A is stored by
 * rows), or a single call when A is a vector.
 *
 *   conj   conjugation flag handed to bl1_scopys when alpha is copied into
 *          a local; presumably has no effect on a real scalar.
 *   m, n   number of rows and columns of A.
 *   alpha  pointer to the real scalar.
 *   a      pointer to the first element of A.
 *   a_rs   row stride of A (step between consecutive elements of a column).
 *   a_cs   column stride of A (step between consecutive columns).
 *
 * Returns immediately when bl1_zero_dim2( m, n ) holds, or when bl1_seq1
 * reports that alpha is (presumably) equal to one.
 */
void bl1_csscalm( conj1_t conj, int m, int n, float* alpha, scomplex* a, int a_rs, int a_cs )
{
	float     alpha_conj;
	scomplex* a_begin;
	int       lda, inca;
	int       n_iter;
	int       n_elem;
	int       j;

	// Return early if possible.
	if ( bl1_zero_dim2( m, n ) ) return;
	if ( bl1_seq1( alpha ) ) return;

	// Handle cases where A is a vector to ensure that the underlying scal
	// gets invoked only once.
	if ( bl1_is_vector( m, n ) )
	{
		// Initialize with values appropriate for a vector.
		n_iter = 1;
		n_elem = bl1_vector_dim( m, n );
		lda    = 1; // multiplied by zero when n_iter == 1; not needed.
		inca   = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, a_rs, a_cs );
	}
	else // matrix case
	{
		// Initialize with optimal values for column-major storage.
		n_iter = n;
		n_elem = m;
		lda    = a_cs;
		inca   = a_rs;

		// An optimization: if A is row-major, then let's access the matrix
		// by rows instead of by columns to increase spatial locality.
		if ( bl1_is_row_storage( a_rs, a_cs ) )
		{
			bl1_swap_ints( n_iter, n_elem );
			bl1_swap_ints( lda, inca );
		}
	}

	// Work from a local copy of alpha; bl1_csscal below is handed this copy.
	bl1_scopys( conj, alpha, &alpha_conj );

	for ( j = 0; j < n_iter; j++ )
	{
		// Widen before multiplying: j*lda evaluated in int overflows once
		// the element offset exceeds INT_MAX, which can happen for large
		// matrices that are still addressable on 64-bit platforms.
		a_begin = a + ( ptrdiff_t )j * lda;

		bl1_csscal( n_elem,
		            &alpha_conj,
		            a_begin, inca );
	}
}
168 
/*
 * bl1_cscalm
 *
 * Scales the m-by-n scomplex matrix A in place by the scomplex scalar
 * *alpha, issuing one bl1_cscal call per column (or per row when A is
 * stored by rows), or a single call when A is a vector.
 *
 *   conj   conjugation flag handed to bl1_ccopys when alpha is copied into
 *          a local; presumably the copy is conjugated when conj asks for it.
 *   m, n   number of rows and columns of A.
 *   alpha  pointer to the complex scalar.
 *   a      pointer to the first element of A.
 *   a_rs   row stride of A (step between consecutive elements of a column).
 *   a_cs   column stride of A (step between consecutive columns).
 *
 * Returns immediately when bl1_zero_dim2( m, n ) holds, or when bl1_ceq1
 * reports that alpha is (presumably) equal to one.
 */
void bl1_cscalm( conj1_t conj, int m, int n, scomplex* alpha, scomplex* a, int a_rs, int a_cs )
{
	scomplex  alpha_conj;
	scomplex* a_begin;
	int       lda, inca;
	int       n_iter;
	int       n_elem;
	int       j;

	// Return early if possible.
	if ( bl1_zero_dim2( m, n ) ) return;
	if ( bl1_ceq1( alpha ) ) return;

	// Handle cases where A is a vector to ensure that the underlying scal
	// gets invoked only once.
	if ( bl1_is_vector( m, n ) )
	{
		// Initialize with values appropriate for a vector.
		n_iter = 1;
		n_elem = bl1_vector_dim( m, n );
		lda    = 1; // multiplied by zero when n_iter == 1; not needed.
		inca   = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, a_rs, a_cs );
	}
	else // matrix case
	{
		// Initialize with optimal values for column-major storage.
		n_iter = n;
		n_elem = m;
		lda    = a_cs;
		inca   = a_rs;

		// An optimization: if A is row-major, then let's access the matrix
		// by rows instead of by columns to increase spatial locality.
		if ( bl1_is_row_storage( a_rs, a_cs ) )
		{
			bl1_swap_ints( n_iter, n_elem );
			bl1_swap_ints( lda, inca );
		}
	}

	// Work from a local copy of alpha; bl1_cscal below is handed this copy.
	bl1_ccopys( conj, alpha, &alpha_conj );

	for ( j = 0; j < n_iter; j++ )
	{
		// Widen before multiplying: j*lda evaluated in int overflows once
		// the element offset exceeds INT_MAX, which can happen for large
		// matrices that are still addressable on 64-bit platforms.
		a_begin = a + ( ptrdiff_t )j * lda;

		bl1_cscal( n_elem,
		           &alpha_conj,
		           a_begin, inca );
	}
}
220 
/*
 * bl1_zdscalm
 *
 * Scales the m-by-n dcomplex matrix A in place by the double scalar *alpha,
 * issuing one bl1_zdscal call per column (or per row when A is stored by
 * rows), or a single call when A is a vector.
 *
 *   conj   conjugation flag handed to bl1_dcopys when alpha is copied into
 *          a local; presumably has no effect on a real scalar.
 *   m, n   number of rows and columns of A.
 *   alpha  pointer to the real scalar.
 *   a      pointer to the first element of A.
 *   a_rs   row stride of A (step between consecutive elements of a column).
 *   a_cs   column stride of A (step between consecutive columns).
 *
 * Returns immediately when bl1_zero_dim2( m, n ) holds, or when bl1_deq1
 * reports that alpha is (presumably) equal to one.
 */
void bl1_zdscalm( conj1_t conj, int m, int n, double* alpha, dcomplex* a, int a_rs, int a_cs )
{
	double    alpha_conj;
	dcomplex* a_begin;
	int       lda, inca;
	int       n_iter;
	int       n_elem;
	int       j;

	// Return early if possible.
	if ( bl1_zero_dim2( m, n ) ) return;
	if ( bl1_deq1( alpha ) ) return;

	// Handle cases where A is a vector to ensure that the underlying scal
	// gets invoked only once.
	if ( bl1_is_vector( m, n ) )
	{
		// Initialize with values appropriate for a vector.
		n_iter = 1;
		n_elem = bl1_vector_dim( m, n );
		lda    = 1; // multiplied by zero when n_iter == 1; not needed.
		inca   = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, a_rs, a_cs );
	}
	else // matrix case
	{
		// Initialize with optimal values for column-major storage.
		n_iter = n;
		n_elem = m;
		lda    = a_cs;
		inca   = a_rs;

		// An optimization: if A is row-major, then let's access the matrix
		// by rows instead of by columns to increase spatial locality.
		if ( bl1_is_row_storage( a_rs, a_cs ) )
		{
			bl1_swap_ints( n_iter, n_elem );
			bl1_swap_ints( lda, inca );
		}
	}

	// Work from a local copy of alpha; bl1_zdscal below is handed this copy.
	bl1_dcopys( conj, alpha, &alpha_conj );

	for ( j = 0; j < n_iter; j++ )
	{
		// Widen before multiplying: j*lda evaluated in int overflows once
		// the element offset exceeds INT_MAX, which can happen for large
		// matrices that are still addressable on 64-bit platforms.
		a_begin = a + ( ptrdiff_t )j * lda;

		bl1_zdscal( n_elem,
		            &alpha_conj,
		            a_begin, inca );
	}
}
272 
/*
 * bl1_zscalm
 *
 * Scales the m-by-n dcomplex matrix A in place by the dcomplex scalar
 * *alpha, issuing one bl1_zscal call per column (or per row when A is
 * stored by rows), or a single call when A is a vector.
 *
 *   conj   conjugation flag handed to bl1_zcopys when alpha is copied into
 *          a local; presumably the copy is conjugated when conj asks for it.
 *   m, n   number of rows and columns of A.
 *   alpha  pointer to the complex scalar.
 *   a      pointer to the first element of A.
 *   a_rs   row stride of A (step between consecutive elements of a column).
 *   a_cs   column stride of A (step between consecutive columns).
 *
 * Returns immediately when bl1_zero_dim2( m, n ) holds, or when bl1_zeq1
 * reports that alpha is (presumably) equal to one.
 */
void bl1_zscalm( conj1_t conj, int m, int n, dcomplex* alpha, dcomplex* a, int a_rs, int a_cs )
{
	dcomplex  alpha_conj;
	dcomplex* a_begin;
	int       lda, inca;
	int       n_iter;
	int       n_elem;
	int       j;

	// Return early if possible.
	if ( bl1_zero_dim2( m, n ) ) return;
	if ( bl1_zeq1( alpha ) ) return;

	// Handle cases where A is a vector to ensure that the underlying scal
	// gets invoked only once.
	if ( bl1_is_vector( m, n ) )
	{
		// Initialize with values appropriate for a vector.
		n_iter = 1;
		n_elem = bl1_vector_dim( m, n );
		lda    = 1; // multiplied by zero when n_iter == 1; not needed.
		inca   = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, a_rs, a_cs );
	}
	else // matrix case
	{
		// Initialize with optimal values for column-major storage.
		n_iter = n;
		n_elem = m;
		lda    = a_cs;
		inca   = a_rs;

		// An optimization: if A is row-major, then let's access the matrix
		// by rows instead of by columns to increase spatial locality.
		if ( bl1_is_row_storage( a_rs, a_cs ) )
		{
			bl1_swap_ints( n_iter, n_elem );
			bl1_swap_ints( lda, inca );
		}
	}

	// Work from a local copy of alpha; bl1_zscal below is handed this copy.
	bl1_zcopys( conj, alpha, &alpha_conj );

	for ( j = 0; j < n_iter; j++ )
	{
		// Widen before multiplying: j*lda evaluated in int overflows once
		// the element offset exceeds INT_MAX, which can happen for large
		// matrices that are still addressable on 64-bit platforms.
		a_begin = a + ( ptrdiff_t )j * lda;

		bl1_zscal( n_elem,
		           &alpha_conj,
		           a_begin, inca );
	}
}
324 
325