1 /**************************************************************************************************
2 *                                                                                                 *
3 * This file is part of BLASFEO.                                                                   *
4 *                                                                                                 *
5 * BLASFEO -- BLAS For Embedded Optimization.                                                      *
6 * Copyright (C) 2019 by Gianluca Frison.                                                          *
7 * Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
8 * All rights reserved.                                                                            *
9 *                                                                                                 *
10 * The 2-Clause BSD License                                                                        *
11 *                                                                                                 *
12 * Redistribution and use in source and binary forms, with or without                              *
13 * modification, are permitted provided that the following conditions are met:                     *
14 *                                                                                                 *
15 * 1. Redistributions of source code must retain the above copyright notice, this                  *
16 *    list of conditions and the following disclaimer.                                             *
17 * 2. Redistributions in binary form must reproduce the above copyright notice,                    *
18 *    this list of conditions and the following disclaimer in the documentation                    *
19 *    and/or other materials provided with the distribution.                                       *
20 *                                                                                                 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND                 *
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED                   *
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE                          *
24 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR                 *
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES                  *
26 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;                    *
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND                     *
28 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT                      *
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS                   *
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                                    *
31 *                                                                                                 *
32 * Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de                             *
33 *                                                                                                 *
34 **************************************************************************************************/
35 
36 
37 
38 // transposed of general matrices, read along panels, write across panels
kernel_sgetr_4_lib4(int tri,int kmax,int kna,float alpha,float * A,float * C,int sdc)39 void kernel_sgetr_4_lib4(int tri, int kmax, int kna, float alpha, float *A, float *C, int sdc)
40 	{
41 
42 	if(tri==1)
43 		{
44 		// A is lower triangular, C is upper triangular
45 		// kmax+1 4-wide + end 3x3 triangle
46 
47 		kmax += 1;
48 		}
49 
50 	const int bs = 4;
51 
52 	int k;
53 
54 	k = 0;
55 
56 	if(kmax<kna)
57 		goto cleanup_loop;
58 
59 	if(kna>0)
60 		{
61 		for( ; k<kna; k++)
62 			{
63 			C[0+bs*0] = alpha * A[0+bs*0];
64 			C[0+bs*1] = alpha * A[1+bs*0];
65 			C[0+bs*2] = alpha * A[2+bs*0];
66 			C[0+bs*3] = alpha * A[3+bs*0];
67 
68 			C += 1;
69 			A += bs;
70 			}
71 		C += bs*(sdc-1);
72 		}
73 
74 	for( ; k<kmax-3; k+=4)
75 		{
76 		C[0+bs*0] = alpha * A[0+bs*0];
77 		C[0+bs*1] = alpha * A[1+bs*0];
78 		C[0+bs*2] = alpha * A[2+bs*0];
79 		C[0+bs*3] = alpha * A[3+bs*0];
80 
81 		C[1+bs*0] = alpha * A[0+bs*1];
82 		C[1+bs*1] = alpha * A[1+bs*1];
83 		C[1+bs*2] = alpha * A[2+bs*1];
84 		C[1+bs*3] = alpha * A[3+bs*1];
85 
86 		C[2+bs*0] = alpha * A[0+bs*2];
87 		C[2+bs*1] = alpha * A[1+bs*2];
88 		C[2+bs*2] = alpha * A[2+bs*2];
89 		C[2+bs*3] = alpha * A[3+bs*2];
90 
91 		C[3+bs*0] = alpha * A[0+bs*3];
92 		C[3+bs*1] = alpha * A[1+bs*3];
93 		C[3+bs*2] = alpha * A[2+bs*3];
94 		C[3+bs*3] = alpha * A[3+bs*3];
95 
96 		C += bs*sdc;
97 		A += bs*bs;
98 		}
99 
100 	cleanup_loop:
101 
102 	for( ; k<kmax; k++)
103 		{
104 		C[0+bs*0] = alpha * A[0+bs*0];
105 		C[0+bs*1] = alpha * A[1+bs*0];
106 		C[0+bs*2] = alpha * A[2+bs*0];
107 		C[0+bs*3] = alpha * A[3+bs*0];
108 
109 		C += 1;
110 		A += bs;
111 		}
112 
113 	if(tri==1)
114 		{
115 		// end 3x3 triangle
116 		kna = (bs-(bs-kna+kmax)%bs)%bs;
117 
118 		if(kna==1)
119 			{
120 			C[0+bs*1] = alpha * A[1+bs*0];
121 			C[0+bs*2] = alpha * A[2+bs*0];
122 			C[0+bs*3] = alpha * A[3+bs*0];
123 			C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
124 			C[1+bs*(sdc+2)] = alpha * A[3+bs*1];
125 			C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
126 			}
127 		else if(kna==2)
128 			{
129 			C[0+bs*1] = alpha * A[1+bs*0];
130 			C[0+bs*2] = alpha * A[2+bs*0];
131 			C[0+bs*3] = alpha * A[3+bs*0];
132 			C[1+bs*2] = alpha * A[2+bs*1];
133 			C[1+bs*3] = alpha * A[3+bs*1];
134 			C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
135 			}
136 		else
137 			{
138 			C[0+bs*1] = alpha * A[1+bs*0];
139 			C[0+bs*2] = alpha * A[2+bs*0];
140 			C[0+bs*3] = alpha * A[3+bs*0];
141 			C[1+bs*2] = alpha * A[2+bs*1];
142 			C[1+bs*3] = alpha * A[3+bs*1];
143 			C[2+bs*3] = alpha * A[3+bs*2];
144 			}
145 		}
146 
147 	}
148 
149 
150 
151 // transposed of general matrices, read along panels, write across panels
kernel_sgetr_3_lib4(int tri,int kmax,int kna,float alpha,float * A,float * C,int sdc)152 void kernel_sgetr_3_lib4(int tri, int kmax, int kna, float alpha, float *A, float *C, int sdc)
153 	{
154 
155 	if(tri==1)
156 		{
157 		// A is lower triangular, C is upper triangular
158 		// kmax+1 3-wide + end 2x2 triangle
159 
160 		kmax += 1;
161 		}
162 
163 	const int bs = 4;
164 
165 	int k;
166 
167 	k = 0;
168 
169 	if(kmax<kna)
170 		goto cleanup_loop;
171 
172 	if(kna>0)
173 		{
174 		for( ; k<kna; k++)
175 			{
176 			C[0+bs*0] = alpha * A[0+bs*0];
177 			C[0+bs*1] = alpha * A[1+bs*0];
178 			C[0+bs*2] = alpha * A[2+bs*0];
179 
180 			C += 1;
181 			A += bs;
182 			}
183 		C += bs*(sdc-1);
184 		}
185 
186 	for( ; k<kmax-3; k+=4)
187 		{
188 		C[0+bs*0] = alpha * A[0+bs*0];
189 		C[0+bs*1] = alpha * A[1+bs*0];
190 		C[0+bs*2] = alpha * A[2+bs*0];
191 
192 		C[1+bs*0] = alpha * A[0+bs*1];
193 		C[1+bs*1] = alpha * A[1+bs*1];
194 		C[1+bs*2] = alpha * A[2+bs*1];
195 
196 		C[2+bs*0] = alpha * A[0+bs*2];
197 		C[2+bs*1] = alpha * A[1+bs*2];
198 		C[2+bs*2] = alpha * A[2+bs*2];
199 
200 		C[3+bs*0] = alpha * A[0+bs*3];
201 		C[3+bs*1] = alpha * A[1+bs*3];
202 		C[3+bs*2] = alpha * A[2+bs*3];
203 
204 		C += bs*sdc;
205 		A += bs*bs;
206 		}
207 
208 	cleanup_loop:
209 
210 	for( ; k<kmax; k++)
211 		{
212 		C[0+bs*0] = alpha * A[0+bs*0];
213 		C[0+bs*1] = alpha * A[1+bs*0];
214 		C[0+bs*2] = alpha * A[2+bs*0];
215 
216 		C += 1;
217 		A += bs;
218 		}
219 
220 	if(tri==1)
221 		{
222 		// end 2x2 triangle
223 		kna = (bs-(bs-kna+kmax)%bs)%bs;
224 
225 		if(kna==1)
226 			{
227 			C[0+bs*1] = alpha * A[1+bs*0];
228 			C[0+bs*2] = alpha * A[2+bs*0];
229 			C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
230 			}
231 		else
232 			{
233 			C[0+bs*1] = alpha * A[1+bs*0];
234 			C[0+bs*2] = alpha * A[2+bs*0];
235 			C[1+bs*2] = alpha * A[2+bs*1];
236 			}
237 		}
238 
239 	}
240 
241 
242 
243 // transposed of general matrices, read along panels, write across panels
kernel_sgetr_2_lib4(int tri,int kmax,int kna,float alpha,float * A,float * C,int sdc)244 void kernel_sgetr_2_lib4(int tri, int kmax, int kna, float alpha, float *A, float *C, int sdc)
245 	{
246 
247 	if(tri==1)
248 		{
249 		// A is lower triangular, C is upper triangular
250 		// kmax+1 2-wide + end 1x1 triangle
251 
252 		kmax += 1;
253 		}
254 
255 	const int bs = 4;
256 
257 	int k;
258 
259 	k = 0;
260 
261 	if(kmax<kna)
262 		goto cleanup_loop;
263 
264 	if(kna>0)
265 		{
266 		for( ; k<kna; k++)
267 			{
268 			C[0+bs*0] = alpha * A[0+bs*0];
269 			C[0+bs*1] = alpha * A[1+bs*0];
270 
271 			C += 1;
272 			A += bs;
273 			}
274 		C += bs*(sdc-1);
275 		}
276 
277 	for( ; k<kmax-3; k+=4)
278 		{
279 		C[0+bs*0] = alpha * A[0+bs*0];
280 		C[0+bs*1] = alpha * A[1+bs*0];
281 
282 		C[1+bs*0] = alpha * A[0+bs*1];
283 		C[1+bs*1] = alpha * A[1+bs*1];
284 
285 		C[2+bs*0] = alpha * A[0+bs*2];
286 		C[2+bs*1] = alpha * A[1+bs*2];
287 
288 		C[3+bs*0] = alpha * A[0+bs*3];
289 		C[3+bs*1] = alpha * A[1+bs*3];
290 
291 		C += bs*sdc;
292 		A += bs*bs;
293 		}
294 
295 	cleanup_loop:
296 
297 	for( ; k<kmax; k++)
298 		{
299 		C[0+bs*0] = alpha * A[0+bs*0];
300 		C[0+bs*1] = alpha * A[1+bs*0];
301 
302 		C += 1;
303 		A += bs;
304 		}
305 
306 	if(tri==1)
307 		{
308 		// end 1x1 triangle
309 		C[0+bs*1] = alpha * A[1+bs*0];
310 		}
311 
312 	}
313 
314 
315 
316 // transposed of general matrices, read along panels, write across panels
kernel_sgetr_1_lib4(int tri,int kmax,int kna,float alpha,float * A,float * C,int sdc)317 void kernel_sgetr_1_lib4(int tri, int kmax, int kna, float alpha, float *A, float *C, int sdc)
318 	{
319 
320 	if(tri==1)
321 		{
322 		// A is lower triangular, C is upper triangular
323 		// kmax+1 1-wide
324 
325 		kmax += 1;
326 		}
327 
328 	const int bs = 4;
329 
330 	int k;
331 
332 	k = 0;
333 
334 	if(kmax<kna)
335 		goto cleanup_loop;
336 
337 	if(kna>0)
338 		{
339 		for( ; k<kna; k++)
340 			{
341 			C[0+bs*0] = alpha * A[0+bs*0];
342 
343 			C += 1;
344 			A += bs;
345 			}
346 		C += bs*(sdc-1);
347 		}
348 
349 	for( ; k<kmax-3; k+=4)
350 		{
351 		C[0+bs*0] = alpha * A[0+bs*0];
352 
353 		C[1+bs*0] = alpha * A[0+bs*1];
354 
355 		C[2+bs*0] = alpha * A[0+bs*2];
356 
357 		C[3+bs*0] = alpha * A[0+bs*3];
358 
359 		C += bs*sdc;
360 		A += bs*bs;
361 		}
362 
363 	cleanup_loop:
364 
365 	for( ; k<kmax; k++)
366 		{
367 		C[0+bs*0] = alpha * A[0+bs*0];
368 
369 		C += 1;
370 		A += bs;
371 		}
372 
373 	}
374 
375 
376 
377 
378