1 /**************************************************************************************************
2 * *
3 * This file is part of BLASFEO. *
4 * *
5 * BLASFEO -- BLAS For Embedded Optimization. *
6 * Copyright (C) 2019 by Gianluca Frison. *
7 * Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
8 * All rights reserved. *
9 * *
10 * The 2-Clause BSD License *
11 * *
12 * Redistribution and use in source and binary forms, with or without *
13 * modification, are permitted provided that the following conditions are met: *
14 * *
15 * 1. Redistributions of source code must retain the above copyright notice, this *
16 * list of conditions and the following disclaimer. *
17 * 2. Redistributions in binary form must reproduce the above copyright notice, *
18 * this list of conditions and the following disclaimer in the documentation *
19 * and/or other materials provided with the distribution. *
20 * *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND *
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED *
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE *
24 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR *
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
26 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; *
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND *
28 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT *
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
31 * *
32 * Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de *
33 * *
34 **************************************************************************************************/
35
36
37
38 // transposed of general matrices, read along panels, write across panels
kernel_sgetr_4_lib4(int tri,int kmax,int kna,float alpha,float * A,float * C,int sdc)39 void kernel_sgetr_4_lib4(int tri, int kmax, int kna, float alpha, float *A, float *C, int sdc)
40 {
41
42 if(tri==1)
43 {
44 // A is lower triangular, C is upper triangular
45 // kmax+1 4-wide + end 3x3 triangle
46
47 kmax += 1;
48 }
49
50 const int bs = 4;
51
52 int k;
53
54 k = 0;
55
56 if(kmax<kna)
57 goto cleanup_loop;
58
59 if(kna>0)
60 {
61 for( ; k<kna; k++)
62 {
63 C[0+bs*0] = alpha * A[0+bs*0];
64 C[0+bs*1] = alpha * A[1+bs*0];
65 C[0+bs*2] = alpha * A[2+bs*0];
66 C[0+bs*3] = alpha * A[3+bs*0];
67
68 C += 1;
69 A += bs;
70 }
71 C += bs*(sdc-1);
72 }
73
74 for( ; k<kmax-3; k+=4)
75 {
76 C[0+bs*0] = alpha * A[0+bs*0];
77 C[0+bs*1] = alpha * A[1+bs*0];
78 C[0+bs*2] = alpha * A[2+bs*0];
79 C[0+bs*3] = alpha * A[3+bs*0];
80
81 C[1+bs*0] = alpha * A[0+bs*1];
82 C[1+bs*1] = alpha * A[1+bs*1];
83 C[1+bs*2] = alpha * A[2+bs*1];
84 C[1+bs*3] = alpha * A[3+bs*1];
85
86 C[2+bs*0] = alpha * A[0+bs*2];
87 C[2+bs*1] = alpha * A[1+bs*2];
88 C[2+bs*2] = alpha * A[2+bs*2];
89 C[2+bs*3] = alpha * A[3+bs*2];
90
91 C[3+bs*0] = alpha * A[0+bs*3];
92 C[3+bs*1] = alpha * A[1+bs*3];
93 C[3+bs*2] = alpha * A[2+bs*3];
94 C[3+bs*3] = alpha * A[3+bs*3];
95
96 C += bs*sdc;
97 A += bs*bs;
98 }
99
100 cleanup_loop:
101
102 for( ; k<kmax; k++)
103 {
104 C[0+bs*0] = alpha * A[0+bs*0];
105 C[0+bs*1] = alpha * A[1+bs*0];
106 C[0+bs*2] = alpha * A[2+bs*0];
107 C[0+bs*3] = alpha * A[3+bs*0];
108
109 C += 1;
110 A += bs;
111 }
112
113 if(tri==1)
114 {
115 // end 3x3 triangle
116 kna = (bs-(bs-kna+kmax)%bs)%bs;
117
118 if(kna==1)
119 {
120 C[0+bs*1] = alpha * A[1+bs*0];
121 C[0+bs*2] = alpha * A[2+bs*0];
122 C[0+bs*3] = alpha * A[3+bs*0];
123 C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
124 C[1+bs*(sdc+2)] = alpha * A[3+bs*1];
125 C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
126 }
127 else if(kna==2)
128 {
129 C[0+bs*1] = alpha * A[1+bs*0];
130 C[0+bs*2] = alpha * A[2+bs*0];
131 C[0+bs*3] = alpha * A[3+bs*0];
132 C[1+bs*2] = alpha * A[2+bs*1];
133 C[1+bs*3] = alpha * A[3+bs*1];
134 C[2+bs*(sdc+2)] = alpha * A[3+bs*2];
135 }
136 else
137 {
138 C[0+bs*1] = alpha * A[1+bs*0];
139 C[0+bs*2] = alpha * A[2+bs*0];
140 C[0+bs*3] = alpha * A[3+bs*0];
141 C[1+bs*2] = alpha * A[2+bs*1];
142 C[1+bs*3] = alpha * A[3+bs*1];
143 C[2+bs*3] = alpha * A[3+bs*2];
144 }
145 }
146
147 }
148
149
150
151 // transposed of general matrices, read along panels, write across panels
kernel_sgetr_3_lib4(int tri,int kmax,int kna,float alpha,float * A,float * C,int sdc)152 void kernel_sgetr_3_lib4(int tri, int kmax, int kna, float alpha, float *A, float *C, int sdc)
153 {
154
155 if(tri==1)
156 {
157 // A is lower triangular, C is upper triangular
158 // kmax+1 3-wide + end 2x2 triangle
159
160 kmax += 1;
161 }
162
163 const int bs = 4;
164
165 int k;
166
167 k = 0;
168
169 if(kmax<kna)
170 goto cleanup_loop;
171
172 if(kna>0)
173 {
174 for( ; k<kna; k++)
175 {
176 C[0+bs*0] = alpha * A[0+bs*0];
177 C[0+bs*1] = alpha * A[1+bs*0];
178 C[0+bs*2] = alpha * A[2+bs*0];
179
180 C += 1;
181 A += bs;
182 }
183 C += bs*(sdc-1);
184 }
185
186 for( ; k<kmax-3; k+=4)
187 {
188 C[0+bs*0] = alpha * A[0+bs*0];
189 C[0+bs*1] = alpha * A[1+bs*0];
190 C[0+bs*2] = alpha * A[2+bs*0];
191
192 C[1+bs*0] = alpha * A[0+bs*1];
193 C[1+bs*1] = alpha * A[1+bs*1];
194 C[1+bs*2] = alpha * A[2+bs*1];
195
196 C[2+bs*0] = alpha * A[0+bs*2];
197 C[2+bs*1] = alpha * A[1+bs*2];
198 C[2+bs*2] = alpha * A[2+bs*2];
199
200 C[3+bs*0] = alpha * A[0+bs*3];
201 C[3+bs*1] = alpha * A[1+bs*3];
202 C[3+bs*2] = alpha * A[2+bs*3];
203
204 C += bs*sdc;
205 A += bs*bs;
206 }
207
208 cleanup_loop:
209
210 for( ; k<kmax; k++)
211 {
212 C[0+bs*0] = alpha * A[0+bs*0];
213 C[0+bs*1] = alpha * A[1+bs*0];
214 C[0+bs*2] = alpha * A[2+bs*0];
215
216 C += 1;
217 A += bs;
218 }
219
220 if(tri==1)
221 {
222 // end 2x2 triangle
223 kna = (bs-(bs-kna+kmax)%bs)%bs;
224
225 if(kna==1)
226 {
227 C[0+bs*1] = alpha * A[1+bs*0];
228 C[0+bs*2] = alpha * A[2+bs*0];
229 C[1+bs*(sdc+1)] = alpha * A[2+bs*1];
230 }
231 else
232 {
233 C[0+bs*1] = alpha * A[1+bs*0];
234 C[0+bs*2] = alpha * A[2+bs*0];
235 C[1+bs*2] = alpha * A[2+bs*1];
236 }
237 }
238
239 }
240
241
242
243 // transposed of general matrices, read along panels, write across panels
kernel_sgetr_2_lib4(int tri,int kmax,int kna,float alpha,float * A,float * C,int sdc)244 void kernel_sgetr_2_lib4(int tri, int kmax, int kna, float alpha, float *A, float *C, int sdc)
245 {
246
247 if(tri==1)
248 {
249 // A is lower triangular, C is upper triangular
250 // kmax+1 2-wide + end 1x1 triangle
251
252 kmax += 1;
253 }
254
255 const int bs = 4;
256
257 int k;
258
259 k = 0;
260
261 if(kmax<kna)
262 goto cleanup_loop;
263
264 if(kna>0)
265 {
266 for( ; k<kna; k++)
267 {
268 C[0+bs*0] = alpha * A[0+bs*0];
269 C[0+bs*1] = alpha * A[1+bs*0];
270
271 C += 1;
272 A += bs;
273 }
274 C += bs*(sdc-1);
275 }
276
277 for( ; k<kmax-3; k+=4)
278 {
279 C[0+bs*0] = alpha * A[0+bs*0];
280 C[0+bs*1] = alpha * A[1+bs*0];
281
282 C[1+bs*0] = alpha * A[0+bs*1];
283 C[1+bs*1] = alpha * A[1+bs*1];
284
285 C[2+bs*0] = alpha * A[0+bs*2];
286 C[2+bs*1] = alpha * A[1+bs*2];
287
288 C[3+bs*0] = alpha * A[0+bs*3];
289 C[3+bs*1] = alpha * A[1+bs*3];
290
291 C += bs*sdc;
292 A += bs*bs;
293 }
294
295 cleanup_loop:
296
297 for( ; k<kmax; k++)
298 {
299 C[0+bs*0] = alpha * A[0+bs*0];
300 C[0+bs*1] = alpha * A[1+bs*0];
301
302 C += 1;
303 A += bs;
304 }
305
306 if(tri==1)
307 {
308 // end 1x1 triangle
309 C[0+bs*1] = alpha * A[1+bs*0];
310 }
311
312 }
313
314
315
316 // transposed of general matrices, read along panels, write across panels
kernel_sgetr_1_lib4(int tri,int kmax,int kna,float alpha,float * A,float * C,int sdc)317 void kernel_sgetr_1_lib4(int tri, int kmax, int kna, float alpha, float *A, float *C, int sdc)
318 {
319
320 if(tri==1)
321 {
322 // A is lower triangular, C is upper triangular
323 // kmax+1 1-wide
324
325 kmax += 1;
326 }
327
328 const int bs = 4;
329
330 int k;
331
332 k = 0;
333
334 if(kmax<kna)
335 goto cleanup_loop;
336
337 if(kna>0)
338 {
339 for( ; k<kna; k++)
340 {
341 C[0+bs*0] = alpha * A[0+bs*0];
342
343 C += 1;
344 A += bs;
345 }
346 C += bs*(sdc-1);
347 }
348
349 for( ; k<kmax-3; k+=4)
350 {
351 C[0+bs*0] = alpha * A[0+bs*0];
352
353 C[1+bs*0] = alpha * A[0+bs*1];
354
355 C[2+bs*0] = alpha * A[0+bs*2];
356
357 C[3+bs*0] = alpha * A[0+bs*3];
358
359 C += bs*sdc;
360 A += bs*bs;
361 }
362
363 cleanup_loop:
364
365 for( ; k<kmax; k++)
366 {
367 C[0+bs*0] = alpha * A[0+bs*0];
368
369 C += 1;
370 A += bs;
371 }
372
373 }
374
375
376
377
378