1 /**************************************************************************************************
2 *                                                                                                 *
3 * This file is part of BLASFEO.                                                                   *
4 *                                                                                                 *
5 * BLASFEO -- BLAS For Embedded Optimization.                                                      *
6 * Copyright (C) 2019 by Gianluca Frison.                                                          *
7 * Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
8 * All rights reserved.                                                                            *
9 *                                                                                                 *
10 * The 2-Clause BSD License                                                                        *
11 *                                                                                                 *
12 * Redistribution and use in source and binary forms, with or without                              *
13 * modification, are permitted provided that the following conditions are met:                     *
14 *                                                                                                 *
15 * 1. Redistributions of source code must retain the above copyright notice, this                  *
16 *    list of conditions and the following disclaimer.                                             *
17 * 2. Redistributions in binary form must reproduce the above copyright notice,                    *
18 *    this list of conditions and the following disclaimer in the documentation                    *
19 *    and/or other materials provided with the distribution.                                       *
20 *                                                                                                 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND                 *
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED                   *
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE                          *
24 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR                 *
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES                  *
26 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;                    *
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND                     *
28 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT                      *
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS                   *
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                                    *
31 *                                                                                                 *
32 * Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de                             *
33 *                                                                                                 *
34 **************************************************************************************************/
35 
36 #include <math.h>
37 
38 #include <blasfeo_common.h>
39 #include <blasfeo_s_kernel.h>
40 
41 
42 
43 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
kernel_strmm_nt_ru_8x4_lib8(int kmax,float * alpha,float * A,float * B,float * D)44 void kernel_strmm_nt_ru_8x4_lib8(int kmax, float *alpha, float *A, float *B, float *D)
45 	{
46 
47 	const int bs = 8;
48 
49 	float
50 		a_0, a_1, a_2, a_3,
51 		a_4, a_5, a_6, a_7,
52 		b_0, b_1, b_2, b_3;
53 
54 #if defined(TARGET_GENERIC)
55 	float CC[32] = {0};
56 #else
57 	ALIGNED( float CC[32], 64 ) = {0};
58 #endif
59 
60 	int k;
61 
62 	k = 0;
63 
64 	// k = 0
65 	if(kmax>0)
66 		{
67 		a_0 = A[0];
68 		a_1 = A[1];
69 		a_2 = A[2];
70 		a_3 = A[3];
71 		a_4 = A[4];
72 		a_5 = A[5];
73 		a_6 = A[6];
74 		a_7 = A[7];
75 
76 		b_0 = B[0];
77 
78 		CC[0+bs*0] += a_0 * b_0;
79 		CC[1+bs*0] += a_1 * b_0;
80 		CC[2+bs*0] += a_2 * b_0;
81 		CC[3+bs*0] += a_3 * b_0;
82 		CC[4+bs*0] += a_4 * b_0;
83 		CC[5+bs*0] += a_5 * b_0;
84 		CC[6+bs*0] += a_6 * b_0;
85 		CC[7+bs*0] += a_7 * b_0;
86 
87 		A += bs;
88 		B += bs;
89 		k++;
90 		}
91 
92 	// k = 1
93 	if(kmax>1)
94 		{
95 		a_0 = A[0];
96 		a_1 = A[1];
97 		a_2 = A[2];
98 		a_3 = A[3];
99 		a_4 = A[4];
100 		a_5 = A[5];
101 		a_6 = A[6];
102 		a_7 = A[7];
103 
104 		b_0 = B[0];
105 		b_1 = B[1];
106 
107 		CC[0+bs*0] += a_0 * b_0;
108 		CC[1+bs*0] += a_1 * b_0;
109 		CC[2+bs*0] += a_2 * b_0;
110 		CC[3+bs*0] += a_3 * b_0;
111 		CC[4+bs*0] += a_4 * b_0;
112 		CC[5+bs*0] += a_5 * b_0;
113 		CC[6+bs*0] += a_6 * b_0;
114 		CC[7+bs*0] += a_7 * b_0;
115 
116 		CC[0+bs*1] += a_0 * b_1;
117 		CC[1+bs*1] += a_1 * b_1;
118 		CC[2+bs*1] += a_2 * b_1;
119 		CC[3+bs*1] += a_3 * b_1;
120 		CC[4+bs*1] += a_4 * b_1;
121 		CC[5+bs*1] += a_5 * b_1;
122 		CC[6+bs*1] += a_6 * b_1;
123 		CC[7+bs*1] += a_7 * b_1;
124 
125 		A += bs;
126 		B += bs;
127 		k++;
128 		}
129 
130 	// k = 2
131 	if(kmax>2)
132 		{
133 		a_0 = A[0];
134 		a_1 = A[1];
135 		a_2 = A[2];
136 		a_3 = A[3];
137 		a_4 = A[4];
138 		a_5 = A[5];
139 		a_6 = A[6];
140 		a_7 = A[7];
141 
142 		b_0 = B[0];
143 		b_1 = B[1];
144 		b_2 = B[2];
145 
146 		CC[0+bs*0] += a_0 * b_0;
147 		CC[1+bs*0] += a_1 * b_0;
148 		CC[2+bs*0] += a_2 * b_0;
149 		CC[3+bs*0] += a_3 * b_0;
150 		CC[4+bs*0] += a_4 * b_0;
151 		CC[5+bs*0] += a_5 * b_0;
152 		CC[6+bs*0] += a_6 * b_0;
153 		CC[7+bs*0] += a_7 * b_0;
154 
155 		CC[0+bs*1] += a_0 * b_1;
156 		CC[1+bs*1] += a_1 * b_1;
157 		CC[2+bs*1] += a_2 * b_1;
158 		CC[3+bs*1] += a_3 * b_1;
159 		CC[4+bs*1] += a_4 * b_1;
160 		CC[5+bs*1] += a_5 * b_1;
161 		CC[6+bs*1] += a_6 * b_1;
162 		CC[7+bs*1] += a_7 * b_1;
163 
164 		CC[0+bs*2] += a_0 * b_2;
165 		CC[1+bs*2] += a_1 * b_2;
166 		CC[2+bs*2] += a_2 * b_2;
167 		CC[3+bs*2] += a_3 * b_2;
168 		CC[4+bs*2] += a_4 * b_2;
169 		CC[5+bs*2] += a_5 * b_2;
170 		CC[6+bs*2] += a_6 * b_2;
171 		CC[7+bs*2] += a_7 * b_2;
172 
173 		A += bs;
174 		B += bs;
175 		k++;
176 		}
177 
178 	kernel_sgemm_nt_8x4_lib8(kmax-k, alpha, A, B, alpha, CC, D);
179 
180 	return;
181 
182 	}
183 #endif
184 
185 
186 
187 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
kernel_strmm_nt_ru_8x4_vs_lib8(int kmax,float * alpha,float * A,float * B,float * D,int km,int kn)188 void kernel_strmm_nt_ru_8x4_vs_lib8(int kmax, float *alpha, float *A, float *B, float *D, int km, int kn)
189 	{
190 
191 	const int bs = 8;
192 
193 	float
194 		a_0, a_1, a_2, a_3,
195 		a_4, a_5, a_6, a_7,
196 		b_0, b_1, b_2, b_3;
197 
198 #if defined(TARGET_GENERIC)
199 	float CC[32] = {0};
200 #else
201 	ALIGNED( float CC[32], 64 ) = {0};
202 #endif
203 
204 	int k;
205 
206 	k = 0;
207 
208 	// k = 0
209 	if(kmax>0)
210 		{
211 		a_0 = A[0];
212 		a_1 = A[1];
213 		a_2 = A[2];
214 		a_3 = A[3];
215 		a_4 = A[4];
216 		a_5 = A[5];
217 		a_6 = A[6];
218 		a_7 = A[7];
219 
220 		b_0 = B[0];
221 
222 		CC[0+bs*0] += a_0 * b_0;
223 		CC[1+bs*0] += a_1 * b_0;
224 		CC[2+bs*0] += a_2 * b_0;
225 		CC[3+bs*0] += a_3 * b_0;
226 		CC[4+bs*0] += a_4 * b_0;
227 		CC[5+bs*0] += a_5 * b_0;
228 		CC[6+bs*0] += a_6 * b_0;
229 		CC[7+bs*0] += a_7 * b_0;
230 
231 		A += bs;
232 		B += bs;
233 		k++;
234 		}
235 
236 	// k = 1
237 	if(kmax>1)
238 		{
239 		a_0 = A[0];
240 		a_1 = A[1];
241 		a_2 = A[2];
242 		a_3 = A[3];
243 		a_4 = A[4];
244 		a_5 = A[5];
245 		a_6 = A[6];
246 		a_7 = A[7];
247 
248 		b_0 = B[0];
249 		b_1 = B[1];
250 
251 		CC[0+bs*0] += a_0 * b_0;
252 		CC[1+bs*0] += a_1 * b_0;
253 		CC[2+bs*0] += a_2 * b_0;
254 		CC[3+bs*0] += a_3 * b_0;
255 		CC[4+bs*0] += a_4 * b_0;
256 		CC[5+bs*0] += a_5 * b_0;
257 		CC[6+bs*0] += a_6 * b_0;
258 		CC[7+bs*0] += a_7 * b_0;
259 
260 		CC[0+bs*1] += a_0 * b_1;
261 		CC[1+bs*1] += a_1 * b_1;
262 		CC[2+bs*1] += a_2 * b_1;
263 		CC[3+bs*1] += a_3 * b_1;
264 		CC[4+bs*1] += a_4 * b_1;
265 		CC[5+bs*1] += a_5 * b_1;
266 		CC[6+bs*1] += a_6 * b_1;
267 		CC[7+bs*1] += a_7 * b_1;
268 
269 		A += bs;
270 		B += bs;
271 		k++;
272 		}
273 
274 	// k = 2
275 	if(kmax>2)
276 		{
277 		a_0 = A[0];
278 		a_1 = A[1];
279 		a_2 = A[2];
280 		a_3 = A[3];
281 		a_4 = A[4];
282 		a_5 = A[5];
283 		a_6 = A[6];
284 		a_7 = A[7];
285 
286 		b_0 = B[0];
287 		b_1 = B[1];
288 		b_2 = B[2];
289 
290 		CC[0+bs*0] += a_0 * b_0;
291 		CC[1+bs*0] += a_1 * b_0;
292 		CC[2+bs*0] += a_2 * b_0;
293 		CC[3+bs*0] += a_3 * b_0;
294 		CC[4+bs*0] += a_4 * b_0;
295 		CC[5+bs*0] += a_5 * b_0;
296 		CC[6+bs*0] += a_6 * b_0;
297 		CC[7+bs*0] += a_7 * b_0;
298 
299 		CC[0+bs*1] += a_0 * b_1;
300 		CC[1+bs*1] += a_1 * b_1;
301 		CC[2+bs*1] += a_2 * b_1;
302 		CC[3+bs*1] += a_3 * b_1;
303 		CC[4+bs*1] += a_4 * b_1;
304 		CC[5+bs*1] += a_5 * b_1;
305 		CC[6+bs*1] += a_6 * b_1;
306 		CC[7+bs*1] += a_7 * b_1;
307 
308 		CC[0+bs*2] += a_0 * b_2;
309 		CC[1+bs*2] += a_1 * b_2;
310 		CC[2+bs*2] += a_2 * b_2;
311 		CC[3+bs*2] += a_3 * b_2;
312 		CC[4+bs*2] += a_4 * b_2;
313 		CC[5+bs*2] += a_5 * b_2;
314 		CC[6+bs*2] += a_6 * b_2;
315 		CC[7+bs*2] += a_7 * b_2;
316 
317 		A += bs;
318 		B += bs;
319 		k++;
320 		}
321 
322 	kernel_sgemm_nt_8x4_lib8(kmax-k, alpha, A, B, alpha, CC, CC);
323 
324 	if(km>=8)
325 		{
326 		D[0+bs*0] = CC[0+bs*0];
327 		D[1+bs*0] = CC[1+bs*0];
328 		D[2+bs*0] = CC[2+bs*0];
329 		D[3+bs*0] = CC[3+bs*0];
330 		D[4+bs*0] = CC[4+bs*0];
331 		D[5+bs*0] = CC[5+bs*0];
332 		D[6+bs*0] = CC[6+bs*0];
333 		D[7+bs*0] = CC[7+bs*0];
334 
335 		if(kn==1)
336 			return;
337 
338 		D[0+bs*1] = CC[0+bs*1];
339 		D[1+bs*1] = CC[1+bs*1];
340 		D[2+bs*1] = CC[2+bs*1];
341 		D[3+bs*1] = CC[3+bs*1];
342 		D[4+bs*1] = CC[4+bs*1];
343 		D[5+bs*1] = CC[5+bs*1];
344 		D[6+bs*1] = CC[6+bs*1];
345 		D[7+bs*1] = CC[7+bs*1];
346 
347 		if(kn==2)
348 			return;
349 
350 		D[0+bs*2] = CC[0+bs*2];
351 		D[1+bs*2] = CC[1+bs*2];
352 		D[2+bs*2] = CC[2+bs*2];
353 		D[3+bs*2] = CC[3+bs*2];
354 		D[4+bs*2] = CC[4+bs*2];
355 		D[5+bs*2] = CC[5+bs*2];
356 		D[6+bs*2] = CC[6+bs*2];
357 		D[7+bs*2] = CC[7+bs*2];
358 
359 		if(kn==3)
360 			return;
361 
362 		D[0+bs*3] = CC[0+bs*3];
363 		D[1+bs*3] = CC[1+bs*3];
364 		D[2+bs*3] = CC[2+bs*3];
365 		D[3+bs*3] = CC[3+bs*3];
366 		D[4+bs*3] = CC[4+bs*3];
367 		D[5+bs*3] = CC[5+bs*3];
368 		D[6+bs*3] = CC[6+bs*3];
369 		D[7+bs*3] = CC[7+bs*3];
370 		}
371 	else if(km>=7)
372 		{
373 		D[0+bs*0] = CC[0+bs*0];
374 		D[1+bs*0] = CC[1+bs*0];
375 		D[2+bs*0] = CC[2+bs*0];
376 		D[3+bs*0] = CC[3+bs*0];
377 		D[4+bs*0] = CC[4+bs*0];
378 		D[5+bs*0] = CC[5+bs*0];
379 		D[6+bs*0] = CC[6+bs*0];
380 
381 		if(kn==1)
382 			return;
383 
384 		D[0+bs*1] = CC[0+bs*1];
385 		D[1+bs*1] = CC[1+bs*1];
386 		D[2+bs*1] = CC[2+bs*1];
387 		D[3+bs*1] = CC[3+bs*1];
388 		D[4+bs*1] = CC[4+bs*1];
389 		D[5+bs*1] = CC[5+bs*1];
390 		D[6+bs*1] = CC[6+bs*1];
391 
392 		if(kn==2)
393 			return;
394 
395 		D[0+bs*2] = CC[0+bs*2];
396 		D[1+bs*2] = CC[1+bs*2];
397 		D[2+bs*2] = CC[2+bs*2];
398 		D[3+bs*2] = CC[3+bs*2];
399 		D[4+bs*2] = CC[4+bs*2];
400 		D[5+bs*2] = CC[5+bs*2];
401 		D[6+bs*2] = CC[6+bs*2];
402 
403 		if(kn==3)
404 			return;
405 
406 		D[0+bs*3] = CC[0+bs*3];
407 		D[1+bs*3] = CC[1+bs*3];
408 		D[2+bs*3] = CC[2+bs*3];
409 		D[3+bs*3] = CC[3+bs*3];
410 		D[4+bs*3] = CC[4+bs*3];
411 		D[5+bs*3] = CC[5+bs*3];
412 		D[6+bs*3] = CC[6+bs*3];
413 		}
414 	else if(km>=6)
415 		{
416 		D[0+bs*0] = CC[0+bs*0];
417 		D[1+bs*0] = CC[1+bs*0];
418 		D[2+bs*0] = CC[2+bs*0];
419 		D[3+bs*0] = CC[3+bs*0];
420 		D[4+bs*0] = CC[4+bs*0];
421 		D[5+bs*0] = CC[5+bs*0];
422 
423 		if(kn==1)
424 			return;
425 
426 		D[0+bs*1] = CC[0+bs*1];
427 		D[1+bs*1] = CC[1+bs*1];
428 		D[2+bs*1] = CC[2+bs*1];
429 		D[3+bs*1] = CC[3+bs*1];
430 		D[4+bs*1] = CC[4+bs*1];
431 		D[5+bs*1] = CC[5+bs*1];
432 
433 		if(kn==2)
434 			return;
435 
436 		D[0+bs*2] = CC[0+bs*2];
437 		D[1+bs*2] = CC[1+bs*2];
438 		D[2+bs*2] = CC[2+bs*2];
439 		D[3+bs*2] = CC[3+bs*2];
440 		D[4+bs*2] = CC[4+bs*2];
441 		D[5+bs*2] = CC[5+bs*2];
442 
443 		if(kn==3)
444 			return;
445 
446 		D[0+bs*3] = CC[0+bs*3];
447 		D[1+bs*3] = CC[1+bs*3];
448 		D[2+bs*3] = CC[2+bs*3];
449 		D[3+bs*3] = CC[3+bs*3];
450 		D[4+bs*3] = CC[4+bs*3];
451 		D[5+bs*3] = CC[5+bs*3];
452 		}
453 	else if(km>=5)
454 		{
455 		D[0+bs*0] = CC[0+bs*0];
456 		D[1+bs*0] = CC[1+bs*0];
457 		D[2+bs*0] = CC[2+bs*0];
458 		D[3+bs*0] = CC[3+bs*0];
459 		D[4+bs*0] = CC[4+bs*0];
460 
461 		if(kn==1)
462 			return;
463 
464 		D[0+bs*1] = CC[0+bs*1];
465 		D[1+bs*1] = CC[1+bs*1];
466 		D[2+bs*1] = CC[2+bs*1];
467 		D[3+bs*1] = CC[3+bs*1];
468 		D[4+bs*1] = CC[4+bs*1];
469 
470 		if(kn==2)
471 			return;
472 
473 		D[0+bs*2] = CC[0+bs*2];
474 		D[1+bs*2] = CC[1+bs*2];
475 		D[2+bs*2] = CC[2+bs*2];
476 		D[3+bs*2] = CC[3+bs*2];
477 		D[4+bs*2] = CC[4+bs*2];
478 
479 		if(kn==3)
480 			return;
481 
482 		D[0+bs*3] = CC[0+bs*3];
483 		D[1+bs*3] = CC[1+bs*3];
484 		D[2+bs*3] = CC[2+bs*3];
485 		D[3+bs*3] = CC[3+bs*3];
486 		D[4+bs*3] = CC[4+bs*3];
487 		}
488 	else if(km>=4)
489 		{
490 		D[0+bs*0] = CC[0+bs*0];
491 		D[1+bs*0] = CC[1+bs*0];
492 		D[2+bs*0] = CC[2+bs*0];
493 		D[3+bs*0] = CC[3+bs*0];
494 
495 		if(kn==1)
496 			return;
497 
498 		D[0+bs*1] = CC[0+bs*1];
499 		D[1+bs*1] = CC[1+bs*1];
500 		D[2+bs*1] = CC[2+bs*1];
501 		D[3+bs*1] = CC[3+bs*1];
502 
503 		if(kn==2)
504 			return;
505 
506 		D[0+bs*2] = CC[0+bs*2];
507 		D[1+bs*2] = CC[1+bs*2];
508 		D[2+bs*2] = CC[2+bs*2];
509 		D[3+bs*2] = CC[3+bs*2];
510 
511 		if(kn==3)
512 			return;
513 
514 		D[0+bs*3] = CC[0+bs*3];
515 		D[1+bs*3] = CC[1+bs*3];
516 		D[2+bs*3] = CC[2+bs*3];
517 		D[3+bs*3] = CC[3+bs*3];
518 		}
519 	else if(km>=3)
520 		{
521 		D[0+bs*0] = CC[0+bs*0];
522 		D[1+bs*0] = CC[1+bs*0];
523 		D[2+bs*0] = CC[2+bs*0];
524 
525 		if(kn==1)
526 			return;
527 
528 		D[0+bs*1] = CC[0+bs*1];
529 		D[1+bs*1] = CC[1+bs*1];
530 		D[2+bs*1] = CC[2+bs*1];
531 
532 		if(kn==2)
533 			return;
534 
535 		D[0+bs*2] = CC[0+bs*2];
536 		D[1+bs*2] = CC[1+bs*2];
537 		D[2+bs*2] = CC[2+bs*2];
538 
539 		if(kn==3)
540 			return;
541 
542 		D[0+bs*3] = CC[0+bs*3];
543 		D[1+bs*3] = CC[1+bs*3];
544 		D[2+bs*3] = CC[2+bs*3];
545 		}
546 	else if(km>=2)
547 		{
548 		D[0+bs*0] = CC[0+bs*0];
549 		D[1+bs*0] = CC[1+bs*0];
550 
551 		if(kn==1)
552 			return;
553 
554 		D[0+bs*1] = CC[0+bs*1];
555 		D[1+bs*1] = CC[1+bs*1];
556 
557 		if(kn==2)
558 			return;
559 
560 		D[0+bs*2] = CC[0+bs*2];
561 		D[1+bs*2] = CC[1+bs*2];
562 
563 		if(kn==3)
564 			return;
565 
566 		D[0+bs*3] = CC[0+bs*3];
567 		D[1+bs*3] = CC[1+bs*3];
568 		}
569 	else //if(km>=1)
570 		{
571 		D[0+bs*0] = CC[0+bs*0];
572 
573 		if(kn==1)
574 			return;
575 
576 		D[0+bs*1] = CC[0+bs*1];
577 
578 		if(kn==2)
579 			return;
580 
581 		D[0+bs*2] = CC[0+bs*2];
582 
583 		if(kn==3)
584 			return;
585 
586 		D[0+bs*3] = CC[0+bs*3];
587 		}
588 
589 	return;
590 
591 	}
592 #endif
593 
594