1 /**************************************************************************************************
2 * *
3 * This file is part of BLASFEO. *
4 * *
5 * BLASFEO -- BLAS For Embedded Optimization. *
6 * Copyright (C) 2019 by Gianluca Frison. *
7 * Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
8 * All rights reserved. *
9 * *
10 * The 2-Clause BSD License *
11 * *
12 * Redistribution and use in source and binary forms, with or without *
13 * modification, are permitted provided that the following conditions are met: *
14 * *
15 * 1. Redistributions of source code must retain the above copyright notice, this *
16 * list of conditions and the following disclaimer. *
17 * 2. Redistributions in binary form must reproduce the above copyright notice, *
18 * this list of conditions and the following disclaimer in the documentation *
19 * and/or other materials provided with the distribution. *
20 * *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND *
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED *
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE *
24 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR *
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
26 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; *
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND *
28 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT *
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
31 * *
32 * Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de *
33 * *
34 **************************************************************************************************/
35
36 #include <math.h>
37
38 #include <blasfeo_common.h>
39 #include <blasfeo_s_kernel.h>
40
41
42
43 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
kernel_strmm_nt_ru_8x4_lib8(int kmax,float * alpha,float * A,float * B,float * D)44 void kernel_strmm_nt_ru_8x4_lib8(int kmax, float *alpha, float *A, float *B, float *D)
45 {
46
47 const int bs = 8;
48
49 float
50 a_0, a_1, a_2, a_3,
51 a_4, a_5, a_6, a_7,
52 b_0, b_1, b_2, b_3;
53
54 #if defined(TARGET_GENERIC)
55 float CC[32] = {0};
56 #else
57 ALIGNED( float CC[32], 64 ) = {0};
58 #endif
59
60 int k;
61
62 k = 0;
63
64 // k = 0
65 if(kmax>0)
66 {
67 a_0 = A[0];
68 a_1 = A[1];
69 a_2 = A[2];
70 a_3 = A[3];
71 a_4 = A[4];
72 a_5 = A[5];
73 a_6 = A[6];
74 a_7 = A[7];
75
76 b_0 = B[0];
77
78 CC[0+bs*0] += a_0 * b_0;
79 CC[1+bs*0] += a_1 * b_0;
80 CC[2+bs*0] += a_2 * b_0;
81 CC[3+bs*0] += a_3 * b_0;
82 CC[4+bs*0] += a_4 * b_0;
83 CC[5+bs*0] += a_5 * b_0;
84 CC[6+bs*0] += a_6 * b_0;
85 CC[7+bs*0] += a_7 * b_0;
86
87 A += bs;
88 B += bs;
89 k++;
90 }
91
92 // k = 1
93 if(kmax>1)
94 {
95 a_0 = A[0];
96 a_1 = A[1];
97 a_2 = A[2];
98 a_3 = A[3];
99 a_4 = A[4];
100 a_5 = A[5];
101 a_6 = A[6];
102 a_7 = A[7];
103
104 b_0 = B[0];
105 b_1 = B[1];
106
107 CC[0+bs*0] += a_0 * b_0;
108 CC[1+bs*0] += a_1 * b_0;
109 CC[2+bs*0] += a_2 * b_0;
110 CC[3+bs*0] += a_3 * b_0;
111 CC[4+bs*0] += a_4 * b_0;
112 CC[5+bs*0] += a_5 * b_0;
113 CC[6+bs*0] += a_6 * b_0;
114 CC[7+bs*0] += a_7 * b_0;
115
116 CC[0+bs*1] += a_0 * b_1;
117 CC[1+bs*1] += a_1 * b_1;
118 CC[2+bs*1] += a_2 * b_1;
119 CC[3+bs*1] += a_3 * b_1;
120 CC[4+bs*1] += a_4 * b_1;
121 CC[5+bs*1] += a_5 * b_1;
122 CC[6+bs*1] += a_6 * b_1;
123 CC[7+bs*1] += a_7 * b_1;
124
125 A += bs;
126 B += bs;
127 k++;
128 }
129
130 // k = 2
131 if(kmax>2)
132 {
133 a_0 = A[0];
134 a_1 = A[1];
135 a_2 = A[2];
136 a_3 = A[3];
137 a_4 = A[4];
138 a_5 = A[5];
139 a_6 = A[6];
140 a_7 = A[7];
141
142 b_0 = B[0];
143 b_1 = B[1];
144 b_2 = B[2];
145
146 CC[0+bs*0] += a_0 * b_0;
147 CC[1+bs*0] += a_1 * b_0;
148 CC[2+bs*0] += a_2 * b_0;
149 CC[3+bs*0] += a_3 * b_0;
150 CC[4+bs*0] += a_4 * b_0;
151 CC[5+bs*0] += a_5 * b_0;
152 CC[6+bs*0] += a_6 * b_0;
153 CC[7+bs*0] += a_7 * b_0;
154
155 CC[0+bs*1] += a_0 * b_1;
156 CC[1+bs*1] += a_1 * b_1;
157 CC[2+bs*1] += a_2 * b_1;
158 CC[3+bs*1] += a_3 * b_1;
159 CC[4+bs*1] += a_4 * b_1;
160 CC[5+bs*1] += a_5 * b_1;
161 CC[6+bs*1] += a_6 * b_1;
162 CC[7+bs*1] += a_7 * b_1;
163
164 CC[0+bs*2] += a_0 * b_2;
165 CC[1+bs*2] += a_1 * b_2;
166 CC[2+bs*2] += a_2 * b_2;
167 CC[3+bs*2] += a_3 * b_2;
168 CC[4+bs*2] += a_4 * b_2;
169 CC[5+bs*2] += a_5 * b_2;
170 CC[6+bs*2] += a_6 * b_2;
171 CC[7+bs*2] += a_7 * b_2;
172
173 A += bs;
174 B += bs;
175 k++;
176 }
177
178 kernel_sgemm_nt_8x4_lib8(kmax-k, alpha, A, B, alpha, CC, D);
179
180 return;
181
182 }
183 #endif
184
185
186
187 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
kernel_strmm_nt_ru_8x4_vs_lib8(int kmax,float * alpha,float * A,float * B,float * D,int km,int kn)188 void kernel_strmm_nt_ru_8x4_vs_lib8(int kmax, float *alpha, float *A, float *B, float *D, int km, int kn)
189 {
190
191 const int bs = 8;
192
193 float
194 a_0, a_1, a_2, a_3,
195 a_4, a_5, a_6, a_7,
196 b_0, b_1, b_2, b_3;
197
198 #if defined(TARGET_GENERIC)
199 float CC[32] = {0};
200 #else
201 ALIGNED( float CC[32], 64 ) = {0};
202 #endif
203
204 int k;
205
206 k = 0;
207
208 // k = 0
209 if(kmax>0)
210 {
211 a_0 = A[0];
212 a_1 = A[1];
213 a_2 = A[2];
214 a_3 = A[3];
215 a_4 = A[4];
216 a_5 = A[5];
217 a_6 = A[6];
218 a_7 = A[7];
219
220 b_0 = B[0];
221
222 CC[0+bs*0] += a_0 * b_0;
223 CC[1+bs*0] += a_1 * b_0;
224 CC[2+bs*0] += a_2 * b_0;
225 CC[3+bs*0] += a_3 * b_0;
226 CC[4+bs*0] += a_4 * b_0;
227 CC[5+bs*0] += a_5 * b_0;
228 CC[6+bs*0] += a_6 * b_0;
229 CC[7+bs*0] += a_7 * b_0;
230
231 A += bs;
232 B += bs;
233 k++;
234 }
235
236 // k = 1
237 if(kmax>1)
238 {
239 a_0 = A[0];
240 a_1 = A[1];
241 a_2 = A[2];
242 a_3 = A[3];
243 a_4 = A[4];
244 a_5 = A[5];
245 a_6 = A[6];
246 a_7 = A[7];
247
248 b_0 = B[0];
249 b_1 = B[1];
250
251 CC[0+bs*0] += a_0 * b_0;
252 CC[1+bs*0] += a_1 * b_0;
253 CC[2+bs*0] += a_2 * b_0;
254 CC[3+bs*0] += a_3 * b_0;
255 CC[4+bs*0] += a_4 * b_0;
256 CC[5+bs*0] += a_5 * b_0;
257 CC[6+bs*0] += a_6 * b_0;
258 CC[7+bs*0] += a_7 * b_0;
259
260 CC[0+bs*1] += a_0 * b_1;
261 CC[1+bs*1] += a_1 * b_1;
262 CC[2+bs*1] += a_2 * b_1;
263 CC[3+bs*1] += a_3 * b_1;
264 CC[4+bs*1] += a_4 * b_1;
265 CC[5+bs*1] += a_5 * b_1;
266 CC[6+bs*1] += a_6 * b_1;
267 CC[7+bs*1] += a_7 * b_1;
268
269 A += bs;
270 B += bs;
271 k++;
272 }
273
274 // k = 2
275 if(kmax>2)
276 {
277 a_0 = A[0];
278 a_1 = A[1];
279 a_2 = A[2];
280 a_3 = A[3];
281 a_4 = A[4];
282 a_5 = A[5];
283 a_6 = A[6];
284 a_7 = A[7];
285
286 b_0 = B[0];
287 b_1 = B[1];
288 b_2 = B[2];
289
290 CC[0+bs*0] += a_0 * b_0;
291 CC[1+bs*0] += a_1 * b_0;
292 CC[2+bs*0] += a_2 * b_0;
293 CC[3+bs*0] += a_3 * b_0;
294 CC[4+bs*0] += a_4 * b_0;
295 CC[5+bs*0] += a_5 * b_0;
296 CC[6+bs*0] += a_6 * b_0;
297 CC[7+bs*0] += a_7 * b_0;
298
299 CC[0+bs*1] += a_0 * b_1;
300 CC[1+bs*1] += a_1 * b_1;
301 CC[2+bs*1] += a_2 * b_1;
302 CC[3+bs*1] += a_3 * b_1;
303 CC[4+bs*1] += a_4 * b_1;
304 CC[5+bs*1] += a_5 * b_1;
305 CC[6+bs*1] += a_6 * b_1;
306 CC[7+bs*1] += a_7 * b_1;
307
308 CC[0+bs*2] += a_0 * b_2;
309 CC[1+bs*2] += a_1 * b_2;
310 CC[2+bs*2] += a_2 * b_2;
311 CC[3+bs*2] += a_3 * b_2;
312 CC[4+bs*2] += a_4 * b_2;
313 CC[5+bs*2] += a_5 * b_2;
314 CC[6+bs*2] += a_6 * b_2;
315 CC[7+bs*2] += a_7 * b_2;
316
317 A += bs;
318 B += bs;
319 k++;
320 }
321
322 kernel_sgemm_nt_8x4_lib8(kmax-k, alpha, A, B, alpha, CC, CC);
323
324 if(km>=8)
325 {
326 D[0+bs*0] = CC[0+bs*0];
327 D[1+bs*0] = CC[1+bs*0];
328 D[2+bs*0] = CC[2+bs*0];
329 D[3+bs*0] = CC[3+bs*0];
330 D[4+bs*0] = CC[4+bs*0];
331 D[5+bs*0] = CC[5+bs*0];
332 D[6+bs*0] = CC[6+bs*0];
333 D[7+bs*0] = CC[7+bs*0];
334
335 if(kn==1)
336 return;
337
338 D[0+bs*1] = CC[0+bs*1];
339 D[1+bs*1] = CC[1+bs*1];
340 D[2+bs*1] = CC[2+bs*1];
341 D[3+bs*1] = CC[3+bs*1];
342 D[4+bs*1] = CC[4+bs*1];
343 D[5+bs*1] = CC[5+bs*1];
344 D[6+bs*1] = CC[6+bs*1];
345 D[7+bs*1] = CC[7+bs*1];
346
347 if(kn==2)
348 return;
349
350 D[0+bs*2] = CC[0+bs*2];
351 D[1+bs*2] = CC[1+bs*2];
352 D[2+bs*2] = CC[2+bs*2];
353 D[3+bs*2] = CC[3+bs*2];
354 D[4+bs*2] = CC[4+bs*2];
355 D[5+bs*2] = CC[5+bs*2];
356 D[6+bs*2] = CC[6+bs*2];
357 D[7+bs*2] = CC[7+bs*2];
358
359 if(kn==3)
360 return;
361
362 D[0+bs*3] = CC[0+bs*3];
363 D[1+bs*3] = CC[1+bs*3];
364 D[2+bs*3] = CC[2+bs*3];
365 D[3+bs*3] = CC[3+bs*3];
366 D[4+bs*3] = CC[4+bs*3];
367 D[5+bs*3] = CC[5+bs*3];
368 D[6+bs*3] = CC[6+bs*3];
369 D[7+bs*3] = CC[7+bs*3];
370 }
371 else if(km>=7)
372 {
373 D[0+bs*0] = CC[0+bs*0];
374 D[1+bs*0] = CC[1+bs*0];
375 D[2+bs*0] = CC[2+bs*0];
376 D[3+bs*0] = CC[3+bs*0];
377 D[4+bs*0] = CC[4+bs*0];
378 D[5+bs*0] = CC[5+bs*0];
379 D[6+bs*0] = CC[6+bs*0];
380
381 if(kn==1)
382 return;
383
384 D[0+bs*1] = CC[0+bs*1];
385 D[1+bs*1] = CC[1+bs*1];
386 D[2+bs*1] = CC[2+bs*1];
387 D[3+bs*1] = CC[3+bs*1];
388 D[4+bs*1] = CC[4+bs*1];
389 D[5+bs*1] = CC[5+bs*1];
390 D[6+bs*1] = CC[6+bs*1];
391
392 if(kn==2)
393 return;
394
395 D[0+bs*2] = CC[0+bs*2];
396 D[1+bs*2] = CC[1+bs*2];
397 D[2+bs*2] = CC[2+bs*2];
398 D[3+bs*2] = CC[3+bs*2];
399 D[4+bs*2] = CC[4+bs*2];
400 D[5+bs*2] = CC[5+bs*2];
401 D[6+bs*2] = CC[6+bs*2];
402
403 if(kn==3)
404 return;
405
406 D[0+bs*3] = CC[0+bs*3];
407 D[1+bs*3] = CC[1+bs*3];
408 D[2+bs*3] = CC[2+bs*3];
409 D[3+bs*3] = CC[3+bs*3];
410 D[4+bs*3] = CC[4+bs*3];
411 D[5+bs*3] = CC[5+bs*3];
412 D[6+bs*3] = CC[6+bs*3];
413 }
414 else if(km>=6)
415 {
416 D[0+bs*0] = CC[0+bs*0];
417 D[1+bs*0] = CC[1+bs*0];
418 D[2+bs*0] = CC[2+bs*0];
419 D[3+bs*0] = CC[3+bs*0];
420 D[4+bs*0] = CC[4+bs*0];
421 D[5+bs*0] = CC[5+bs*0];
422
423 if(kn==1)
424 return;
425
426 D[0+bs*1] = CC[0+bs*1];
427 D[1+bs*1] = CC[1+bs*1];
428 D[2+bs*1] = CC[2+bs*1];
429 D[3+bs*1] = CC[3+bs*1];
430 D[4+bs*1] = CC[4+bs*1];
431 D[5+bs*1] = CC[5+bs*1];
432
433 if(kn==2)
434 return;
435
436 D[0+bs*2] = CC[0+bs*2];
437 D[1+bs*2] = CC[1+bs*2];
438 D[2+bs*2] = CC[2+bs*2];
439 D[3+bs*2] = CC[3+bs*2];
440 D[4+bs*2] = CC[4+bs*2];
441 D[5+bs*2] = CC[5+bs*2];
442
443 if(kn==3)
444 return;
445
446 D[0+bs*3] = CC[0+bs*3];
447 D[1+bs*3] = CC[1+bs*3];
448 D[2+bs*3] = CC[2+bs*3];
449 D[3+bs*3] = CC[3+bs*3];
450 D[4+bs*3] = CC[4+bs*3];
451 D[5+bs*3] = CC[5+bs*3];
452 }
453 else if(km>=5)
454 {
455 D[0+bs*0] = CC[0+bs*0];
456 D[1+bs*0] = CC[1+bs*0];
457 D[2+bs*0] = CC[2+bs*0];
458 D[3+bs*0] = CC[3+bs*0];
459 D[4+bs*0] = CC[4+bs*0];
460
461 if(kn==1)
462 return;
463
464 D[0+bs*1] = CC[0+bs*1];
465 D[1+bs*1] = CC[1+bs*1];
466 D[2+bs*1] = CC[2+bs*1];
467 D[3+bs*1] = CC[3+bs*1];
468 D[4+bs*1] = CC[4+bs*1];
469
470 if(kn==2)
471 return;
472
473 D[0+bs*2] = CC[0+bs*2];
474 D[1+bs*2] = CC[1+bs*2];
475 D[2+bs*2] = CC[2+bs*2];
476 D[3+bs*2] = CC[3+bs*2];
477 D[4+bs*2] = CC[4+bs*2];
478
479 if(kn==3)
480 return;
481
482 D[0+bs*3] = CC[0+bs*3];
483 D[1+bs*3] = CC[1+bs*3];
484 D[2+bs*3] = CC[2+bs*3];
485 D[3+bs*3] = CC[3+bs*3];
486 D[4+bs*3] = CC[4+bs*3];
487 }
488 else if(km>=4)
489 {
490 D[0+bs*0] = CC[0+bs*0];
491 D[1+bs*0] = CC[1+bs*0];
492 D[2+bs*0] = CC[2+bs*0];
493 D[3+bs*0] = CC[3+bs*0];
494
495 if(kn==1)
496 return;
497
498 D[0+bs*1] = CC[0+bs*1];
499 D[1+bs*1] = CC[1+bs*1];
500 D[2+bs*1] = CC[2+bs*1];
501 D[3+bs*1] = CC[3+bs*1];
502
503 if(kn==2)
504 return;
505
506 D[0+bs*2] = CC[0+bs*2];
507 D[1+bs*2] = CC[1+bs*2];
508 D[2+bs*2] = CC[2+bs*2];
509 D[3+bs*2] = CC[3+bs*2];
510
511 if(kn==3)
512 return;
513
514 D[0+bs*3] = CC[0+bs*3];
515 D[1+bs*3] = CC[1+bs*3];
516 D[2+bs*3] = CC[2+bs*3];
517 D[3+bs*3] = CC[3+bs*3];
518 }
519 else if(km>=3)
520 {
521 D[0+bs*0] = CC[0+bs*0];
522 D[1+bs*0] = CC[1+bs*0];
523 D[2+bs*0] = CC[2+bs*0];
524
525 if(kn==1)
526 return;
527
528 D[0+bs*1] = CC[0+bs*1];
529 D[1+bs*1] = CC[1+bs*1];
530 D[2+bs*1] = CC[2+bs*1];
531
532 if(kn==2)
533 return;
534
535 D[0+bs*2] = CC[0+bs*2];
536 D[1+bs*2] = CC[1+bs*2];
537 D[2+bs*2] = CC[2+bs*2];
538
539 if(kn==3)
540 return;
541
542 D[0+bs*3] = CC[0+bs*3];
543 D[1+bs*3] = CC[1+bs*3];
544 D[2+bs*3] = CC[2+bs*3];
545 }
546 else if(km>=2)
547 {
548 D[0+bs*0] = CC[0+bs*0];
549 D[1+bs*0] = CC[1+bs*0];
550
551 if(kn==1)
552 return;
553
554 D[0+bs*1] = CC[0+bs*1];
555 D[1+bs*1] = CC[1+bs*1];
556
557 if(kn==2)
558 return;
559
560 D[0+bs*2] = CC[0+bs*2];
561 D[1+bs*2] = CC[1+bs*2];
562
563 if(kn==3)
564 return;
565
566 D[0+bs*3] = CC[0+bs*3];
567 D[1+bs*3] = CC[1+bs*3];
568 }
569 else //if(km>=1)
570 {
571 D[0+bs*0] = CC[0+bs*0];
572
573 if(kn==1)
574 return;
575
576 D[0+bs*1] = CC[0+bs*1];
577
578 if(kn==2)
579 return;
580
581 D[0+bs*2] = CC[0+bs*2];
582
583 if(kn==3)
584 return;
585
586 D[0+bs*3] = CC[0+bs*3];
587 }
588
589 return;
590
591 }
592 #endif
593
594