1 /**************************************************************************************************
2 *                                                                                                 *
3 * This file is part of BLASFEO.                                                                   *
4 *                                                                                                 *
5 * BLASFEO -- BLAS For Embedded Optimization.                                                      *
6 * Copyright (C) 2019 by Gianluca Frison.                                                          *
7 * Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
8 * All rights reserved.                                                                            *
9 *                                                                                                 *
10 * The 2-Clause BSD License                                                                        *
11 *                                                                                                 *
12 * Redistribution and use in source and binary forms, with or without                              *
13 * modification, are permitted provided that the following conditions are met:                     *
14 *                                                                                                 *
15 * 1. Redistributions of source code must retain the above copyright notice, this                  *
16 *    list of conditions and the following disclaimer.                                             *
17 * 2. Redistributions in binary form must reproduce the above copyright notice,                    *
18 *    this list of conditions and the following disclaimer in the documentation                    *
19 *    and/or other materials provided with the distribution.                                       *
20 *                                                                                                 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND                 *
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED                   *
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE                          *
24 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR                 *
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES                  *
26 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;                    *
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND                     *
28 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT                      *
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS                   *
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                                    *
31 *                                                                                                 *
32 * Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de                             *
33 *                                                                                                 *
34 **************************************************************************************************/
35 
36 
37 // ---- ge
38 
39 // 4
40 
41 // both A and B are aligned to 256-bit boundaries
// Scaled copy of 4 rows: for each of the kmax columns (stored 4 floats apart)
// writes B[i] = alphap[0] * A[i], i = 0..3. Nothing is read or written when
// kmax <= 0.
void kernel_sgecpsc_4_0_lib4(int kmax, float *alphap, float *A, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns

	float scale;
	int left;

	if(kmax<=0)
		return;

	scale = alphap[0];

	for(left=kmax; left>0; left--)
		{

		B[0] = scale * A[0];
		B[1] = scale * A[1];
		B[2] = scale * A[2];
		B[3] = scale * A[3];

		A += ps;
		B += ps;

		}

	}
67 
// Plain copy of 4 rows: for each of the kmax columns (stored 4 floats apart)
// writes B[i] = A[i], i = 0..3. Nothing is touched when kmax <= 0.
void kernel_sgecp_4_0_lib4(int kmax, float *A, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns

	int left;

	if(kmax<=0)
		return;

	for(left=kmax; left>0; left--)
		{

		B[0] = A[0];
		B[1] = A[1];
		B[2] = A[2];
		B[3] = A[3];

		A += ps;
		B += ps;

		}

	}
92 
93 // both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
// Scaled copy of 4 rows with the source shifted by one element: each output
// column receives alphap[0] times A0[1..3] followed by element 0 of the
// matching column located at A0 + 4*sda. Processes kmax columns, 4 floats
// apart; nothing is read or written when kmax <= 0.
void kernel_sgecpsc_4_1_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns

	float *nxt; // second source block, 4*sda floats after A0
	float scale;
	int left;

	if(kmax<=0)
		return;

	nxt = A0 + ps*sda;
	scale = alphap[0];

	for(left=kmax; left>0; left--)
		{

		B[0] = scale * A0[1];
		B[1] = scale * A0[2];
		B[2] = scale * A0[3];
		B[3] = scale * nxt[0];

		A0  += ps;
		nxt += ps;
		B   += ps;

		}

	}
123 
124 // both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
// Copy of 4 rows with the source shifted by one element: each output column
// receives A0[1..3] followed by element 0 of the matching column located at
// A0 + 4*sda. Processes kmax columns, 4 floats apart; nothing is touched when
// kmax <= 0.
void kernel_sgecp_4_1_lib4(int kmax, float *A0, int sda, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns

	float *nxt; // second source block, 4*sda floats after A0
	int left;

	if(kmax<=0)
		return;

	nxt = A0 + ps*sda;

	for(left=kmax; left>0; left--)
		{

		B[0] = A0[1];
		B[1] = A0[2];
		B[2] = A0[3];
		B[3] = nxt[0];

		A0  += ps;
		nxt += ps;
		B   += ps;

		}

	}
153 
// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
// Scaled copy of 4 rows with the source shifted by two elements: each output
// column receives alphap[0] times A0[2..3] followed by elements 0..1 of the
// matching column located at A0 + 4*sda. Processes kmax columns, 4 floats
// apart; nothing is read or written when kmax <= 0.
void kernel_sgecpsc_4_2_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns

	float *nxt; // second source block, 4*sda floats after A0
	float scale;
	int left;

	if(kmax<=0)
		return;

	nxt = A0 + ps*sda;
	scale = alphap[0];

	for(left=kmax; left>0; left--)
		{

		B[0] = scale * A0[2];
		B[1] = scale * A0[3];
		B[2] = scale * nxt[0];
		B[3] = scale * nxt[1];

		A0  += ps;
		nxt += ps;
		B   += ps;

		}

	}
184 
// both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
// Copy of 4 rows with the source shifted by two elements: each output column
// receives A0[2..3] followed by elements 0..1 of the matching column located
// at A0 + 4*sda. Processes kmax columns, 4 floats apart; nothing is touched
// when kmax <= 0.
void kernel_sgecp_4_2_lib4(int kmax, float *A0, int sda, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns

	float *nxt; // second source block, 4*sda floats after A0
	int left;

	if(kmax<=0)
		return;

	nxt = A0 + ps*sda;

	for(left=kmax; left>0; left--)
		{

		B[0] = A0[2];
		B[1] = A0[3];
		B[2] = nxt[0];
		B[3] = nxt[1];

		A0  += ps;
		nxt += ps;
		B   += ps;

		}

	}
215 
216 // both A and B are aligned to 256-bit boundaries, 3 element of A must be skipped
// Scaled copy of 4 rows with the source shifted by three elements: each output
// column receives alphap[0] times A0[3] followed by elements 0..2 of the
// matching column located at A0 + 4*sda. Processes kmax columns, 4 floats
// apart; nothing is read or written when kmax <= 0.
void kernel_sgecpsc_4_3_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns

	float *nxt; // second source block, 4*sda floats after A0
	float scale;
	int left;

	if(kmax<=0)
		return;

	nxt = A0 + ps*sda;
	scale = alphap[0];

	for(left=kmax; left>0; left--)
		{

		B[0] = scale * A0[3];
		B[1] = scale * nxt[0];
		B[2] = scale * nxt[1];
		B[3] = scale * nxt[2];

		A0  += ps;
		nxt += ps;
		B   += ps;

		}

	}
246 
247 // both A and B are aligned to 256-bit boundaries, 3 element of A must be skipped
// Copy of 4 rows with the source shifted by three elements: each output column
// receives A0[3] followed by elements 0..2 of the matching column located at
// A0 + 4*sda. Processes kmax columns, 4 floats apart; nothing is touched when
// kmax <= 0.
void kernel_sgecp_4_3_lib4(int kmax, float *A0, int sda, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns

	float *nxt; // second source block, 4*sda floats after A0
	int left;

	if(kmax<=0)
		return;

	nxt = A0 + ps*sda;

	for(left=kmax; left>0; left--)
		{

		B[0] = A0[3];
		B[1] = nxt[0];
		B[2] = nxt[1];
		B[3] = nxt[2];

		A0  += ps;
		nxt += ps;
		B   += ps;

		}

	}
277 
278 // 3
279 
// Scaled copy of 3 rows: for each of the kmax columns (stored 4 floats apart)
// writes B[i] = alphap[0] * A[i], i = 0..2; element 3 of every column is left
// alone. Nothing is read or written when kmax <= 0.
void kernel_sgecpsc_3_0_lib4(int kmax, float *alphap, float *A, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns

	float scale;
	int left;

	if(kmax<=0)
		return;

	scale = alphap[0];

	for(left=kmax; left>0; left--)
		{

		B[0] = scale * A[0];
		B[1] = scale * A[1];
		B[2] = scale * A[2];

		A += ps;
		B += ps;

		}

	}
305 
// Plain copy of 3 rows: for each of the kmax columns (stored 4 floats apart)
// writes B[i] = A[i], i = 0..2; element 3 of every column is left alone.
// Nothing is touched when kmax <= 0.
void kernel_sgecp_3_0_lib4(int kmax, float *A, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns

	int left;

	if(kmax<=0)
		return;

	for(left=kmax; left>0; left--)
		{

		B[0] = A[0];
		B[1] = A[1];
		B[2] = A[2];

		A += ps;
		B += ps;

		}

	}
329 
330 // both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
// Scaled copy of 3 rows with the source shifted by two elements: each output
// column receives alphap[0] times A0[2..3] followed by element 0 of the
// matching column located at A0 + 4*sda; element 3 of every output column is
// left alone. Processes kmax columns, 4 floats apart; no-op when kmax <= 0.
void kernel_sgecpsc_3_2_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns

	float *nxt; // second source block, 4*sda floats after A0
	float scale;
	int left;

	if(kmax<=0)
		return;

	nxt = A0 + ps*sda;
	scale = alphap[0];

	for(left=kmax; left>0; left--)
		{

		B[0] = scale * A0[2];
		B[1] = scale * A0[3];
		B[2] = scale * nxt[0];

		A0  += ps;
		nxt += ps;
		B   += ps;

		}

	}
359 
360 // both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
// Copy of 3 rows with the source shifted by two elements: each output column
// receives A0[2..3] followed by element 0 of the matching column located at
// A0 + 4*sda; element 3 of every output column is left alone. Processes kmax
// columns, 4 floats apart; no-op when kmax <= 0.
void kernel_sgecp_3_2_lib4(int kmax, float *A0, int sda, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns

	float *nxt; // second source block, 4*sda floats after A0
	int left;

	if(kmax<=0)
		return;

	nxt = A0 + ps*sda;

	for(left=kmax; left>0; left--)
		{

		B[0] = A0[2];
		B[1] = A0[3];
		B[2] = nxt[0];

		A0  += ps;
		nxt += ps;
		B   += ps;

		}

	}
388 
389 // both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
// Scaled copy of 3 rows with the source shifted by three elements: each output
// column receives alphap[0] times A0[3] followed by elements 0..1 of the
// matching column located at A0 + 4*sda; element 3 of every output column is
// left alone. Processes kmax columns, 4 floats apart; no-op when kmax <= 0.
void kernel_sgecpsc_3_3_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns

	float *nxt; // second source block, 4*sda floats after A0
	float scale;
	int left;

	if(kmax<=0)
		return;

	nxt = A0 + ps*sda;
	scale = alphap[0];

	for(left=kmax; left>0; left--)
		{

		B[0] = scale * A0[3];
		B[1] = scale * nxt[0];
		B[2] = scale * nxt[1];

		A0  += ps;
		nxt += ps;
		B   += ps;

		}

	}
418 
419 // both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
// Copy of 3 rows with the source shifted by three elements: each output column
// receives A0[3] followed by elements 0..1 of the matching column located at
// A0 + 4*sda; element 3 of every output column is left alone. Processes kmax
// columns, 4 floats apart; no-op when kmax <= 0.
void kernel_sgecp_3_3_lib4(int kmax, float *A0, int sda, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns

	float *nxt; // second source block, 4*sda floats after A0
	int left;

	if(kmax<=0)
		return;

	nxt = A0 + ps*sda;

	for(left=kmax; left>0; left--)
		{

		B[0] = A0[3];
		B[1] = nxt[0];
		B[2] = nxt[1];

		A0  += ps;
		nxt += ps;
		B   += ps;

		}

	}
447 
448 // 2
449 
// Scaled copy of 2 rows: for each of the kmax columns (stored 4 floats apart)
// writes B[i] = alphap[0] * A[i], i = 0..1; elements 2..3 of every column are
// left alone. Nothing is read or written when kmax <= 0.
void kernel_sgecpsc_2_0_lib4(int kmax, float *alphap, float *A, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns

	float scale;
	int left;

	if(kmax<=0)
		return;

	scale = alphap[0];

	for(left=kmax; left>0; left--)
		{

		B[0] = scale * A[0];
		B[1] = scale * A[1];

		A += ps;
		B += ps;

		}

	}
473 
// Plain copy of 2 rows: for each of the kmax columns (stored 4 floats apart)
// writes B[i] = A[i], i = 0..1; elements 2..3 of every column are left alone.
// Nothing is touched when kmax <= 0.
void kernel_sgecp_2_0_lib4(int kmax, float *A, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns

	int left;

	if(kmax<=0)
		return;

	for(left=kmax; left>0; left--)
		{

		B[0] = A[0];
		B[1] = A[1];

		A += ps;
		B += ps;

		}

	}
496 
497 // both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
// Scaled copy of 2 rows with the source shifted by three elements: each output
// column receives alphap[0] times A0[3] and then element 0 of the matching
// column located at A0 + 4*sda; elements 2..3 of every output column are left
// alone. Processes kmax columns, 4 floats apart; no-op when kmax <= 0.
void kernel_sgecpsc_2_3_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns

	float *nxt; // second source block, 4*sda floats after A0
	float scale;
	int left;

	if(kmax<=0)
		return;

	scale = alphap[0];
	nxt = A0 + ps*sda;

	for(left=kmax; left>0; left--)
		{

		B[0] = scale * A0[3];
		B[1] = scale * nxt[0];

		A0  += ps;
		nxt += ps;
		B   += ps;

		}

	}
523 
524 // both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
// Copy of 2 rows with the source shifted by three elements: each output column
// receives A0[3] and then element 0 of the matching column located at
// A0 + 4*sda; elements 2..3 of every output column are left alone. Processes
// kmax columns, 4 floats apart; no-op when kmax <= 0.
void kernel_sgecp_2_3_lib4(int kmax, float *A0, int sda, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns

	float *nxt; // second source block, 4*sda floats after A0
	int left;

	if(kmax<=0)
		return;

	nxt = A0 + ps*sda;

	for(left=kmax; left>0; left--)
		{

		B[0] = A0[3];
		B[1] = nxt[0];

		A0  += ps;
		nxt += ps;
		B   += ps;

		}

	}
550 
551 // 1
552 
// Scaled copy of a single row: for each of the kmax columns (stored 4 floats
// apart) writes B[0] = alphap[0] * A[0]; elements 1..3 of every column are
// left alone. Nothing is read or written when kmax <= 0.
void kernel_sgecpsc_1_0_lib4(int kmax, float *alphap, float *A, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns

	float scale;
	int left;

	if(kmax<=0)
		return;

	scale = alphap[0];

	for(left=kmax; left>0; left--)
		{

		B[0] = scale * A[0];

		A += ps;
		B += ps;

		}

	}
576 
// Plain copy of a single row: for each of the kmax columns (stored 4 floats
// apart) writes B[0] = A[0]; elements 1..3 of every column are left alone.
// Nothing is touched when kmax <= 0.
void kernel_sgecp_1_0_lib4(int kmax, float *A, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns

	int left;

	if(kmax<=0)
		return;

	for(left=kmax; left>0; left--)
		{

		B[0] = A[0];

		A += ps;
		B += ps;

		}

	}
598 
599 
600 
601 // ---- tr
602 
603 // both A and B are aligned to 256-bit boundaries
// Lower-triangular copy, 4 rows, no source shift.
// Copies kmax+1 full columns (rows 0..3, columns 4 floats apart) from A to B,
// then a trailing triangle spread over the next three columns: rows 1..3,
// rows 2..3 and row 3. Returns without copying anything when kmax+1 <= 0.
void kernel_strcp_l_4_0_lib4(int kmax, float *A, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns
	const int ncols = kmax + 1; // number of full-height columns

	int left;

	if(ncols<=0)
		return;

	// full-height part
	for(left=ncols; left>0; left--)
		{

		B[0] = A[0];
		B[1] = A[1];
		B[2] = A[2];
		B[3] = A[3];

		A += ps;
		B += ps;

		}

	// trailing triangle, first column: rows 1..3
	B[1] = A[1];
	B[2] = A[2];
	B[3] = A[3];

	// second column: rows 2..3
	B[ps+2] = A[ps+2];
	B[ps+3] = A[ps+3];

	// third column: row 3
	B[2*ps+3] = A[2*ps+3];

	}
644 
645 
646 
647 // both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
// Lower-triangular copy, 4 rows, source shifted by one element.
// Output row r takes source element r+1; indices past 3 are read from the
// matching column located at A0 + 4*sda. Copies kmax+1 full columns (columns
// 4 floats apart), then a trailing triangle spread over the next three
// columns: rows 1..3, rows 2..3 and row 3. Returns without copying anything
// when kmax+1 <= 0.
void kernel_strcp_l_4_1_lib4(int kmax, float *A0, int sda, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns
	const int ncols = kmax + 1; // number of full-height columns

	float *nxt; // second source block, 4*sda floats after A0
	int left;

	if(ncols<=0)
		return;

	nxt = A0 + ps*sda;

	// full-height part
	for(left=ncols; left>0; left--)
		{

		B[0] = A0[1];
		B[1] = A0[2];
		B[2] = A0[3];
		B[3] = nxt[0];

		A0  += ps;
		nxt += ps;
		B   += ps;

		}

	// trailing triangle, first column: rows 1..3
	B[1] = A0[2];
	B[2] = A0[3];
	B[3] = nxt[0];

	// second column: rows 2..3
	B[ps+2] = A0[ps+3];
	B[ps+3] = nxt[ps];

	// third column: row 3
	B[2*ps+3] = nxt[2*ps];

	}
691 
692 
693 
694 // both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
// Lower-triangular copy, 4 rows, source shifted by two elements.
// Output row r takes source element r+2; indices past 3 are read from the
// matching column located at A0 + 4*sda. Copies kmax+1 full columns (columns
// 4 floats apart), then a trailing triangle spread over the next three
// columns: rows 1..3, rows 2..3 and row 3. Returns without copying anything
// when kmax+1 <= 0.
void kernel_strcp_l_4_2_lib4(int kmax, float *A0, int sda, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns
	const int ncols = kmax + 1; // number of full-height columns

	float *nxt; // second source block, 4*sda floats after A0
	int left;

	if(ncols<=0)
		return;

	nxt = A0 + ps*sda;

	// full-height part
	for(left=ncols; left>0; left--)
		{

		B[0] = A0[2];
		B[1] = A0[3];
		B[2] = nxt[0];
		B[3] = nxt[1];

		A0  += ps;
		nxt += ps;
		B   += ps;

		}

	// trailing triangle, first column: rows 1..3
	B[1] = A0[3];
	B[2] = nxt[0];
	B[3] = nxt[1];

	// second column: rows 2..3
	B[ps+2] = nxt[ps];
	B[ps+3] = nxt[ps+1];

	// third column: row 3
	B[2*ps+3] = nxt[2*ps+1];

	}
738 
739 
740 
741 // both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
// Lower-triangular copy, 4 rows, source shifted by three elements.
// Output row r takes source element r+3; indices past 3 are read from the
// matching column located at A0 + 4*sda. Copies kmax+1 full columns (columns
// 4 floats apart), then a trailing triangle spread over the next three
// columns: rows 1..3, rows 2..3 and row 3. Returns without copying anything
// when kmax+1 <= 0.
void kernel_strcp_l_4_3_lib4(int kmax, float *A0, int sda, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns
	const int ncols = kmax + 1; // number of full-height columns

	float *nxt; // second source block, 4*sda floats after A0
	int left;

	if(ncols<=0)
		return;

	nxt = A0 + ps*sda;

	// full-height part
	for(left=ncols; left>0; left--)
		{

		B[0] = A0[3];
		B[1] = nxt[0];
		B[2] = nxt[1];
		B[3] = nxt[2];

		A0  += ps;
		nxt += ps;
		B   += ps;

		}

	// trailing triangle, first column: rows 1..3
	B[1] = nxt[0];
	B[2] = nxt[1];
	B[3] = nxt[2];

	// second column: rows 2..3
	B[ps+2] = nxt[ps+1];
	B[ps+3] = nxt[ps+2];

	// third column: row 3
	B[2*ps+3] = nxt[2*ps+2];

	}
785 
786 
787 
788 // both A and B are aligned to 64-bit boundaries
// Lower-triangular copy, 3 rows, no source shift.
// Copies kmax+1 columns of rows 0..2 (columns 4 floats apart) from A to B,
// then a trailing triangle spread over the next two columns: rows 1..2 and
// row 2. Element 3 of every column is never written. Returns without copying
// anything when kmax+1 <= 0.
void kernel_strcp_l_3_0_lib4(int kmax, float *A, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns
	const int ncols = kmax + 1; // number of full-height columns

	int left;

	if(ncols<=0)
		return;

	// full-height part
	for(left=ncols; left>0; left--)
		{

		B[0] = A[0];
		B[1] = A[1];
		B[2] = A[2];

		A += ps;
		B += ps;

		}

	// trailing triangle, first column: rows 1..2
	B[1] = A[1];
	B[2] = A[2];

	// second column: row 2
	B[ps+2] = A[ps+2];

	}
824 
825 
826 
827 // both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
// Lower-triangular copy, 3 rows, source shifted by two elements.
// Output row r takes source element r+2; indices past 3 are read from the
// matching column located at A0 + 4*sda. Copies kmax+1 columns of rows 0..2
// (columns 4 floats apart), then a trailing triangle spread over the next two
// columns: rows 1..2 and row 2. Element 3 of every output column is never
// written. Returns without copying anything when kmax+1 <= 0.
void kernel_strcp_l_3_2_lib4(int kmax, float *A0, int sda, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns
	const int ncols = kmax + 1; // number of full-height columns

	float *nxt; // second source block, 4*sda floats after A0
	int left;

	if(ncols<=0)
		return;

	nxt = A0 + ps*sda;

	// full-height part
	for(left=ncols; left>0; left--)
		{

		B[0] = A0[2];
		B[1] = A0[3];
		B[2] = nxt[0];

		A0  += ps;
		nxt += ps;
		B   += ps;

		}

	// trailing triangle, first column: rows 1..2
	B[1] = A0[3];
	B[2] = nxt[0];

	// second column: row 2
	B[ps+2] = nxt[ps];

	}
866 
867 
868 
869 // both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
// Lower-triangular copy, 3 rows, source shifted by three elements.
// Output row r takes source element r+3; indices past 3 are read from the
// matching column located at A0 + 4*sda. Copies kmax+1 columns of rows 0..2
// (columns 4 floats apart), then a trailing triangle spread over the next two
// columns: rows 1..2 and row 2. Element 3 of every output column is never
// written. Returns without copying anything when kmax+1 <= 0.
void kernel_strcp_l_3_3_lib4(int kmax, float *A0, int sda, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns
	const int ncols = kmax + 1; // number of full-height columns

	float *nxt; // second source block, 4*sda floats after A0
	int left;

	if(ncols<=0)
		return;

	nxt = A0 + ps*sda;

	// full-height part
	for(left=ncols; left>0; left--)
		{

		B[0] = A0[3];
		B[1] = nxt[0];
		B[2] = nxt[1];

		A0  += ps;
		nxt += ps;
		B   += ps;

		}

	// trailing triangle, first column: rows 1..2
	B[1] = nxt[0];
	B[2] = nxt[1];

	// second column: row 2
	B[ps+2] = nxt[ps+1];

	}
908 
909 
910 
911 // both A and B are aligned to 64-bit boundaries
// Lower-triangular copy, 2 rows, no source shift.
// Copies kmax+1 columns of rows 0..1 (columns 4 floats apart) from A to B,
// then row 1 of the following column. Elements 2..3 of every column are never
// written. Returns without copying anything when kmax+1 <= 0.
void kernel_strcp_l_2_0_lib4(int kmax, float *A, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns
	const int ncols = kmax + 1; // number of full-height columns

	int left;

	if(ncols<=0)
		return;

	// full-height part
	for(left=ncols; left>0; left--)
		{

		B[0] = A[0];
		B[1] = A[1];

		A += ps;
		B += ps;

		}

	// trailing single element: row 1 of the next column
	B[1] = A[1];

	}
943 
944 
945 
946 // both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
// Lower-triangular copy, 2 rows, source shifted by three elements.
// Output row 0 takes A0[3] and output row 1 takes element 0 of the matching
// column located at A0 + 4*sda. Copies kmax+1 such columns (columns 4 floats
// apart), then row 1 of the following column. Elements 2..3 of every output
// column are never written. Returns without copying anything when
// kmax+1 <= 0.
void kernel_strcp_l_2_3_lib4(int kmax, float *A0, int sda, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns
	const int ncols = kmax + 1; // number of full-height columns

	float *nxt; // second source block, 4*sda floats after A0
	int left;

	if(ncols<=0)
		return;

	nxt = A0 + ps*sda;

	// full-height part
	for(left=ncols; left>0; left--)
		{

		B[0] = A0[3];
		B[1] = nxt[0];

		A0  += ps;
		nxt += ps;
		B   += ps;

		}

	// trailing single element: row 1 of the next column
	B[1] = nxt[0];

	}
981 
982 
983 
// both A and B are aligned to 64-bit boundaries
// Lower-triangular copy, 1 row, no source shift.
// Copies element 0 of kmax+1 columns (columns 4 floats apart) from A to B;
// there is no trailing triangle for a single row. Elements 1..3 of every
// column are never written. Returns without copying anything when
// kmax+1 <= 0.
void kernel_strcp_l_1_0_lib4(int kmax, float *A, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns
	const int ncols = kmax + 1; // number of columns to copy

	int left;

	if(ncols<=0)
		return;

	for(left=ncols; left>0; left--)
		{

		B[0] = A[0];

		A += ps;
		B += ps;

		}

	}
1011 
1012 
1013 // --- add
1014 
1015 // both A and B are aligned to 256-bit boundaries
// Scaled accumulate of 4 rows: for each of the kmax columns (stored 4 floats
// apart) performs B[i] += alphap[0] * A[i], i = 0..3. Nothing is read or
// written when kmax <= 0.
void kernel_sgead_4_0_lib4(int kmax, float *alphap, float *A, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns

	float scale;
	int left;

	if(kmax<=0)
		return;

	scale = alphap[0];

	for(left=kmax; left>0; left--)
		{

		B[0] += scale * A[0];
		B[1] += scale * A[1];
		B[2] += scale * A[2];
		B[3] += scale * A[3];

		A += ps;
		B += ps;

		}

	}
1042 
1043 
1044 
1045 // both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
// Scaled accumulate of 4 rows with the source shifted by one element: each
// output column gets alphap[0] times A0[1..3] added to rows 0..2 and
// alphap[0] times element 0 of the matching column at A0 + 4*sda added to
// row 3. Processes kmax columns, 4 floats apart; no-op when kmax <= 0.
void kernel_sgead_4_1_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns

	float *nxt; // second source block, 4*sda floats after A0
	float scale;
	int left;

	if(kmax<=0)
		return;

	scale = alphap[0];
	nxt = A0 + ps*sda;

	for(left=kmax; left>0; left--)
		{

		B[0] += scale * A0[1];
		B[1] += scale * A0[2];
		B[2] += scale * A0[3];
		B[3] += scale * nxt[0];

		A0  += ps;
		nxt += ps;
		B   += ps;

		}

	}
1075 
1076 
1077 
1078 // both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
// Scaled accumulate of 4 rows with the source shifted by two elements: each
// output column gets alphap[0] times A0[2..3] added to rows 0..1 and
// alphap[0] times elements 0..1 of the matching column at A0 + 4*sda added to
// rows 2..3. Processes kmax columns, 4 floats apart; no-op when kmax <= 0.
void kernel_sgead_4_2_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns

	float *nxt; // second source block, 4*sda floats after A0
	float scale;
	int left;

	if(kmax<=0)
		return;

	scale = alphap[0];
	nxt = A0 + ps*sda;

	for(left=kmax; left>0; left--)
		{

		B[0] += scale * A0[2];
		B[1] += scale * A0[3];
		B[2] += scale * nxt[0];
		B[3] += scale * nxt[1];

		A0  += ps;
		nxt += ps;
		B   += ps;

		}

	}
1108 
1109 
1110 
1111 // both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
// Scaled accumulate of 4 rows with the source shifted by three elements: each
// output column gets alphap[0] times A0[3] added to row 0 and alphap[0] times
// elements 0..2 of the matching column at A0 + 4*sda added to rows 1..3.
// Processes kmax columns, 4 floats apart; no-op when kmax <= 0.
void kernel_sgead_4_3_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns

	float *nxt; // second source block, 4*sda floats after A0
	float scale;
	int left;

	if(kmax<=0)
		return;

	scale = alphap[0];
	nxt = A0 + ps*sda;

	for(left=kmax; left>0; left--)
		{

		B[0] += scale * A0[3];
		B[1] += scale * nxt[0];
		B[2] += scale * nxt[1];
		B[3] += scale * nxt[2];

		A0  += ps;
		nxt += ps;
		B   += ps;

		}

	}
1141 
1142 
1143 
1144 // both A and B are aligned to 64-bit boundaries
// Scaled accumulate of 3 rows: for each of the kmax columns (stored 4 floats
// apart) performs B[i] += alphap[0] * A[i], i = 0..2; element 3 of every
// column is left alone. Nothing is read or written when kmax <= 0.
void kernel_sgead_3_0_lib4(int kmax, float *alphap, float *A, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns

	float scale;
	int left;

	if(kmax<=0)
		return;

	scale = alphap[0];

	for(left=kmax; left>0; left--)
		{

		B[0] += scale * A[0];
		B[1] += scale * A[1];
		B[2] += scale * A[2];

		A += ps;
		B += ps;

		}

	}
1170 
1171 
1172 
1173 // both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
// Scaled accumulate of 3 rows with the source shifted by two elements: each
// output column gets alphap[0] times A0[2..3] added to rows 0..1 and
// alphap[0] times element 0 of the matching column at A0 + 4*sda added to
// row 2; element 3 of every output column is left alone. Processes kmax
// columns, 4 floats apart; no-op when kmax <= 0.
void kernel_sgead_3_2_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns

	float *nxt; // second source block, 4*sda floats after A0
	float scale;
	int left;

	if(kmax<=0)
		return;

	scale = alphap[0];
	nxt = A0 + ps*sda;

	for(left=kmax; left>0; left--)
		{

		B[0] += scale * A0[2];
		B[1] += scale * A0[3];
		B[2] += scale * nxt[0];

		A0  += ps;
		nxt += ps;
		B   += ps;

		}

	}
1202 
1203 
1204 
1205 // both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
// Scaled accumulate of 3 rows with the source shifted by three elements: each
// output column gets alphap[0] times A0[3] added to row 0 and alphap[0] times
// elements 0..1 of the matching column at A0 + 4*sda added to rows 1..2;
// element 3 of every output column is left alone. Processes kmax columns,
// 4 floats apart; no-op when kmax <= 0.
void kernel_sgead_3_3_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns

	float *nxt; // second source block, 4*sda floats after A0
	float scale;
	int left;

	if(kmax<=0)
		return;

	scale = alphap[0];
	nxt = A0 + ps*sda;

	for(left=kmax; left>0; left--)
		{

		B[0] += scale * A0[3];
		B[1] += scale * nxt[0];
		B[2] += scale * nxt[1];

		A0  += ps;
		nxt += ps;
		B   += ps;

		}

	}
1234 
1235 
1236 
1237 // both A and B are aligned to 64-bit boundaries
// Scaled accumulate of 2 rows: for each of the kmax columns (stored 4 floats
// apart) performs B[i] += alphap[0] * A[i], i = 0..1; elements 2..3 of every
// column are left alone. Nothing is read or written when kmax <= 0.
void kernel_sgead_2_0_lib4(int kmax, float *alphap, float *A, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns

	float scale;
	int left;

	if(kmax<=0)
		return;

	scale = alphap[0];

	for(left=kmax; left>0; left--)
		{

		B[0] += scale * A[0];
		B[1] += scale * A[1];

		A += ps;
		B += ps;

		}

	}
1262 
1263 
1264 
1265 // both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
// Scaled accumulate of 2 rows with the source shifted by three elements: each
// output column gets alphap[0] times A0[3] added to row 0 and alphap[0] times
// element 0 of the matching column at A0 + 4*sda added to row 1; elements
// 2..3 of every output column are left alone. Processes kmax columns,
// 4 floats apart; no-op when kmax <= 0.
void kernel_sgead_2_3_lib4(int kmax, float *alphap, float *A0, int sda, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns

	float *nxt; // second source block, 4*sda floats after A0
	float scale;
	int left;

	if(kmax<=0)
		return;

	scale = alphap[0];
	nxt = A0 + ps*sda;

	for(left=kmax; left>0; left--)
		{

		B[0] += scale * A0[3];
		B[1] += scale * nxt[0];

		A0  += ps;
		nxt += ps;
		B   += ps;

		}

	}
1293 
1294 
1295 
// both A and B are aligned to 64-bit boundaries
// Scaled accumulate of a single row: for each of the kmax columns (stored
// 4 floats apart) performs B[0] += alphap[0] * A[0]; elements 1..3 of every
// column are left alone. Nothing is read or written when kmax <= 0.
void kernel_sgead_1_0_lib4(int kmax, float *alphap, float *A, float *B)
	{

	const int ps = 4; // distance in floats between consecutive columns

	float scale;
	int left;

	if(kmax<=0)
		return;

	scale = alphap[0];

	for(left=kmax; left>0; left--)
		{

		B[0] += scale * A[0];

		A += ps;
		B += ps;

		}

	}
1320 
1321 
1322 
1323 
1324 
1325