1 /**************************************************************************************************
2 *                                                                                                 *
3 * This file is part of BLASFEO.                                                                   *
4 *                                                                                                 *
5 * BLASFEO -- BLAS For Embedded Optimization.                                                      *
6 * Copyright (C) 2019 by Gianluca Frison.                                                          *
7 * Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
8 * All rights reserved.                                                                            *
9 *                                                                                                 *
10 * The 2-Clause BSD License                                                                        *
11 *                                                                                                 *
12 * Redistribution and use in source and binary forms, with or without                              *
13 * modification, are permitted provided that the following conditions are met:                     *
14 *                                                                                                 *
15 * 1. Redistributions of source code must retain the above copyright notice, this                  *
16 *    list of conditions and the following disclaimer.                                             *
17 * 2. Redistributions in binary form must reproduce the above copyright notice,                    *
18 *    this list of conditions and the following disclaimer in the documentation                    *
19 *    and/or other materials provided with the distribution.                                       *
20 *                                                                                                 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND                 *
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED                   *
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE                          *
24 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR                 *
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES                  *
26 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;                    *
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND                     *
28 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT                      *
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS                   *
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                                    *
31 *                                                                                                 *
32 * Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de                             *
33 *                                                                                                 *
34 **************************************************************************************************/
35 
36 
37 
38 
39 
40 /*
41  * Copy only
42  */
43 
44 
45 
46 
47 
48 // both A and B are aligned to 256-bit boundaries
kernel_dgecp_4_0_lib4(int tri,int kmax,double * A,double * B)49 void kernel_dgecp_4_0_lib4(int tri, int kmax, double *A, double *B)
50 	{
51 
52 	if(tri==1)
53 		{
54 		// A and C are lower triangular
55 		// kmax+1 4-wide + end 3x3 triangle
56 
57 		kmax += 1;
58 		}
59 
60 	if(kmax<=0)
61 		return;
62 
63 	const int bs = 4;
64 
65 	int k;
66 
67 	for(k=0; k<kmax-3; k+=4)
68 		{
69 		B[0+bs*0] = A[0+bs*0];
70 		B[1+bs*0] = A[1+bs*0];
71 		B[2+bs*0] = A[2+bs*0];
72 		B[3+bs*0] = A[3+bs*0];
73 
74 		B[0+bs*1] = A[0+bs*1];
75 		B[1+bs*1] = A[1+bs*1];
76 		B[2+bs*1] = A[2+bs*1];
77 		B[3+bs*1] = A[3+bs*1];
78 
79 		B[0+bs*2] = A[0+bs*2];
80 		B[1+bs*2] = A[1+bs*2];
81 		B[2+bs*2] = A[2+bs*2];
82 		B[3+bs*2] = A[3+bs*2];
83 
84 		B[0+bs*3] = A[0+bs*3];
85 		B[1+bs*3] = A[1+bs*3];
86 		B[2+bs*3] = A[2+bs*3];
87 		B[3+bs*3] = A[3+bs*3];
88 
89 		A += 16;
90 		B += 16;
91 
92 		}
93 	for(; k<kmax; k++)
94 		{
95 
96 		B[0+bs*0] = A[0+bs*0];
97 		B[1+bs*0] = A[1+bs*0];
98 		B[2+bs*0] = A[2+bs*0];
99 		B[3+bs*0] = A[3+bs*0];
100 
101 		A += 4;
102 		B += 4;
103 
104 		}
105 
106 	if(tri==1)
107 		{
108 		// 3x3 triangle
109 
110 		B[1+bs*0] = A[1+bs*0];
111 		B[2+bs*0] = A[2+bs*0];
112 		B[3+bs*0] = A[3+bs*0];
113 
114 		B[2+bs*1] = A[2+bs*1];
115 		B[3+bs*1] = A[3+bs*1];
116 
117 		B[3+bs*2] = A[3+bs*2];
118 
119 		}
120 
121 	}
122 
123 
124 
125 
126 
127 
128 /*
129  * Copy and Scale
130  *
131  * Used by: dge dtr
132  */
133 
134 
135 
136 
137 
138 // both.o A and B are aligned to 256-bit boundaries
kernel_dgecpsc_4_0_lib4(int tri,int kmax,double alpha,double * A,double * B)139 void kernel_dgecpsc_4_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
140 	{
141 
142 	if(tri==1)
143 		{
144 		// A and C are lower triangular
145 		// kmax+1 4-wide + end 3x3 triangle
146 
147 		kmax += 1;
148 		}
149 
150 	if(kmax<=0)
151 		return;
152 
153 	const int bs = 4;
154 
155 	int k;
156 
157 	for(k=0; k<kmax-3; k+=4)
158 		{
159 		B[0+bs*0] = alpha*A[0+bs*0];
160 		B[1+bs*0] = alpha*A[1+bs*0];
161 		B[2+bs*0] = alpha*A[2+bs*0];
162 		B[3+bs*0] = alpha*A[3+bs*0];
163 
164 		B[0+bs*1] = alpha*A[0+bs*1];
165 		B[1+bs*1] = alpha*A[1+bs*1];
166 		B[2+bs*1] = alpha*A[2+bs*1];
167 		B[3+bs*1] = alpha*A[3+bs*1];
168 
169 		B[0+bs*2] = alpha*A[0+bs*2];
170 		B[1+bs*2] = alpha*A[1+bs*2];
171 		B[2+bs*2] = alpha*A[2+bs*2];
172 		B[3+bs*2] = alpha*A[3+bs*2];
173 
174 		B[0+bs*3] = alpha*A[0+bs*3];
175 		B[1+bs*3] = alpha*A[1+bs*3];
176 		B[2+bs*3] = alpha*A[2+bs*3];
177 		B[3+bs*3] = alpha*A[3+bs*3];
178 
179 		A += 16;
180 		B += 16;
181 
182 		}
183 	for(; k<kmax; k++)
184 		{
185 
186 		B[0+bs*0] = alpha*A[0+bs*0];
187 		B[1+bs*0] = alpha*A[1+bs*0];
188 		B[2+bs*0] = alpha*A[2+bs*0];
189 		B[3+bs*0] = alpha*A[3+bs*0];
190 
191 		A += 4;
192 		B += 4;
193 
194 		}
195 
196 	if(tri==1)
197 		{
198 		// 3x3 triangle
199 
200 		B[1+bs*0] = alpha*A[1+bs*0];
201 		B[2+bs*0] = alpha*A[2+bs*0];
202 		B[3+bs*0] = alpha*A[3+bs*0];
203 
204 		B[2+bs*1] = alpha*A[2+bs*1];
205 		B[3+bs*1] = alpha*A[3+bs*1];
206 
207 		B[3+bs*2] = alpha*A[3+bs*2];
208 
209 		}
210 
211 	}
212 
213 
214 
215 // both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
kernel_dgecpsc_4_1_lib4(int tri,int kmax,double alpha,double * A0,int sda,double * B)216 void kernel_dgecpsc_4_1_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
217 	{
218 
219 	if(tri==1)
220 		{
221 		// A and C are lower triangular
222 		// kmax+1 4-wide + end 3x3 triangle
223 
224 		kmax += 1;
225 		}
226 
227 	if(kmax<=0)
228 		return;
229 
230 	const int bs = 4;
231 
232 	double *A1 = A0 + bs*sda;
233 
234 	int k;
235 
236 	for(k=0; k<kmax-3; k+=4)
237 		{
238 
239 		B[0+bs*0] = alpha*A0[1+bs*0];
240 		B[1+bs*0] = alpha*A0[2+bs*0];
241 		B[2+bs*0] = alpha*A0[3+bs*0];
242 		B[3+bs*0] = alpha*A1[0+bs*0];
243 
244 		B[0+bs*1] = alpha*A0[1+bs*1];
245 		B[1+bs*1] = alpha*A0[2+bs*1];
246 		B[2+bs*1] = alpha*A0[3+bs*1];
247 		B[3+bs*1] = alpha*A1[0+bs*1];
248 
249 		B[0+bs*2] = alpha*A0[1+bs*2];
250 		B[1+bs*2] = alpha*A0[2+bs*2];
251 		B[2+bs*2] = alpha*A0[3+bs*2];
252 		B[3+bs*2] = alpha*A1[0+bs*2];
253 
254 		B[0+bs*3] = alpha*A0[1+bs*3];
255 		B[1+bs*3] = alpha*A0[2+bs*3];
256 		B[2+bs*3] = alpha*A0[3+bs*3];
257 		B[3+bs*3] = alpha*A1[0+bs*3];
258 
259 		A0 += 16;
260 		A1 += 16;
261 		B  += 16;
262 
263 		}
264 	for(; k<kmax; k++)
265 		{
266 
267 		B[0+bs*0] = alpha*A0[1+bs*0];
268 		B[1+bs*0] = alpha*A0[2+bs*0];
269 		B[2+bs*0] = alpha*A0[3+bs*0];
270 		B[3+bs*0] = alpha*A1[0+bs*0];
271 
272 		A0 += 4;
273 		A1 += 4;
274 		B  += 4;
275 
276 		}
277 
278 	if(tri==1)
279 		{
280 		// 3x3 triangle
281 
282 		B[1+0*bs] = alpha*A0[2+0*bs];
283 		B[2+0*bs] = alpha*A0[3+0*bs];
284 		B[3+0*bs] = alpha*A1[0+0*bs];
285 
286 		B[2+1*bs] = alpha*A0[3+1*bs];
287 		B[3+1*bs] = alpha*A1[0+1*bs];
288 
289 		B[3+2*bs] = alpha*A1[0+2*bs];
290 
291 		}
292 
293 	}
294 
295 
296 
297 // both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
kernel_dgecpsc_4_2_lib4(int tri,int kmax,double alpha,double * A0,int sda,double * B)298 void kernel_dgecpsc_4_2_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
299 	{
300 
301 	if(tri==1)
302 		{
303 		// A and C are lower triangular
304 		// kmax+1 4-wide + end 3x3 triangle
305 
306 		kmax += 1;
307 		}
308 
309 	if(kmax<=0)
310 		return;
311 
312 	const int bs = 4;
313 
314 	double *A1 = A0 + bs*sda;
315 
316 	int k;
317 
318 	for(k=0; k<kmax-3; k+=4)
319 		{
320 
321 		B[0+bs*0] = alpha*A0[2+bs*0];
322 		B[1+bs*0] = alpha*A0[3+bs*0];
323 		B[2+bs*0] = alpha*A1[0+bs*0];
324 		B[3+bs*0] = alpha*A1[1+bs*0];
325 
326 		B[0+bs*1] = alpha*A0[2+bs*1];
327 		B[1+bs*1] = alpha*A0[3+bs*1];
328 		B[2+bs*1] = alpha*A1[0+bs*1];
329 		B[3+bs*1] = alpha*A1[1+bs*1];
330 
331 		B[0+bs*2] = alpha*A0[2+bs*2];
332 		B[1+bs*2] = alpha*A0[3+bs*2];
333 		B[2+bs*2] = alpha*A1[0+bs*2];
334 		B[3+bs*2] = alpha*A1[1+bs*2];
335 
336 		B[0+bs*3] = alpha*A0[2+bs*3];
337 		B[1+bs*3] = alpha*A0[3+bs*3];
338 		B[2+bs*3] = alpha*A1[0+bs*3];
339 		B[3+bs*3] = alpha*A1[1+bs*3];
340 
341 		A0 += 16;
342 		A1 += 16;
343 		B  += 16;
344 
345 		}
346 	for(; k<kmax; k++)
347 		{
348 
349 		B[0+bs*0] = alpha*A0[2+bs*0];
350 		B[1+bs*0] = alpha*A0[3+bs*0];
351 		B[2+bs*0] = alpha*A1[0+bs*0];
352 		B[3+bs*0] = alpha*A1[1+bs*0];
353 
354 		A0 += 4;
355 		A1 += 4;
356 		B  += 4;
357 
358 		}
359 
360 	if(tri==1)
361 		{
362 		// 3x3 triangle}
363 
364 		B[1+bs*0] = alpha*A0[3+bs*0];
365 		B[2+bs*0] = alpha*A1[0+bs*0];
366 		B[3+bs*0] = alpha*A1[1+bs*0];
367 
368 		B[2+bs*1] = alpha*A1[0+bs*1];
369 		B[3+bs*1] = alpha*A1[1+bs*1];
370 
371 		B[3+bs*2] = alpha*A1[1+bs*2];
372 
373 		}
374 
375 	}
376 
377 
378 
379 // both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
kernel_dgecpsc_4_3_lib4(int tri,int kmax,double alpha,double * A0,int sda,double * B)380 void kernel_dgecpsc_4_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
381 	{
382 
383 	if(tri==1)
384 		{
385 		// A and C are lower triangular
386 		// kmax+1 4-wide + end 3x3 triangle
387 
388 		kmax += 1;
389 		}
390 
391 	if(kmax<=0)
392 		return;
393 
394 	const int bs = 4;
395 
396 	double *A1 = A0 + bs*sda;
397 
398 	int k;
399 
400 	for(k=0; k<kmax-3; k+=4)
401 		{
402 
403 		B[0+bs*0] = alpha*A0[3+bs*0];
404 		B[1+bs*0] = alpha*A1[0+bs*0];
405 		B[2+bs*0] = alpha*A1[1+bs*0];
406 		B[3+bs*0] = alpha*A1[2+bs*0];
407 
408 		B[0+bs*1] = alpha*A0[3+bs*1];
409 		B[1+bs*1] = alpha*A1[0+bs*1];
410 		B[2+bs*1] = alpha*A1[1+bs*1];
411 		B[3+bs*1] = alpha*A1[2+bs*1];
412 
413 		B[0+bs*2] = alpha*A0[3+bs*2];
414 		B[1+bs*2] = alpha*A1[0+bs*2];
415 		B[2+bs*2] = alpha*A1[1+bs*2];
416 		B[3+bs*2] = alpha*A1[2+bs*2];
417 
418 		B[0+bs*3] = alpha*A0[3+bs*3];
419 		B[1+bs*3] = alpha*A1[0+bs*3];
420 		B[2+bs*3] = alpha*A1[1+bs*3];
421 		B[3+bs*3] = alpha*A1[2+bs*3];
422 
423 		A0 += 16;
424 		A1 += 16;
425 		B  += 16;
426 
427 		}
428 	for(; k<kmax; k++)
429 		{
430 
431 		B[0+bs*0] = alpha*A0[3+bs*0];
432 		B[1+bs*0] = alpha*A1[0+bs*0];
433 		B[2+bs*0] = alpha*A1[1+bs*0];
434 		B[3+bs*0] = alpha*A1[2+bs*0];
435 
436 		A0 += 4;
437 		A1 += 4;
438 		B  += 4;
439 
440 		}
441 
442 	if(tri==1)
443 		{
444 		// 3x3 triangle
445 
446 		B[1+bs*0] = alpha*A1[0+bs*0];
447 		B[2+bs*0] = alpha*A1[1+bs*0];
448 		B[3+bs*0] = alpha*A1[2+bs*0];
449 
450 		B[2+bs*1] = alpha*A1[1+bs*1];
451 		B[3+bs*1] = alpha*A1[2+bs*1];
452 
453 		B[3+bs*2] = alpha*A1[2+bs*2];
454 
455 		}
456 
457 	}
458 
459 
460 
461 // both A and B are aligned to 64-bit boundaries
kernel_dgecpsc_3_0_lib4(int tri,int kmax,double alpha,double * A,double * B)462 void kernel_dgecpsc_3_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
463 	{
464 
465 	if(tri==1)
466 		{
467 		// A and C are lower triangular
468 		// kmax+1 3-wide + end 2x2 triangle
469 
470 		kmax += 1;
471 		}
472 
473 	if(kmax<=0)
474 		return;
475 
476 	const int bs = 4;
477 
478 	int k;
479 
480 	for(k=0; k<kmax-3; k+=4)
481 		{
482 		B[0+bs*0] = alpha*A[0+bs*0];
483 		B[1+bs*0] = alpha*A[1+bs*0];
484 		B[2+bs*0] = alpha*A[2+bs*0];
485 
486 		B[0+bs*1] = alpha*A[0+bs*1];
487 		B[1+bs*1] = alpha*A[1+bs*1];
488 		B[2+bs*1] = alpha*A[2+bs*1];
489 
490 		B[0+bs*2] = alpha*A[0+bs*2];
491 		B[1+bs*2] = alpha*A[1+bs*2];
492 		B[2+bs*2] = alpha*A[2+bs*2];
493 
494 		B[0+bs*3] = alpha*A[0+bs*3];
495 		B[1+bs*3] = alpha*A[1+bs*3];
496 		B[2+bs*3] = alpha*A[2+bs*3];
497 
498 		A += 16;
499 		B += 16;
500 
501 		}
502 	for(; k<kmax; k++)
503 		{
504 
505 		B[0+bs*0] = alpha*A[0+bs*0];
506 		B[1+bs*0] = alpha*A[1+bs*0];
507 		B[2+bs*0] = alpha*A[2+bs*0];
508 
509 		A += 4;
510 		B += 4;
511 
512 		}
513 
514 	if(tri==1)
515 		{
516 		// 2x2 triangle
517 
518 		B[1+bs*0] = alpha*A[1+bs*0];
519 		B[2+bs*0] = alpha*A[2+bs*0];
520 
521 		B[2+bs*1] = alpha*A[2+bs*1];
522 
523 		}
524 
525 	}
526 
527 
528 // both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
kernel_dgecpsc_3_2_lib4(int tri,int kmax,double alpha,double * A0,int sda,double * B)529 void kernel_dgecpsc_3_2_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
530 	{
531 
532 	if(tri==1)
533 		{
534 		// A and C are lower triangular
535 		// kmax+1 3-wide + end 2x2 triangle
536 
537 		kmax += 1;
538 		}
539 
540 	if(kmax<=0)
541 		return;
542 
543 	const int bs = 4;
544 
545 	double *A1 = A0 + bs*sda;
546 
547 	int k;
548 
549 	for(k=0; k<kmax-3; k+=4)
550 		{
551 
552 		B[0+bs*0] = alpha*A0[2+bs*0];
553 		B[1+bs*0] = alpha*A0[3+bs*0];
554 		B[2+bs*0] = alpha*A1[0+bs*0];
555 
556 		B[0+bs*1] = alpha*A0[2+bs*1];
557 		B[1+bs*1] = alpha*A0[3+bs*1];
558 		B[2+bs*1] = alpha*A1[0+bs*1];
559 
560 		B[0+bs*2] = alpha*A0[2+bs*2];
561 		B[1+bs*2] = alpha*A0[3+bs*2];
562 		B[2+bs*2] = alpha*A1[0+bs*2];
563 
564 		B[0+bs*3] = alpha*A0[2+bs*3];
565 		B[1+bs*3] = alpha*A0[3+bs*3];
566 		B[2+bs*3] = alpha*A1[0+bs*3];
567 
568 		A0 += 16;
569 		A1 += 16;
570 		B  += 16;
571 
572 		}
573 	for(; k<kmax; k++)
574 		{
575 
576 		B[0+bs*0] = alpha*A0[2+bs*0];
577 		B[1+bs*0] = alpha*A0[3+bs*0];
578 		B[2+bs*0] = alpha*A1[0+bs*0];
579 
580 		A0 += 4;
581 		A1 += 4;
582 		B  += 4;
583 
584 		}
585 
586 	if(tri==1)
587 		{
588 		// 2x2 triangle
589 
590 		B[1+bs*0] = alpha*A0[3+bs*0];
591 		B[2+bs*0] = alpha*A1[0+bs*0];
592 
593 		B[2+bs*1] = alpha*A1[0+bs*1];
594 
595 		}
596 
597 	}
598 
599 
600 
601 // both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
kernel_dgecpsc_3_3_lib4(int tri,int kmax,double alpha,double * A0,int sda,double * B)602 void kernel_dgecpsc_3_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
603 	{
604 
605 	if(tri==1)
606 		{
607 		// A and C are lower triangular
608 		// kmax+1 3-wide + end 2x2 triangle
609 
610 		kmax += 1;
611 		}
612 
613 	if(kmax<=0)
614 		return;
615 
616 	const int bs = 4;
617 
618 	double *A1 = A0 + bs*sda;
619 
620 	int k;
621 
622 	for(k=0; k<kmax-3; k+=4)
623 		{
624 
625 		B[0+bs*0] = alpha*A0[3+bs*0];
626 		B[1+bs*0] = alpha*A1[0+bs*0];
627 		B[2+bs*0] = alpha*A1[1+bs*0];
628 
629 		B[0+bs*1] = alpha*A0[3+bs*1];
630 		B[1+bs*1] = alpha*A1[0+bs*1];
631 		B[2+bs*1] = alpha*A1[1+bs*1];
632 
633 		B[0+bs*2] = alpha*A0[3+bs*2];
634 		B[1+bs*2] = alpha*A1[0+bs*2];
635 		B[2+bs*2] = alpha*A1[1+bs*2];
636 
637 		B[0+bs*3] = alpha*A0[3+bs*3];
638 		B[1+bs*3] = alpha*A1[0+bs*3];
639 		B[2+bs*3] = alpha*A1[1+bs*3];
640 
641 		A0 += 16;
642 		A1 += 16;
643 		B  += 16;
644 
645 		}
646 	for(; k<kmax; k++)
647 		{
648 
649 		B[0+bs*0] = alpha*A0[3+bs*0];
650 		B[1+bs*0] = alpha*A1[0+bs*0];
651 		B[2+bs*0] = alpha*A1[1+bs*0];
652 
653 		A0 += 4;
654 		A1 += 4;
655 		B  += 4;
656 
657 		}
658 
659 	if(tri==1)
660 		{
661 		// 2x2 triangle
662 
663 		B[1+bs*0] = alpha*A1[0+bs*0];
664 		B[2+bs*0] = alpha*A1[1+bs*0];
665 
666 		B[2+bs*1] = alpha*A1[1+bs*1];
667 
668 		}
669 
670 	}
671 
672 
673 
674 // both A and B are aligned to 64-bit boundaries
kernel_dgecpsc_2_0_lib4(int tri,int kmax,double alpha,double * A,double * B)675 void kernel_dgecpsc_2_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
676 	{
677 
678 	if(tri==1)
679 		{
680 		// A and C are lower triangular
681 		// kmax+1 2-wide + end 1x1 triangle
682 
683 		kmax += 1;
684 		}
685 
686 	if(kmax<=0)
687 		return;
688 
689 	const int bs = 4;
690 
691 	int k;
692 
693 	for(k=0; k<kmax-3; k+=4)
694 		{
695 		B[0+bs*0] = alpha*A[0+bs*0];
696 		B[1+bs*0] = alpha*A[1+bs*0];
697 
698 		B[0+bs*1] = alpha*A[0+bs*1];
699 		B[1+bs*1] = alpha*A[1+bs*1];
700 
701 		B[0+bs*2] = alpha*A[0+bs*2];
702 		B[1+bs*2] = alpha*A[1+bs*2];
703 
704 		B[0+bs*3] = alpha*A[0+bs*3];
705 		B[1+bs*3] = alpha*A[1+bs*3];
706 
707 		A += 16;
708 		B += 16;
709 
710 		}
711 	for(; k<kmax; k++)
712 		{
713 
714 		B[0+bs*0] = alpha*A[0+bs*0];
715 		B[1+bs*0] = alpha*A[1+bs*0];
716 
717 		A += 4;
718 		B += 4;
719 
720 		}
721 
722 	if(tri==1)
723 		{
724 		// 1x1 triangle
725 
726 		B[1+bs*0] = alpha*A[1+bs*0];
727 
728 		}
729 
730 	}
731 
732 
733 
734 // both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
kernel_dgecpsc_2_3_lib4(int tri,int kmax,double alpha,double * A0,int sda,double * B)735 void kernel_dgecpsc_2_3_lib4(int tri, int kmax, double alpha, double *A0, int sda, double *B)
736 	{
737 
738 	if(tri==1)
739 		{
740 		// A and C are lower triangular
741 		// kmax+1 2-wide + end 1x1 triangle
742 
743 		kmax += 1;
744 		}
745 
746 	if(kmax<=0)
747 		return;
748 
749 	const int bs = 4;
750 
751 	double *A1 = A0 + bs*sda;
752 
753 	int k;
754 
755 	for(k=0; k<kmax-3; k+=4)
756 		{
757 
758 		B[0+bs*0] = alpha*A0[3+bs*0];
759 		B[1+bs*0] = alpha*A1[0+bs*0];
760 
761 		B[0+bs*1] = alpha*A0[3+bs*1];
762 		B[1+bs*1] = alpha*A1[0+bs*1];
763 
764 		B[0+bs*2] = alpha*A0[3+bs*2];
765 		B[1+bs*2] = alpha*A1[0+bs*2];
766 
767 		B[0+bs*3] = alpha*A0[3+bs*3];
768 		B[1+bs*3] = alpha*A1[0+bs*3];
769 
770 		A0 += 16;
771 		A1 += 16;
772 		B  += 16;
773 
774 		}
775 	for(; k<kmax; k++)
776 		{
777 
778 		B[0+bs*0] = alpha*A0[3+bs*0];
779 		B[1+bs*0] = alpha*A1[0+bs*0];
780 
781 		A0 += 4;
782 		A1 += 4;
783 		B  += 4;
784 
785 		}
786 
787 	if(tri==1)
788 		{
789 		// 1x1 triangle
790 
791 		B[1+bs*0] = alpha*A1[0+bs*0];
792 
793 		}
794 
795 	}
796 
797 
798 
799 // both A and B are aligned 64-bit boundaries
kernel_dgecpsc_1_0_lib4(int tri,int kmax,double alpha,double * A,double * B)800 void kernel_dgecpsc_1_0_lib4(int tri, int kmax, double alpha, double *A, double *B)
801 	{
802 
803 	if(tri==1)
804 		{
805 		// A and C are lower triangular
806 		// kmax+1 1-wide
807 
808 		kmax += 1;
809 		}
810 
811 	if(kmax<=0)
812 		return;
813 
814 	const int bs = 4;
815 
816 	int k;
817 
818 	for(k=0; k<kmax-3; k+=4)
819 		{
820 		B[0+bs*0] = alpha*A[0+bs*0];
821 		B[0+bs*1] = alpha*A[0+bs*1];
822 		B[0+bs*2] = alpha*A[0+bs*2];
823 		B[0+bs*3] = alpha*A[0+bs*3];
824 
825 		A += 16;
826 		B += 16;
827 
828 		}
829 	for(; k<kmax; k++)
830 		{
831 
832 		B[0+bs*0] = alpha*A[0+bs*0];
833 
834 		A += 4;
835 		B += 4;
836 
837 		}
838 
839 	}
840 
841 
842 
843 
844 
845 /*
846  * Add and scale
847  *
848  */
849 
850 
851 
852 
853 // both A and B are aligned to 256-bit boundaries
kernel_dgead_4_0_lib4(int kmax,double alpha,double * A,double * B)854 void kernel_dgead_4_0_lib4(int kmax, double alpha, double *A, double *B)
855 	{
856 
857 	if(kmax<=0)
858 		return;
859 
860 	const int bs = 4;
861 
862 	int k;
863 
864 	for(k=0; k<kmax-3; k+=4)
865 		{
866 		B[0+bs*0] += alpha * A[0+bs*0];
867 		B[1+bs*0] += alpha * A[1+bs*0];
868 		B[2+bs*0] += alpha * A[2+bs*0];
869 		B[3+bs*0] += alpha * A[3+bs*0];
870 
871 		B[0+bs*1] += alpha * A[0+bs*1];
872 		B[1+bs*1] += alpha * A[1+bs*1];
873 		B[2+bs*1] += alpha * A[2+bs*1];
874 		B[3+bs*1] += alpha * A[3+bs*1];
875 
876 		B[0+bs*2] += alpha * A[0+bs*2];
877 		B[1+bs*2] += alpha * A[1+bs*2];
878 		B[2+bs*2] += alpha * A[2+bs*2];
879 		B[3+bs*2] += alpha * A[3+bs*2];
880 
881 		B[0+bs*3] += alpha * A[0+bs*3];
882 		B[1+bs*3] += alpha * A[1+bs*3];
883 		B[2+bs*3] += alpha * A[2+bs*3];
884 		B[3+bs*3] += alpha * A[3+bs*3];
885 
886 		A += 16;
887 		B += 16;
888 
889 		}
890 	for(; k<kmax; k++)
891 		{
892 
893 		B[0+bs*0] += alpha * A[0+bs*0];
894 		B[1+bs*0] += alpha * A[1+bs*0];
895 		B[2+bs*0] += alpha * A[2+bs*0];
896 		B[3+bs*0] += alpha * A[3+bs*0];
897 
898 		A += 4;
899 		B += 4;
900 
901 		}
902 
903 	}
904 
905 
906 
907 // both A and B are aligned to 256-bit boundaries, 1 element of A must be skipped
kernel_dgead_4_1_lib4(int kmax,double alpha,double * A0,int sda,double * B)908 void kernel_dgead_4_1_lib4(int kmax, double alpha, double *A0, int sda, double *B)
909 	{
910 
911 	if(kmax<=0)
912 		return;
913 
914 	const int bs = 4;
915 
916 	double *A1 = A0 + bs*sda;
917 
918 	int k;
919 
920 	for(k=0; k<kmax-3; k+=4)
921 		{
922 
923 		B[0+bs*0] += alpha * A0[1+bs*0];
924 		B[1+bs*0] += alpha * A0[2+bs*0];
925 		B[2+bs*0] += alpha * A0[3+bs*0];
926 		B[3+bs*0] += alpha * A1[0+bs*0];
927 
928 		B[0+bs*1] += alpha * A0[1+bs*1];
929 		B[1+bs*1] += alpha * A0[2+bs*1];
930 		B[2+bs*1] += alpha * A0[3+bs*1];
931 		B[3+bs*1] += alpha * A1[0+bs*1];
932 
933 		B[0+bs*2] += alpha * A0[1+bs*2];
934 		B[1+bs*2] += alpha * A0[2+bs*2];
935 		B[2+bs*2] += alpha * A0[3+bs*2];
936 		B[3+bs*2] += alpha * A1[0+bs*2];
937 
938 		B[0+bs*3] += alpha * A0[1+bs*3];
939 		B[1+bs*3] += alpha * A0[2+bs*3];
940 		B[2+bs*3] += alpha * A0[3+bs*3];
941 		B[3+bs*3] += alpha * A1[0+bs*3];
942 
943 		A0 += 16;
944 		A1 += 16;
945 		B  += 16;
946 
947 		}
948 	for(; k<kmax; k++)
949 		{
950 
951 		B[0+bs*0] += alpha * A0[1+bs*0];
952 		B[1+bs*0] += alpha * A0[2+bs*0];
953 		B[2+bs*0] += alpha * A0[3+bs*0];
954 		B[3+bs*0] += alpha * A1[0+bs*0];
955 
956 		A0 += 4;
957 		A1 += 4;
958 		B  += 4;
959 
960 		}
961 
962 	}
963 
964 
965 
966 // both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
kernel_dgead_4_2_lib4(int kmax,double alpha,double * A0,int sda,double * B)967 void kernel_dgead_4_2_lib4(int kmax, double alpha, double *A0, int sda, double *B)
968 	{
969 
970 	if(kmax<=0)
971 		return;
972 
973 	const int bs = 4;
974 
975 	double *A1 = A0 + bs*sda;
976 
977 	int k;
978 
979 	for(k=0; k<kmax-3; k+=4)
980 		{
981 
982 		B[0+bs*0] += alpha * A0[2+bs*0];
983 		B[1+bs*0] += alpha * A0[3+bs*0];
984 		B[2+bs*0] += alpha * A1[0+bs*0];
985 		B[3+bs*0] += alpha * A1[1+bs*0];
986 
987 		B[0+bs*1] += alpha * A0[2+bs*1];
988 		B[1+bs*1] += alpha * A0[3+bs*1];
989 		B[2+bs*1] += alpha * A1[0+bs*1];
990 		B[3+bs*1] += alpha * A1[1+bs*1];
991 
992 		B[0+bs*2] += alpha * A0[2+bs*2];
993 		B[1+bs*2] += alpha * A0[3+bs*2];
994 		B[2+bs*2] += alpha * A1[0+bs*2];
995 		B[3+bs*2] += alpha * A1[1+bs*2];
996 
997 		B[0+bs*3] += alpha * A0[2+bs*3];
998 		B[1+bs*3] += alpha * A0[3+bs*3];
999 		B[2+bs*3] += alpha * A1[0+bs*3];
1000 		B[3+bs*3] += alpha * A1[1+bs*3];
1001 
1002 		A0 += 16;
1003 		A1 += 16;
1004 		B  += 16;
1005 
1006 		}
1007 	for(; k<kmax; k++)
1008 		{
1009 
1010 		B[0+bs*0] += alpha * A0[2+bs*0];
1011 		B[1+bs*0] += alpha * A0[3+bs*0];
1012 		B[2+bs*0] += alpha * A1[0+bs*0];
1013 		B[3+bs*0] += alpha * A1[1+bs*0];
1014 
1015 		A0 += 4;
1016 		A1 += 4;
1017 		B  += 4;
1018 
1019 		}
1020 
1021 	}
1022 
1023 
1024 
1025 // both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
kernel_dgead_4_3_lib4(int kmax,double alpha,double * A0,int sda,double * B)1026 void kernel_dgead_4_3_lib4(int kmax, double alpha, double *A0, int sda, double *B)
1027 	{
1028 
1029 	if(kmax<=0)
1030 		return;
1031 
1032 	const int bs = 4;
1033 
1034 	double *A1 = A0 + bs*sda;
1035 
1036 	int k;
1037 
1038 	for(k=0; k<kmax-3; k+=4)
1039 		{
1040 
1041 		B[0+bs*0] += alpha * A0[3+bs*0];
1042 		B[1+bs*0] += alpha * A1[0+bs*0];
1043 		B[2+bs*0] += alpha * A1[1+bs*0];
1044 		B[3+bs*0] += alpha * A1[2+bs*0];
1045 
1046 		B[0+bs*1] += alpha * A0[3+bs*1];
1047 		B[1+bs*1] += alpha * A1[0+bs*1];
1048 		B[2+bs*1] += alpha * A1[1+bs*1];
1049 		B[3+bs*1] += alpha * A1[2+bs*1];
1050 
1051 		B[0+bs*2] += alpha * A0[3+bs*2];
1052 		B[1+bs*2] += alpha * A1[0+bs*2];
1053 		B[2+bs*2] += alpha * A1[1+bs*2];
1054 		B[3+bs*2] += alpha * A1[2+bs*2];
1055 
1056 		B[0+bs*3] += alpha * A0[3+bs*3];
1057 		B[1+bs*3] += alpha * A1[0+bs*3];
1058 		B[2+bs*3] += alpha * A1[1+bs*3];
1059 		B[3+bs*3] += alpha * A1[2+bs*3];
1060 
1061 		A0 += 16;
1062 		A1 += 16;
1063 		B  += 16;
1064 
1065 		}
1066 	for(; k<kmax; k++)
1067 		{
1068 
1069 		B[0+bs*0] += alpha * A0[3+bs*0];
1070 		B[1+bs*0] += alpha * A1[0+bs*0];
1071 		B[2+bs*0] += alpha * A1[1+bs*0];
1072 		B[3+bs*0] += alpha * A1[2+bs*0];
1073 
1074 		A0 += 4;
1075 		A1 += 4;
1076 		B  += 4;
1077 
1078 		}
1079 
1080 	}
1081 
1082 
1083 
1084 // both A and B are aligned to 64-bit boundaries
kernel_dgead_3_0_lib4(int kmax,double alpha,double * A,double * B)1085 void kernel_dgead_3_0_lib4(int kmax, double alpha, double *A, double *B)
1086 	{
1087 
1088 	if(kmax<=0)
1089 		return;
1090 
1091 	const int bs = 4;
1092 
1093 	int k;
1094 
1095 	for(k=0; k<kmax-3; k+=4)
1096 		{
1097 		B[0+bs*0] += alpha * A[0+bs*0];
1098 		B[1+bs*0] += alpha * A[1+bs*0];
1099 		B[2+bs*0] += alpha * A[2+bs*0];
1100 
1101 		B[0+bs*1] += alpha * A[0+bs*1];
1102 		B[1+bs*1] += alpha * A[1+bs*1];
1103 		B[2+bs*1] += alpha * A[2+bs*1];
1104 
1105 		B[0+bs*2] += alpha * A[0+bs*2];
1106 		B[1+bs*2] += alpha * A[1+bs*2];
1107 		B[2+bs*2] += alpha * A[2+bs*2];
1108 
1109 		B[0+bs*3] += alpha * A[0+bs*3];
1110 		B[1+bs*3] += alpha * A[1+bs*3];
1111 		B[2+bs*3] += alpha * A[2+bs*3];
1112 
1113 		A += 16;
1114 		B += 16;
1115 
1116 		}
1117 	for(; k<kmax; k++)
1118 		{
1119 
1120 		B[0+bs*0] += alpha * A[0+bs*0];
1121 		B[1+bs*0] += alpha * A[1+bs*0];
1122 		B[2+bs*0] += alpha * A[2+bs*0];
1123 
1124 		A += 4;
1125 		B += 4;
1126 
1127 		}
1128 
1129 	}
1130 
1131 
1132 
1133 // both A and B are aligned to 256-bit boundaries, 2 elements of A must be skipped
kernel_dgead_3_2_lib4(int kmax,double alpha,double * A0,int sda,double * B)1134 void kernel_dgead_3_2_lib4(int kmax, double alpha, double *A0, int sda, double *B)
1135 	{
1136 
1137 	if(kmax<=0)
1138 		return;
1139 
1140 	const int bs = 4;
1141 
1142 	double *A1 = A0 + bs*sda;
1143 
1144 	int k;
1145 
1146 	for(k=0; k<kmax-3; k+=4)
1147 		{
1148 
1149 		B[0+bs*0] += alpha * A0[2+bs*0];
1150 		B[1+bs*0] += alpha * A0[3+bs*0];
1151 		B[2+bs*0] += alpha * A1[0+bs*0];
1152 
1153 		B[0+bs*1] += alpha * A0[2+bs*1];
1154 		B[1+bs*1] += alpha * A0[3+bs*1];
1155 		B[2+bs*1] += alpha * A1[0+bs*1];
1156 
1157 		B[0+bs*2] += alpha * A0[2+bs*2];
1158 		B[1+bs*2] += alpha * A0[3+bs*2];
1159 		B[2+bs*2] += alpha * A1[0+bs*2];
1160 
1161 		B[0+bs*3] += alpha * A0[2+bs*3];
1162 		B[1+bs*3] += alpha * A0[3+bs*3];
1163 		B[2+bs*3] += alpha * A1[0+bs*3];
1164 
1165 		A0 += 16;
1166 		A1 += 16;
1167 		B  += 16;
1168 
1169 		}
1170 	for(; k<kmax; k++)
1171 		{
1172 
1173 		B[0+bs*0] += alpha * A0[2+bs*0];
1174 		B[1+bs*0] += alpha * A0[3+bs*0];
1175 		B[2+bs*0] += alpha * A1[0+bs*0];
1176 
1177 		A0 += 4;
1178 		A1 += 4;
1179 		B  += 4;
1180 
1181 		}
1182 
1183 	}
1184 
1185 
1186 
1187 // both A and B are aligned to 256-bit boundaries, 3 elements of A must be skipped
kernel_dgead_3_3_lib4(int kmax,double alpha,double * A0,int sda,double * B)1188 void kernel_dgead_3_3_lib4(int kmax, double alpha, double *A0, int sda, double *B)
1189 	{
1190 
1191 	if(kmax<=0)
1192 		return;
1193 
1194 	const int bs = 4;
1195 
1196 	double *A1 = A0 + bs*sda;
1197 
1198 	int k;
1199 
1200 	for(k=0; k<kmax-3; k+=4)
1201 		{
1202 
1203 		B[0+bs*0] += alpha * A0[3+bs*0];
1204 		B[1+bs*0] += alpha * A1[0+bs*0];
1205 		B[2+bs*0] += alpha * A1[1+bs*0];
1206 
1207 		B[0+bs*1] += alpha * A0[3+bs*1];
1208 		B[1+bs*1] += alpha * A1[0+bs*1];
1209 		B[2+bs*1] += alpha * A1[1+bs*1];
1210 
1211 		B[0+bs*2] += alpha * A0[3+bs*2];
1212 		B[1+bs*2] += alpha * A1[0+bs*2];
1213 		B[2+bs*2] += alpha * A1[1+bs*2];
1214 
1215 		B[0+bs*3] += alpha * A0[3+bs*3];
1216 		B[1+bs*3] += alpha * A1[0+bs*3];
1217 		B[2+bs*3] += alpha * A1[1+bs*3];
1218 
1219 		A0 += 16;
1220 		A1 += 16;
1221 		B  += 16;
1222 
1223 		}
1224 	for(; k<kmax; k++)
1225 		{
1226 
1227 		B[0+bs*0] += alpha * A0[3+bs*0];
1228 		B[1+bs*0] += alpha * A1[0+bs*0];
1229 		B[2+bs*0] += alpha * A1[1+bs*0];
1230 
1231 		A0 += 4;
1232 		A1 += 4;
1233 		B  += 4;
1234 
1235 		}
1236 
1237 	}
1238 
1239 
1240 
1241 // both A and B are aligned to 64-bit boundaries
kernel_dgead_2_0_lib4(int kmax,double alpha,double * A,double * B)1242 void kernel_dgead_2_0_lib4(int kmax, double alpha, double *A, double *B)
1243 	{
1244 
1245 	if(kmax<=0)
1246 		return;
1247 
1248 	const int bs = 4;
1249 
1250 	int k;
1251 
1252 	for(k=0; k<kmax-3; k+=4)
1253 		{
1254 		B[0+bs*0] += alpha * A[0+bs*0];
1255 		B[1+bs*0] += alpha * A[1+bs*0];
1256 
1257 		B[0+bs*1] += alpha * A[0+bs*1];
1258 		B[1+bs*1] += alpha * A[1+bs*1];
1259 
1260 		B[0+bs*2] += alpha * A[0+bs*2];
1261 		B[1+bs*2] += alpha * A[1+bs*2];
1262 
1263 		B[0+bs*3] += alpha * A[0+bs*3];
1264 		B[1+bs*3] += alpha * A[1+bs*3];
1265 
1266 		A += 16;
1267 		B += 16;
1268 
1269 		}
1270 	for(; k<kmax; k++)
1271 		{
1272 
1273 		B[0+bs*0] += alpha * A[0+bs*0];
1274 		B[1+bs*0] += alpha * A[1+bs*0];
1275 
1276 		A += 4;
1277 		B += 4;
1278 
1279 		}
1280 
1281 	}
1282 
1283 
1284 
1285 // both A and B are aligned to 128-bit boundaries, 3 elements of A must be skipped
kernel_dgead_2_3_lib4(int kmax,double alpha,double * A0,int sda,double * B)1286 void kernel_dgead_2_3_lib4(int kmax, double alpha, double *A0, int sda, double *B)
1287 	{
1288 
1289 	if(kmax<=0)
1290 		return;
1291 
1292 	const int bs = 4;
1293 
1294 	double *A1 = A0 + bs*sda;
1295 
1296 	int k;
1297 
1298 	for(k=0; k<kmax-3; k+=4)
1299 		{
1300 
1301 		B[0+bs*0] += alpha * A0[3+bs*0];
1302 		B[1+bs*0] += alpha * A1[0+bs*0];
1303 
1304 		B[0+bs*1] += alpha * A0[3+bs*1];
1305 		B[1+bs*1] += alpha * A1[0+bs*1];
1306 
1307 		B[0+bs*2] += alpha * A0[3+bs*2];
1308 		B[1+bs*2] += alpha * A1[0+bs*2];
1309 
1310 		B[0+bs*3] += alpha * A0[3+bs*3];
1311 		B[1+bs*3] += alpha * A1[0+bs*3];
1312 
1313 		A0 += 16;
1314 		A1 += 16;
1315 		B  += 16;
1316 
1317 		}
1318 	for(; k<kmax; k++)
1319 		{
1320 
1321 		B[0+bs*0] += alpha * A0[3+bs*0];
1322 		B[1+bs*0] += alpha * A1[0+bs*0];
1323 
1324 		A0 += 4;
1325 		A1 += 4;
1326 		B  += 4;
1327 
1328 		}
1329 
1330 	}
1331 
1332 
1333 
1334 // both A and B are aligned 64-bit boundaries
kernel_dgead_1_0_lib4(int kmax,double alpha,double * A,double * B)1335 void kernel_dgead_1_0_lib4(int kmax, double alpha, double *A, double *B)
1336 	{
1337 
1338 	if(kmax<=0)
1339 		return;
1340 
1341 	const int bs = 4;
1342 
1343 	int k;
1344 
1345 	for(k=0; k<kmax-3; k+=4)
1346 		{
1347 		B[0+bs*0] += alpha * A[0+bs*0];
1348 
1349 		B[0+bs*1] += alpha * A[0+bs*1];
1350 
1351 		B[0+bs*2] += alpha * A[0+bs*2];
1352 
1353 		B[0+bs*3] += alpha * A[0+bs*3];
1354 
1355 		A += 16;
1356 		B += 16;
1357 
1358 		}
1359 	for(; k<kmax; k++)
1360 		{
1361 
1362 		B[0+bs*0] += alpha * A[0+bs*0];
1363 
1364 		A += 4;
1365 		B += 4;
1366 
1367 		}
1368 
1369 	}
1370