/**************************************************************************************************
* *
* This file is part of BLASFEO. *
* *
* BLASFEO -- BLAS for embedded optimization. *
* Copyright (C) 2019 by Gianluca Frison. *
* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
* All rights reserved. *
* *
* The 2-Clause BSD License *
* *
* Redistribution and use in source and binary forms, with or without *
* modification, are permitted provided that the following conditions are met: *
* *
* 1. Redistributions of source code must retain the above copyright notice, this *
* list of conditions and the following disclaimer. *
* 2. Redistributions in binary form must reproduce the above copyright notice, *
* this list of conditions and the following disclaimer in the documentation *
* and/or other materials provided with the distribution. *
* *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND *
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED *
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE *
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR *
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; *
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND *
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT *
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
* *
* Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de *
* *
**************************************************************************************************/

#include <stdlib.h>
#include <stdio.h>

#include "../include/blasfeo_common.h"
#include "../include/blasfeo_d_kernel.h"
#include "../include/blasfeo_d_aux.h"
#include "../include/blasfeo_d_blasfeo_api.h"



/****************************
* old interface
****************************/

#if 0
void dlauum_blk_nt_l_lib(int m, int n, int nv, int *rv, int *cv, double *pA, int sda, double *pB, int sdb, int alg, double *pC, int sdc, double *pD, int sdd)
	{

	if(m<=0 || n<=0)
		return;

	// TODO remove
	double alpha, beta;
	if(alg==0)
		{
		alpha = 1.0;
		beta = 0.0;
		}
	else if(alg==1)
		{
		alpha = 1.0;
		beta = 1.0;
		}
	else
		{
		alpha = -1.0;
		beta = 1.0;
		}

	// TODO remove
	int k = cv[nv-1];

	const int ps = 4;

	int i, j, l;
	int ii, iii, jj, kii, kiii, kjj, k0, k1;

	i = 0;
	ii = 0;
	iii = 0;

#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
	for(; i<m-7; i+=8)
		{

		while(ii<nv && rv[ii]<i+8)
			ii++;
		if(ii<nv)
			kii = cv[ii];
		else
			kii = cv[ii-1];

		j = 0;
		jj = 0;
		for(; j<i && j<n-3; j+=4)
			{

			while(jj<nv && rv[jj]<j+4)
				jj++;
			if(jj<nv)
				kjj = cv[jj];
			else
				kjj = cv[jj-1];
			k0 = kii<kjj ? kii : kjj;

			kernel_dgemm_nt_8x4_lib4(k0, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
			}
		if(j<n)
			{

			while(jj<nv && rv[jj]<j+4)
				jj++;
			if(jj<nv)
				kjj = cv[jj];
			else
				kjj = cv[jj-1];
			k0 = kii<kjj ? kii : kjj;

			if(j<i) // dgemm
				{
				kernel_dgemm_nt_8x4_vs_lib4(k0, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, 8, n-j);
				}
			else // dsyrk
				{
				kernel_dsyrk_nt_l_8x4_vs_lib4(k0, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, 8, n-j);
				if(j<n-4)
					{
					kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], 4, n-j-4); // TODO
					}
				}
			}
		}
	if(m>i)
		{
		if(m-i<=4)
			{
			goto left_4;
			}
		else
			{
			goto left_8;
			}
		}
#else
	for(; i<m-3; i+=4)
		{

		while(ii<nv && rv[ii]<i+4)
			ii++;
		if(ii<nv)
			kii = cv[ii];
		else
			kii = cv[ii-1];
//		k0 = kii;
//		printf("\nii %d %d %d %d %d\n", i, ii, rv[ii], cv[ii], kii);

		j = 0;
		jj = 0;
		for(; j<i && j<n-3; j+=4)
			{

			while(jj<nv && rv[jj]<j+4)
				jj++;
			if(jj<nv)
				kjj = cv[jj];
			else
				kjj = cv[jj-1];
			k0 = kii<kjj ? kii : kjj;
//			printf("\njj %d %d %d %d %d\n", j, jj, rv[jj], cv[jj], kjj);

			kernel_dgemm_nt_4x4_lib4(k0, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
			}
		if(j<n)
			{

			while(jj<nv && rv[jj]<j+4)
				jj++;
			if(jj<nv)
				kjj = cv[jj];
			else
				kjj = cv[jj-1];
			k0 = kii<kjj ? kii : kjj;
//			printf("\njj %d %d %d %d %d\n", j, jj, rv[jj], cv[jj], kjj);

			if(j<i) // dgemm
				{
				kernel_dgemm_nt_4x4_vs_lib4(k0, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], 4, n-j);
				}
			else // dsyrk
				{
				kernel_dsyrk_nt_l_4x4_vs_lib4(k0, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], 4, n-j);
				}
			}
		}
	if(m>i)
		{
		goto left_4;
		}
#endif

	// common return if i==m
	return;

	// clean up loops definitions

#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
	left_8:

	kii = cv[nv-1];

	j = 0;
	jj = 0;
	for(; j<i && j<n-3; j+=4)
		{

		while(jj<nv && rv[jj]<j+4)
			jj++;
		if(jj<nv)
			kjj = cv[jj];
		else
			kjj = cv[jj-1];
		k0 = kii<kjj ? kii : kjj;

		kernel_dgemm_nt_8x4_vs_lib4(k0, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
		}
	if(j<n)
		{

		while(jj<nv && rv[jj]<j+4)
			jj++;
		if(jj<nv)
			kjj = cv[jj];
		else
			kjj = cv[jj-1];
		k0 = kii<kjj ? kii : kjj;

		if(j<i) // dgemm
			{
			kernel_dgemm_nt_8x4_vs_lib4(k0, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
			}
		else // dsyrk
			{
			kernel_dsyrk_nt_l_8x4_vs_lib4(k0, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
			if(j<n-4)
				{
				kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, n-j-4); // TODO
				}
			}
		}
	return;
#endif

	left_4:

	kii = cv[nv-1];

	j = 0;
	jj = 0;
	for(; j<i && j<n-3; j+=4)
		{

		while(jj<nv && rv[jj]<j+4)
			jj++;
		if(jj<nv)
			kjj = cv[jj];
		else
			kjj = cv[jj-1];
		k0 = kii<kjj ? kii : kjj;

		kernel_dgemm_nt_4x4_vs_lib4(k0, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		}
	if(j<n)
		{

		while(jj<nv && rv[jj]<j+4)
			jj++;
		if(jj<nv)
			kjj = cv[jj];
		else
			kjj = cv[jj-1];
		k0 = kii<kjj ? kii : kjj;

		if(j<i) // dgemm
			{
			kernel_dgemm_nt_4x4_vs_lib4(k0, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
			}
		else // dsyrk
			{
			kernel_dsyrk_nt_l_4x4_vs_lib4(k0, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
			}
		}
	return;

	}
#endif



/****************************
* new interface
****************************/
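
/*
 * Layout and naming notes (a summary of what the code below relies on; the
 * authoritative definitions live in blasfeo_common.h and the BLASFEO docs):
 *
 * - blasfeo_dmat stores the matrix in panels of ps=4 rows; sX->cn is the
 *   panel stride, so element (i,j) sits at
 *       sX->pA[(i/ps)*ps*sX->cn + j*ps + i%ps]
 *
 * - kernel naming, as used in this file:
 *       kernel_dgemm_xx_MxN_lib4   full MxN tile, C/D panel-aligned
 *       ..._vs_lib4                variable-size tail version, takes m-i, n-j
 *       ..._gen_lib4               generalized version, takes row/column
 *                                  offsets to handle panel-unaligned C/D
 */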



#if defined(LA_HIGH_PERFORMANCE)



// dgemm nn
void blasfeo_dgemm_nn(int m, int n, int k, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, double beta, struct blasfeo_dmat *sC, int ci, int cj, struct blasfeo_dmat *sD, int di, int dj)
	{
	if(m<=0 || n<=0)
		return;

	// invalidate stored inverse diagonal of result matrix
	sD->use_dA = 0;

	const int ps = 4;

	int sda = sA->cn;
	int sdb = sB->cn;
	int sdc = sC->cn;
	int sdd = sD->cn;

	int air = ai & (ps-1);
	int bir = bi & (ps-1);

	// pA, pB point to panels edges
	double *pA = sA->pA + aj*ps + (ai-air)*sda;
	double *pB = sB->pA + bj*ps + (bi-bir)*sdb;
	double *pC = sC->pA + cj*ps;
	double *pD = sD->pA + dj*ps;

	int offsetB = bir;

	int ci0 = ci-air;
	int di0 = di-air;
	int offsetC;
	int offsetD;
	if(ci0>=0)
		{
		pC += ci0/ps*ps*sdc;
		offsetC = ci0%ps;
		}
	else
		{
		pC += -ps*sdc;
		offsetC = ps+ci0;
		}

	if(di0>=0)
		{
		pD += di0/ps*ps*sdd;
		offsetD = di0%ps;
		}
	else
		{
		pD += -ps*sdd;
		offsetD = ps+di0;
		}
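
	/*
	 * Worked example of the offset logic above (illustrative): with ps=4,
	 * ai=2 (so air=2) and ci=1, ci0 = ci-air = -1, hence pC is moved one
	 * panel back and offsetC = ps+ci0 = 3. The gen kernels then compute tile
	 * rows [air, air+m) and store tile row air at panel row offsetC+air = 5,
	 * which is row 1 of the original first panel of C, i.e. row ci, as
	 * intended.
	 */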

	int i, j, l;



	// algorithm scheme
	if(air!=0)
		{
		goto clear_air;
		}
	select_loop:
	if(offsetC==0 & offsetD==0)
		{
		goto loop_00;
		}
	else
		{
		goto loop_CD;
		}
	// should never get here
	return;



	// clean up at the beginning
	clear_air:
#if defined(TARGET_X64_INTEL_HASWELL)
	if(air+m>8)
		{
		j = 0;
		for(; j<n; j+=4)
			{
			kernel_dgemm_nn_12x4_gen_lib4(k, &alpha, &pA[0], sda, offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps], sdc, offsetD, &pD[j*ps], sdd, air, air+m, 0, n-j);
			}
		m -= 3*ps-air;
		pA += 3*ps*sda;
		pC += 3*ps*sdc;
		pD += 3*ps*sdd;
		}
	else // air+m<=8
#endif
#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
	if(air+m>4) // (m>5)
		{
		j = 0;
		for(; j<n; j+=4)
			{
			kernel_dgemm_nn_8x4_gen_lib4(k, &alpha, &pA[0], sda, offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps], sdc, offsetD, &pD[j*ps], sdd, air, air+m, 0, n-j);
			}
		m -= 2*ps-air;
		pA += 2*ps*sda;
		pC += 2*ps*sdc;
		pD += 2*ps*sdd;
		}
	else // air+m<=4 // m-i<=4
		{
#endif
		j = 0;
		for(; j<n; j+=4)
			{
			kernel_dgemm_nn_4x4_gen_lib4(k, &alpha, &pA[0], offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps], sdc, offsetD, &pD[j*ps], sdd, air, air+m, 0, n-j);
			}
		m -= 1*ps-air;
		pA += 1*ps*sda;
		pC += 1*ps*sdc;
		pD += 1*ps*sdd;
#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
	// nothing more to do
		}
#endif
	goto select_loop;



	// main loop aligned
	loop_00:
	i = 0;
#if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
	for(; i<m-11; i+=12)
		{
		j = 0;
		for(; j<n-3; j+=4)
			{
			kernel_dgemm_nn_12x4_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
			}
		if(j<n)
			{
			kernel_dgemm_nn_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
			}
		}
	if(m>i)
		{
		if(m-i<=4)
			{
			goto left_4;
			}
		else if(m-i<=8)
			{
			goto left_8;
			}
		else
			{
			goto left_12;
			}
		}
#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	for(; i<m-12 | i==m-8; i+=8)
		{
		j = 0;
		for(; j<n-3; j+=4)
			{
			kernel_dgemm_nn_8x4_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
			}
		if(j<n)
			{
			kernel_dgemm_nn_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
			}
		}
	if(m>i)
		{
		if(m-i<=4)
			{
			goto left_4;
			}
		else if(m-i<=8)
			{
			goto left_8;
			}
		else
			{
			goto left_12;
			}
		}
#elif defined(TARGET_ARMV8A_ARM_CORTEX_A57)
	for(; i<m-7; i+=8)
		{
		j = 0;
		for(; j<n-3; j+=4)
			{
			kernel_dgemm_nn_8x4_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
			}
		if(j<n)
			{
			kernel_dgemm_nn_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
			}
		}
	if(m>i)
		{
		if(m-i<=4)
			{
			goto left_4;
			}
		else
			{
			goto left_8;
			}
		}
#elif defined(TARGET_X86_AMD_BARCELONA)
	for(; i<m-3; i+=4)
		{
		j = 0;
		for(; j<n-1; j+=2)
			{
			kernel_dgemm_nn_4x2_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
			}
		if(j<n)
			{
			kernel_dgemm_nn_4x2_vs_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
			}
		}
	if(m>i)
		{
		goto left_4;
		}
#else // all others
	for(; i<m-3; i+=4)
		{
		j = 0;
		for(; j<n-3; j+=4)
			{
			kernel_dgemm_nn_4x4_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
			}
		if(j<n)
			{
			kernel_dgemm_nn_4x4_vs_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
			}
		}
	if(m>i)
		{
		goto left_4;
		}
#endif
	// common return if i==m
	return;



	// main loop C, D not aligned
	loop_CD:
	i = 0;
#if defined(TARGET_X64_INTEL_HASWELL)
	for(; i<m-8; i+=12)
		{
		j = 0;
		for(; j<n; j+=4)
			{
			kernel_dgemm_nn_12x4_gen_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
			}
		}
	if(m>i)
		{
		if(m-i<=4)
			{
			goto left_4_g;
			}
		else
			{
			goto left_8_g;
			}
		}
#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	for(; i<m-4; i+=8)
		{
		j = 0;
		for(; j<n; j+=4)
			{
			kernel_dgemm_nn_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
			}
		}
	if(m>i)
		{
		goto left_4_g;
		}
#else
	for(; i<m; i+=4)
		{
		j = 0;
		for(; j<n; j+=4)
			{
			kernel_dgemm_nn_4x4_gen_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
			}
		}
#endif
	// common return if i==m
	return;



	// clean up loops definitions

#if defined(TARGET_X64_INTEL_HASWELL)
	left_12_g:
	j = 0;
	for(; j<n; j+=4)
		{
		kernel_dgemm_nn_12x4_gen_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
		}
	return;
#endif



#if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
	left_12:
	j = 0;
	for(; j<n; j+=4)
		{
		kernel_dgemm_nn_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
		}
	return;
#endif



#if defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_X64_INTEL_HASWELL)
	left_8_g:
	j = 0;
	for(; j<n; j+=4)
		{
		kernel_dgemm_nn_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
		}
	return;
#endif



#if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
	left_8:
	j = 0;
	for(; j<n; j+=4)
		{
		kernel_dgemm_nn_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
		}
	return;
#endif



	left_4_g:
	j = 0;
	for(; j<n; j+=4)
		{
		kernel_dgemm_nn_4x4_gen_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
		}
	return;



#if defined(TARGET_X64_INTEL_HASWELL)
	left_4:
	j = 0;
	for(; j<n-8; j+=12)
		{
		kernel_dgemm_nn_4x12_vs_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		}
	if(j<n-4)
		{
		kernel_dgemm_nn_4x8_vs_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		}
	else if(j<n)
		{
		kernel_dgemm_nn_4x4_vs_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		}
	return;
#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	left_4:
	j = 0;
	for(; j<n-4; j+=8)
		{
		kernel_dgemm_nn_4x8_vs_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		}
	if(j<n)
		{
		kernel_dgemm_nn_4x4_vs_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		}
	return;
#elif defined(TARGET_X86_AMD_BARCELONA)
	left_4:
	j = 0;
	for(; j<n; j+=2)
		{
		kernel_dgemm_nn_4x2_vs_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		}
	return;
#else // all others
	left_4:
	j = 0;
	for(; j<n; j+=4)
		{
		kernel_dgemm_nn_4x4_vs_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		}
	return;
#endif

	return;

	}
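
#if 0
	// Minimal usage sketch (kept out of the build). It assumes the helpers
	// from blasfeo_d_aux_ext_dep.h (blasfeo_allocate_dmat, blasfeo_pack_dmat,
	// blasfeo_free_dmat) and column-major input arrays A (m x k), B (k x n).
	struct blasfeo_dmat sA, sB, sC, sD;
	blasfeo_allocate_dmat(m, k, &sA);
	blasfeo_allocate_dmat(k, n, &sB);
	blasfeo_allocate_dmat(m, n, &sC);
	blasfeo_allocate_dmat(m, n, &sD);
	blasfeo_pack_dmat(m, k, A, m, &sA, 0, 0);
	blasfeo_pack_dmat(k, n, B, k, &sB, 0, 0);
	// D <= 1.0 * A * B + 0.0 * C
	blasfeo_dgemm_nn(m, n, k, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
	blasfeo_free_dmat(&sA);
	blasfeo_free_dmat(&sB);
	blasfeo_free_dmat(&sC);
	blasfeo_free_dmat(&sD);
#endif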



// dgemm nt
void blasfeo_dgemm_nt(int m, int n, int k, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, double beta, struct blasfeo_dmat *sC, int ci, int cj, struct blasfeo_dmat *sD, int di, int dj)
	{
	if(m<=0 | n<=0)
		return;

	// invalidate stored inverse diagonal of result matrix
	sD->use_dA = 0;

	const int ps = 4;

	int sda = sA->cn;
	int sdb = sB->cn;
	int sdc = sC->cn;
	int sdd = sD->cn;
	int air = ai & (ps-1);
	int bir = bi & (ps-1);
	double *pA = sA->pA + aj*ps + (ai-air)*sda;
	double *pB = sB->pA + bj*ps + (bi-bir)*sdb;
	double *pC = sC->pA + cj*ps;
	double *pD = sD->pA + dj*ps;

	int ci0 = ci-air;
	int di0 = di-air;
	int offsetC;
	int offsetD;
	if(ci0>=0)
		{
		pC += ci0/ps*ps*sdc;
		offsetC = ci0%ps;
		}
	else
		{
		pC += -ps*sdc;
		offsetC = ps+ci0;
		}
	if(di0>=0)
		{
		pD += di0/ps*ps*sdd;
		offsetD = di0%ps;
		}
	else
		{
		pD += -ps*sdd;
		offsetD = ps+di0;
		}

	int i, j;

	int idxB;




	// algorithm scheme
	if(air!=0)
		{
		goto clear_air;
		// TODO instead use buffer to align A !!!
		}
	select_loop:
	if(offsetC==0 & offsetD==0)
		{
		goto loop_00;
		}
	else
		{
		goto loop_CD;
		}
	// should never get here
	return;



	// clean up at the beginning
	clear_air:
#if defined(TARGET_X64_INTEL_HASWELL)
	if(air+m>8)
		{
		j = 0;
		idxB = 0;
		// clean up at the beginning
		if(bir!=0)
			{
			kernel_dgemm_nt_12x4_gen_lib4(k, &alpha, &pA[0], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps]-bir*ps, sdc, offsetD, &pD[j*ps]-bir*ps, sdd, air, air+m, bir, bir+n-j);
			j += ps-bir;
			idxB += 4;
			}
		// main loop
		for(; j<n; j+=4, idxB+=4)
			{
			kernel_dgemm_nt_12x4_gen_lib4(k, &alpha, &pA[0], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps], sdc, offsetD, &pD[j*ps], sdd, air, air+m, 0, n-j);
			}
		m -= 3*ps-air;
		pA += 3*ps*sda;
		pC += 3*ps*sdc;
		pD += 3*ps*sdd;
		}
	else // air+m<=8
#endif
#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	if(air+m>4) // (m>5)
		{
		j = 0;
		idxB = 0;
		// clean up at the beginning
		if(bir!=0)
			{
			kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[0], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps]-bir*ps, sdc, offsetD, &pD[j*ps]-bir*ps, sdd, air, air+m, bir, bir+n-j);
			j += ps-bir;
			idxB += 4;
			}
		// main loop
		for(; j<n; j+=4, idxB+=4)
			{
			kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[0], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps], sdc, offsetD, &pD[j*ps], sdd, air, air+m, 0, n-j);
			}
		m -= 2*ps-air;
		pA += 2*ps*sda;
		pC += 2*ps*sdc;
		pD += 2*ps*sdd;
		}
	else // m<=4
		{
#endif
		j = 0;
		idxB = 0;
		// clean up at the beginning
		if(bir!=0)
			{
			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[0], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps]-bir*ps, sdc, offsetD, &pD[j*ps]-bir*ps, sdd, air, air+m, bir, bir+n-j);
			j += ps-bir;
			idxB += 4;
			}
		// main loop
		for(; j<n; j+=4, idxB+=4)
			{
			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[0], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps], sdc, offsetD, &pD[j*ps], sdd, air, air+m, 0, n-j);
			}
		m -= ps-air;
		pA += ps*sda;
		pC += ps*sdc;
		pD += ps*sdd;
#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	// nothing more to do
		}
#endif
	goto select_loop;
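
	/*
	 * Note on the bir handling above: B is traversed by 4-row panels (idxB),
	 * so an unaligned bi is absorbed by a first gen call that shifts C/D
	 * left by bir columns and clips the column range to [bir, bir+n); j and
	 * idxB are then advanced so the remaining panels of B are aligned.
	 */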



	// main loop aligned
	loop_00:
	i = 0;
#if defined(TARGET_X64_INTEL_HASWELL)
	for(; i<m-11; i+=12)
		{
		j = 0;
		idxB = 0;
		// clean up at the beginning
		if(bir!=0)
			{
			kernel_dgemm_nt_12x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, 0, &pC[j*ps+i*sdc]-bir*ps, sdc, 0, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+n-j);
			j += ps-bir;
			idxB += 4;
			}
		// main loop
		for(; j<n-3; j+=4, idxB+=4)
			{
			kernel_dgemm_nt_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
			}
		if(j<n)
			{
			kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
			}
		}
	if(m>i)
		{
		if(m-i<=4)
			{
			goto left_4;
			}
		else if(m-i<=8)
			{
			goto left_8;
			}
		else
			{
			goto left_12;
			}
		}
#elif defined(TARGET_ARMV8A_ARM_CORTEX_A53)
	for(; i<m-11; i+=12)
		{
		j = 0;
		idxB = 0;
		// clean up at the beginning
		if(bir!=0)
			{
			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[(i+0)*sda], &pB[idxB*sdb], &beta, 0, &pC[j*ps+(i+0)*sdc]-bir*ps, sdc, 0, &pD[j*ps+(i+0)*sdd]-bir*ps, sdd, 0, m-(i+0), bir, bir+n-j);
			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[(i+4)*sda], &pB[idxB*sdb], &beta, 0, &pC[j*ps+(i+4)*sdc]-bir*ps, sdc, 0, &pD[j*ps+(i+4)*sdd]-bir*ps, sdd, 0, m-(i+4), bir, bir+n-j);
			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[(i+8)*sda], &pB[idxB*sdb], &beta, 0, &pC[j*ps+(i+8)*sdc]-bir*ps, sdc, 0, &pD[j*ps+(i+8)*sdd]-bir*ps, sdd, 0, m-(i+8), bir, bir+n-j);
			j += ps-bir;
			idxB += 4;
			}
		// main loop
		for(; j<n-3; j+=4, idxB+=4)
			{
			kernel_dgemm_nt_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
			}
		if(j<n)
			{
			kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
			}
		}
	if(m>i)
		{
		if(m-i<=4)
			{
			goto left_4;
			}
		else if(m-i<=8)
			{
			goto left_8;
			}
		else
			{
			goto left_12;
			}
		}
#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57)
	for(; i<m-7; i+=8)
		{
		j = 0;
		idxB = 0;
		// clean up at the beginning
		if(bir!=0)
			{
#if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
			kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+n-j);
#else
			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[(i+0)*sda], &pB[idxB*sdb], &beta, 0, &pC[j*ps+(i+0)*sdc]-bir*ps, sdc, 0, &pD[j*ps+(i+0)*sdd]-bir*ps, sdd, 0, m-(i+0), bir, bir+n-j);
			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[(i+4)*sda], &pB[idxB*sdb], &beta, 0, &pC[j*ps+(i+4)*sdc]-bir*ps, sdc, 0, &pD[j*ps+(i+4)*sdd]-bir*ps, sdd, 0, m-(i+4), bir, bir+n-j);
#endif
			j += ps-bir;
			idxB += 4;
			}
		// main loop
		for(; j<n-3; j+=4, idxB+=4)
			{
			kernel_dgemm_nt_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
			}
		if(j<n)
			{
			kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
			}
		}
	if(m>i)
		{
		if(m-i<=4)
			{
			goto left_4;
			}
		else
			{
			goto left_8;
			}
		}
#elif defined(TARGET_X86_AMD_BARCELONA)
	for(; i<m-3; i+=4)
		{
		j = 0;
		idxB = 0;
		// clean up at the beginning
		if(bir!=0)
			{
			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+n-j);
			j += ps-bir;
			idxB += 4;
			}
		// main loop
		for(; j<n-3; j+=4, idxB+=4)
			{
			kernel_dgemm_nt_4x2_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb+0], &beta, &pC[(j+0)*ps+i*sdc], &pD[(j+0)*ps+i*sdd]);
			kernel_dgemm_nt_4x2_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb+2], &beta, &pC[(j+2)*ps+i*sdc], &pD[(j+2)*ps+i*sdd]);
			}
		if(j<n-2)
			{
			kernel_dgemm_nt_4x2_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb+0], &beta, &pC[(j+0)*ps+i*sdc], &pD[(j+0)*ps+i*sdd]);
			kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb+2], &beta, &pC[(j+2)*ps+i*sdc], &pD[(j+2)*ps+i*sdd], m-i, n-j-2);
			}
		else if(j<n)
			{
			kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
			}
		}
	if(m>i)
		{
		goto left_4;
		}
#else
	for(; i<m-3; i+=4)
		{
		j = 0;
		idxB = 0;
		// clean up at the beginning
		if(bir!=0)
			{
			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+n-j);
			j += ps-bir;
			idxB += 4;
			}
		// main loop
		for(; j<n-3; j+=4, idxB+=4)
			{
			kernel_dgemm_nt_4x4_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
			}
		if(j<n)
			{
			kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
			}
		}
	if(m>i)
		{
		goto left_4;
		}
#endif
	// common return if i==m
	return;



	// main loop C, D not aligned
	loop_CD:
	i = 0;
#if defined(TARGET_X64_INTEL_HASWELL)
	for(; i<m-8; i+=12)
		{
		j = 0;
		idxB = 0;
		// clean up at the beginning
		if(bir!=0)
			{
			kernel_dgemm_nt_12x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+n-j);
			j += ps-bir;
			idxB += 4;
			}
		// main loop
		for(; j<n; j+=4, idxB+=4)
			{
			kernel_dgemm_nt_12x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
			}
		}
	if(m>i)
		{
		if(m-i<=4)
			{
			goto left_4_g;
			}
		else
			{
			goto left_8_g;
			}
		}
#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	for(; i<m-4; i+=8)
		{
		j = 0;
		idxB = 0;
		// clean up at the beginning
		if(bir!=0)
			{
			kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+n-j);
			j += ps-bir;
			idxB += 4;
			}
		// main loop
		for(; j<n; j+=4, idxB+=4)
			{
			kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
			}
		}
	if(m>i)
		{
		goto left_4_g;
		}
#else
	for(; i<m; i+=4)
		{
		j = 0;
		idxB = 0;
		// clean up at the beginning
		if(bir!=0)
			{
			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+n-j);
			j += ps-bir;
			idxB += 4;
			}
		// main loop
		for(; j<n; j+=4, idxB+=4)
			{
			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
			}
		}
#endif
	// common return if i==m
	return;



	// clean up loops definitions

#if defined(TARGET_X64_INTEL_HASWELL)
	left_12:
	j = 0;
	idxB = 0;
	// clean up at the beginning
	if(bir!=0)
		{
		kernel_dgemm_nt_12x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+n-j);
		j += ps-bir;
		idxB += 4;
		}
	// main loop
	for(; j<n; j+=4, idxB+=4)
		{
		kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
		}
	return;
#elif defined(TARGET_ARMV8A_ARM_CORTEX_A53)
	left_12:
	j = 0;
	idxB = 0;
	// clean up at the beginning
	if(bir!=0)
		{
		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[(i+0)*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+(i+0)*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+(i+0)*sdd]-bir*ps, sdd, 0, m-(i+0), bir, bir+n-j);
		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[(i+4)*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+(i+4)*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+(i+4)*sdd]-bir*ps, sdd, 0, m-(i+4), bir, bir+n-j);
		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[(i+8)*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+(i+8)*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+(i+8)*sdd]-bir*ps, sdd, 0, m-(i+8), bir, bir+n-j);
		j += ps-bir;
		idxB += 4;
		}
	// main loop
	for(; j<n; j+=4, idxB+=4)
		{
		kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
		}
	return;
#endif



#if defined(TARGET_X64_INTEL_HASWELL)
	left_8:
	j = 0;
	idxB = 0;
	// clean up at the beginning
	if(bir!=0)
		{
		kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+n-j);
		j += ps-bir;
		idxB += 4;
		}
	// main loop
	for(; j<n-8; j+=12, idxB+=12)
		{
		kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
		kernel_dgemm_nt_8x8u_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[(idxB+4)*sdb], sdb, &beta, &pC[(j+4)*ps+i*sdc], sdc, &pD[(j+4)*ps+i*sdd], sdd, m-i, n-(j+4));
		}
	if(j<n)
		{
		if(n-j<=4)
			{
			kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
			}
		else
			{
			kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
			kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[(idxB+4)*sdb], &beta, &pC[(j+4)*ps+i*sdc], &pD[(j+4)*ps+i*sdd], m-i, n-(j+4));
			}
		}
	return;
#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	left_8:
	j = 0;
	idxB = 0;
	// clean up at the beginning
	if(bir!=0)
		{
		kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+n-j);
		j += ps-bir;
		idxB += 4;
		}
	// main loop
	for(; j<n; j+=4, idxB+=4)
		{
		kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
		}
	return;
#elif defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
	left_8:
	j = 0;
	idxB = 0;
	// clean up at the beginning
	if(bir!=0)
		{
		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[(i+0)*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+(i+0)*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+(i+0)*sdd]-bir*ps, sdd, 0, m-(i+0), bir, bir+n-j);
		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[(i+4)*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+(i+4)*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+(i+4)*sdd]-bir*ps, sdd, 0, m-(i+4), bir, bir+n-j);
		j += ps-bir;
		idxB += 4;
		}
	// main loop
	for(; j<n; j+=4, idxB+=4)
		{
		kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
		}
	return;
#endif



#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	left_8_g:
	j = 0;
	idxB = 0;
	// clean up at the beginning
	if(bir!=0)
		{
		kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+n-j);
		j += ps-bir;
		idxB += 4;
		}
	// main loop
	for(; j<n; j+=4, idxB+=4)
		{
		kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
		}
	return;
#endif



#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
	left_4:
	j = 0;
	idxB = 0;
	// clean up at the beginning
	if(bir!=0)
		{
		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+n-j);
		j += ps-bir;
		idxB += 4;
		}
	// main loop
	for(; j<n-8; j+=12, idxB+=12)
		{
		kernel_dgemm_nt_4x12_vs_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		}
	if(j<n)
		{
		if(n-j<=4)
			{
			kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
			}
		else
			{
			kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
			}
		}
	return;
#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
	left_4:
	j = 0;
	idxB = 0;
	// clean up at the beginning
	if(bir!=0)
		{
		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+n-j);
		j += ps-bir;
		idxB += 4;
		}
	// main loop
	for(; j<n-4; j+=8, idxB+=8)
		{
		kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		}
	if(j<n)
		{
		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		}
	return;
#elif defined(TARGET_X86_AMD_BARCELONA)
	left_4:
	j = 0;
	idxB = 0;
	// clean up at the beginning
	if(bir!=0)
		{
		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+n-j);
		j += ps-bir;
		idxB += 4;
		}
	// main loop
	for(; j<n-2; j+=4, idxB+=4)
		{
		kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb+0], &beta, &pC[(j+0)*ps+i*sdc], &pD[(j+0)*ps+i*sdd], m-i, n-j-0);
		kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb+2], &beta, &pC[(j+2)*ps+i*sdc], &pD[(j+2)*ps+i*sdd], m-i, n-j-2);
		}
	if(j<n)
		{
		kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		}
	return;
#else
	left_4:
	j = 0;
	idxB = 0;
	// clean up at the beginning
	if(bir!=0)
		{
		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+n-j);
		j += ps-bir;
		idxB += 4;
		}
	// main loop
	for(; j<n; j+=4, idxB+=4)
		{
		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		}
	return;
#endif



	left_4_g:
	j = 0;
	idxB = 0;
	// clean up at the beginning
	if(bir!=0)
		{
		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+n-j);
		j += ps-bir;
		idxB += 4;
		}
	// main loop
	for(; j<n; j+=4, idxB+=4)
		{
		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
		}
	return;

	}



// dgemm_tn
void blasfeo_dgemm_tn(int m, int n, int k, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, double beta, struct blasfeo_dmat *sC, int ci, int cj, struct blasfeo_dmat *sD, int di, int dj)
	{
	if(m<=0 || n<=0)
		return;

	// invalidate stored inverse diagonal of result matrix
	sD->use_dA = 0;

	const int ps = 4;

	int sda = sA->cn;
	int sdb = sB->cn;
	int sdc = sC->cn;
	int sdd = sD->cn;

	int air = ai & (ps-1);
	int bir = bi & (ps-1);
	int cir = ci & (ps-1);
	int dir = di & (ps-1);

	double *pA = sA->pA + aj*ps + (ai-air)*sda;
	double *pB = sB->pA + bj*ps + (bi-bir)*sdb;
	double *pC = sC->pA + cj*ps + (ci-cir)*sdc;
	double *pD = sD->pA + dj*ps + (di-dir)*sdd;

	int offsetA = air;
	int offsetB = bir;
	int offsetC = cir;
	int offsetD = dir;

// TODO visual studio alignment
#if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
	ALIGNED( double pU0[3*4*K_MAX_STACK], 64 );
#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57)
	ALIGNED( double pU0[2*4*K_MAX_STACK], 64 );
#elif defined(TARGET_GENERIC)
	double pU0[1*4*K_MAX_STACK];
#else
	ALIGNED( double pU0[1*4*K_MAX_STACK], 64 );
#endif
	int sdu0 = (k+3)/4*4;
	sdu0 = sdu0<K_MAX_STACK ? sdu0 : K_MAX_STACK;

	struct blasfeo_dmat sAt;
	int sAt_size;
	void *mem;
	char *mem_align;

	double *pU;
	int sdu;

	int ii, jj;

	if(k>K_MAX_STACK)
		{
		sAt_size = blasfeo_memsize_dmat(12, k);
		mem = malloc(sAt_size+64);
		blasfeo_align_64_byte(mem, (void **) &mem_align);
		blasfeo_create_dmat(12, k, &sAt, (void *) mem_align);
		pU = sAt.pA;
		sdu = sAt.cn;
		}
	else
		{
		pU = pU0;
		sdu = sdu0;
		}
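
	/*
	 * pU holds the transposed panels of A (or B): up to 12 rows, the height
	 * of the widest kernel used below. For k<=K_MAX_STACK it lives in the
	 * stack buffer pU0; otherwise a 64-byte-aligned heap buffer is used and
	 * released at tn_return.
	 */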


	// algorithm scheme
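	// The aligned path packs (transposes) the cheaper operand: packing A
	// costs O(m*k) and packing B costs O(n*k), so A is transposed if m<=n
	// and B otherwise; other targets take only the transpose-A path.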
	if(offsetC==0 & offsetD==0)
		{
#if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_X64_INTEL_CORE) | defined(TARGET_GENERIC)
		if(m<=n)
			{
			goto loop_00_m0; // transpose A
			}
		else
			{
			goto loop_00_n0; // transpose B
			}
#else
		goto loop_00_m0; // transpose A
#endif
		}
	else
		{
		goto loop_CD_m0;
		}
	// should never get here
	return;



	loop_00_m0:
	ii = 0;
#if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
	for(; ii<m-11; ii+=12)
		{
		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pU+0*sdu);
		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pU+4*sdu);
		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+8)*ps, sda, pU+8*sdu);
		for(jj=0; jj<n-3; jj+=4)
			{
			kernel_dgemm_nn_12x4_lib4(k, &alpha, pU, sdu, offsetB, pB+jj*ps, sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd);
			}
		if(jj<n)
			{
			kernel_dgemm_nn_12x4_vs_lib4(k, &alpha, pU, sdu, offsetB, pB+jj*ps, sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
			}
		}
	if(ii<m)
		{
		if(m-ii<=4)
			{
			goto left_4_m0;
			}
		else if(m-ii<=8)
			{
			goto left_8_m0;
			}
		else
			{
			goto left_12_m0;
			}
		}
#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57)
	for(; ii<m-7; ii+=8)
		{
		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pU+0*sdu);
		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pU+4*sdu);
		for(jj=0; jj<n-3; jj+=4)
			{
			kernel_dgemm_nn_8x4_lib4(k, &alpha, pU, sdu, offsetB, pB+jj*ps, sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd);
			}
		if(jj<n)
			{
			kernel_dgemm_nn_8x4_vs_lib4(k, &alpha, pU, sdu, offsetB, pB+jj*ps, sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
			}
		}
	if(ii<m)
		{
		if(m-ii<=4)
			{
			goto left_4_m0;
			}
		else
			{
			goto left_8_m0;
			}
		}
#else
	for(; ii<m-3; ii+=4)
		{
		kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pU);
		for(jj=0; jj<n-3; jj+=4)
			{
			kernel_dgemm_nn_4x4_lib4(k, &alpha, pU, offsetB, pB+jj*ps, sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps);
			}
		if(jj<n)
			{
			kernel_dgemm_nn_4x4_vs_lib4(k, &alpha, pU, offsetB, pB+jj*ps, sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
			}
		}
	if(ii<m)
		{
		goto left_4_m0;
		}
#endif
	goto tn_return;



	// non-malloc algorithm, C, D not aligned
	loop_CD_m0:
	ii = 0;
	// clean up loops definitions
#if defined(TARGET_X64_INTEL_HASWELL)
	for(; ii<m-8; ii+=12)
		{
		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pU+0*sdu);
		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pU+4*sdu);
		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+8)*ps, sda, pU+8*sdu);
		for(jj=0; jj<n; jj+=4)
			{
			kernel_dgemm_nn_12x4_gen_lib4(k, &alpha, pU, sdu, offsetB, pB+jj*ps, sdb, &beta, offsetC, pC+ii*sdc+jj*ps, sdc, offsetD, pD+ii*sdd+jj*ps, sdd, 0, m-ii, 0, n-jj);
			}
		}
	if(ii<m)
		{
		if(m-ii<=4)
			{
			goto left_4_m0_g;
			}
		else
			{
			goto left_8_m0_g;
			}
		}
#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	for(; ii<m-4; ii+=8)
		{
		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pU+0*sdu);
		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pU+4*sdu);
		for(jj=0; jj<n; jj+=4)
			{
			kernel_dgemm_nn_8x4_gen_lib4(k, &alpha, pU, sdu, offsetB, pB+jj*ps, sdb, &beta, offsetC, pC+ii*sdc+jj*ps, sdc, offsetD, pD+ii*sdd+jj*ps, sdd, 0, m-ii, 0, n-jj);
			}
		}
	if(ii<m)
		{
		goto left_4_m0_g;
		}
#else
	for(; ii<m; ii+=4)
		{
		kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pU);
		for(jj=0; jj<n; jj+=4)
			{
			kernel_dgemm_nn_4x4_gen_lib4(k, &alpha, pU, offsetB, pB+jj*ps, sdb, &beta, offsetC, pC+ii*sdc+jj*ps, sdc, offsetD, pD+ii*sdd+jj*ps, sdd, 0, m-ii, 0, n-jj);
			}
		}
#endif
	// common return if i==m
	goto tn_return;



	loop_00_n0:
	jj = 0;
#if defined(TARGET_X64_INTEL_HASWELL)
	for(; jj<n-11; jj+=12)
		{
		kernel_dpacp_tn_4_lib4(k, offsetB, pB+(jj+0)*ps, sdb, pU+0*sdu);
		kernel_dpacp_tn_4_lib4(k, offsetB, pB+(jj+4)*ps, sdb, pU+4*sdu);
		kernel_dpacp_tn_4_lib4(k, offsetB, pB+(jj+8)*ps, sdb, pU+8*sdu);
		for(ii=0; ii<m-3; ii+=4)
			{
			kernel_dgemm_tt_4x12_lib4(k, &alpha, offsetA, pA+ii*ps, sda, pU, sdu, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps);
			}
		if(ii<m)
			{
			kernel_dgemm_tt_4x12_vs_lib4(k, &alpha, offsetA, pA+ii*ps, sda, pU, sdu, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
			}
		}
	if(jj<n)
		{
		if(n-jj<=4)
			{
			goto left_4_n0;
			}
		else if(n-jj<=8)
			{
			goto left_8_n0;
			}
		else
			{
			goto left_12_n0;
			}
		}
#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	for(; jj<n-7; jj+=8)
		{
		kernel_dpacp_tn_4_lib4(k, offsetB, pB+(jj+0)*ps, sdb, pU+0*sdu);
		kernel_dpacp_tn_4_lib4(k, offsetB, pB+(jj+4)*ps, sdb, pU+4*sdu);
		for(ii=0; ii<m-3; ii+=4)
			{
			kernel_dgemm_tt_4x8_lib4(k, &alpha, offsetA, pA+ii*ps, sda, pU, sdu, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps);
			}
		if(ii<m)
			{
			kernel_dgemm_tt_4x8_vs_lib4(k, &alpha, offsetA, pA+ii*ps, sda, pU, sdu, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
			}
		}
	if(jj<n)
		{
		if(n-jj<=4)
			{
			goto left_4_n0;
			}
		else
			{
			goto left_8_n0;
			}
		}
#elif defined(TARGET_GENERIC) | defined(TARGET_X64_INTEL_CORE)
	for(; jj<n-3; jj+=4)
		{
		kernel_dpacp_tn_4_lib4(k, offsetB, pB+jj*ps, sdb, pU);
		for(ii=0; ii<m-3; ii+=4)
			{
			kernel_dgemm_tt_4x4_lib4(k, &alpha, offsetA, pA+ii*ps, sda, pU, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps);
			}
		if(ii<m)
			{
			kernel_dgemm_tt_4x4_vs_lib4(k, &alpha, offsetA, pA+ii*ps, sda, pU, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
			}
		}
	if(jj<n)
		{
		goto left_4_n0;
		}
#endif
	// common return if jj==n
	goto tn_return;



#if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
	left_12_m0:
	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pU+0*sdu);
	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pU+4*sdu);
	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+8)*ps, sda, pU+8*sdu);
	for(jj=0; jj<n; jj+=4)
		{
		kernel_dgemm_nn_12x4_vs_lib4(k, &alpha, pU, sdu, offsetB, pB+jj*ps, sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
		}
	goto tn_return;
#endif



#if defined(TARGET_X64_INTEL_HASWELL)
	left_12_n0:
	kernel_dpacp_tn_4_lib4(k, offsetB, pB+(jj+0)*ps, sdb, pU+0*sdu);
	kernel_dpacp_tn_4_lib4(k, offsetB, pB+(jj+4)*ps, sdb, pU+4*sdu);
	kernel_dpacp_tn_4_lib4(k, offsetB, pB+(jj+8)*ps, sdb, pU+8*sdu);
	for(ii=0; ii<m; ii+=4)
		{
		kernel_dgemm_tt_4x12_vs_lib4(k, &alpha, offsetA, pA+ii*ps, sda, pU, sdu, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
		}
	goto tn_return;
#endif



#if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
	left_8_m0:
	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pU+0*sdu);
	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pU+4*sdu);
	for(jj=0; jj<n; jj+=4)
		{
		kernel_dgemm_nn_8x4_vs_lib4(k, &alpha, pU, sdu, offsetB, pB+jj*ps, sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
		}
	goto tn_return;
#endif



#if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	left_8_m0_g:
	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pU+0*sdu);
	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pU+4*sdu);
	for(jj=0; jj<n; jj+=4)
		{
		kernel_dgemm_nn_8x4_gen_lib4(k, &alpha, pU, sdu, offsetB, pB+jj*ps, sdb, &beta, offsetC, pC+ii*sdc+jj*ps, sdc, offsetD, pD+ii*sdd+jj*ps, sdd, 0, m-ii, 0, n-jj);
		}
	goto tn_return;
#endif



#if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	left_8_n0:
	kernel_dpacp_tn_4_lib4(k, offsetB, pB+(jj+0)*ps, sdb, pU+0*sdu);
	kernel_dpacp_tn_4_lib4(k, offsetB, pB+(jj+4)*ps, sdb, pU+4*sdu);
	for(ii=0; ii<m; ii+=4)
		{
		kernel_dgemm_tt_4x8_vs_lib4(k, &alpha, offsetA, pA+ii*ps, sda, pU, sdu, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
		}
	goto tn_return;
#endif



#if defined(TARGET_X64_INTEL_HASWELL)
	left_4_m0:
	kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pU);
	for(jj=0; jj<n-8; jj+=12)
		{
		kernel_dgemm_nn_4x12_vs_lib4(k, &alpha, pU, offsetB, pB+jj*ps, sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
		}
	if(jj<n-4)
		{
		kernel_dgemm_nn_4x8_vs_lib4(k, &alpha, pU, offsetB, pB+jj*ps, sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
		}
	else if(jj<n)
		{
		kernel_dgemm_nn_4x4_vs_lib4(k, &alpha, pU, offsetB, pB+jj*ps, sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
		}
	goto tn_return;
#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	left_4_m0:
	kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pU);
	for(jj=0; jj<n-4; jj+=8)
		{
		kernel_dgemm_nn_4x8_vs_lib4(k, &alpha, pU, offsetB, pB+jj*ps, sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
		}
	if(jj<n)
		{
		kernel_dgemm_nn_4x4_vs_lib4(k, &alpha, pU, offsetB, pB+jj*ps, sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
		}
	goto tn_return;
#else // all others
	left_4_m0:
	kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pU);
	for(jj=0; jj<n; jj+=4)
		{
		kernel_dgemm_nn_4x4_vs_lib4(k, &alpha, pU, offsetB, pB+jj*ps, sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
		}
	goto tn_return;
#endif



	left_4_m0_g:
	kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pU);
	for(jj=0; jj<n; jj+=4)
		{
		kernel_dgemm_nn_4x4_gen_lib4(k, &alpha, pU, offsetB, pB+jj*ps, sdb, &beta, offsetC, pC+ii*sdc+jj*ps, sdc, offsetD, pD+ii*sdd+jj*ps, sdd, 0, m-ii, 0, n-jj);
		}
	goto tn_return;



#if defined(TARGET_X64_INTEL_HASWELL)
	left_4_n0:
	kernel_dpacp_tn_4_lib4(k, offsetB, pB+jj*ps, sdb, pU);
	for(ii=0; ii<m-8; ii+=12)
		{
		kernel_dgemm_tt_12x4_vs_lib4(k, &alpha, offsetA, pA+ii*ps, sda, pU, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
		}
	if(ii<m-4)
		{
		kernel_dgemm_tt_8x4_vs_lib4(k, &alpha, offsetA, pA+ii*ps, sda, pU, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
		}
	else if(ii<m)
		{
		kernel_dgemm_tt_4x4_vs_lib4(k, &alpha, offsetA, pA+ii*ps, sda, pU, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
		}
	goto tn_return;
#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	left_4_n0:
	kernel_dpacp_tn_4_lib4(k, offsetB, pB+jj*ps, sdb, pU);
	for(ii=0; ii<m-4; ii+=8)
		{
		kernel_dgemm_tt_8x4_vs_lib4(k, &alpha, offsetA, pA+ii*ps, sda, pU, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
		}
	if(ii<m)
		{
		kernel_dgemm_tt_4x4_vs_lib4(k, &alpha, offsetA, pA+ii*ps, sda, pU, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
		}
	goto tn_return;
#elif defined(TARGET_GENERIC) | defined(TARGET_X64_INTEL_CORE)
	left_4_n0:
	kernel_dpacp_tn_4_lib4(k, offsetB, pB+jj*ps, sdb, pU);
	for(ii=0; ii<m; ii+=4)
		{
		kernel_dgemm_tt_4x4_vs_lib4(k, &alpha, offsetA, pA+ii*ps, sda, pU, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
		}
	goto tn_return;
#endif



	tn_return:
	if(k>K_MAX_STACK)
		{
		free(mem);
		}
	return;

	}



// dgemm_tt
#if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_X64_INTEL_CORE) | defined(TARGET_GENERIC)
void blasfeo_dgemm_tt(int m, int n, int k, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, double beta, struct blasfeo_dmat *sC, int ci, int cj, struct blasfeo_dmat *sD, int di, int dj)
	{
	if(m<=0 || n<=0)
		return;

	// invalidate stored inverse diagonal of result matrix
	sD->use_dA = 0;

	const int ps = 4;

	int sda = sA->cn;
	int sdb = sB->cn;
	int sdc = sC->cn;
	int sdd = sD->cn;

	int air = ai & (ps-1);
	int bir = bi & (ps-1);

	// pA, pB point to panels edges
	double *pA = sA->pA + aj*ps + (ai-air)*sda;
	double *pB = sB->pA + bj*ps + (bi-bir)*sdb;
	double *pC = sC->pA + (cj-bir)*ps;
	double *pD = sD->pA + (dj-bir)*ps;
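
	/*
	 * In the tt variant the row offset of B (bir) turns into a column shift
	 * of the result: pC/pD start bir columns early, and the clear_bir path
	 * below writes the column range [bir, bir+n) of each gen kernel call.
	 */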

	int offsetA = air;

	int ci0 = ci; //-bir;
	int di0 = di; //-bir;
	int offsetC;
	int offsetD;
	if(ci0>=0)
		{
		pC += ci0/ps*ps*sdc;
		offsetC = ci0%ps;
		}
	else
		{
		pC += -ps*sdc;
		offsetC = ps+ci0;
		}

	if(di0>=0)
		{
		pD += di0/ps*ps*sdd;
		offsetD = di0%ps;
		}
	else
		{
		pD += -ps*sdd;
		offsetD = ps+di0;
		}

	int i, j, l;



	// algorithm scheme
	if(bir!=0)
		{
		goto clear_bir;
		}
	select_loop:
	if(offsetC==0 & offsetD==0)
		{
		goto loop_00;
		}
	else
		{
		goto loop_CD;
		}
	// should never get here
	return;



	// clean up at the beginning
	clear_bir:
#if defined(TARGET_X64_INTEL_HASWELL)
	if(bir+n>8) // (n>9)
		{
		i = 0;
		for(; i<m; i+=4)
			{
			kernel_dgemm_tt_4x12_gen_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[0], sdb, &beta, offsetC, &pC[i*sdc], sdc, offsetD, &pD[i*sdd], sdd, 0, m-i, bir, bir+n);
			}
		n -= 3*ps-bir;
		pB += 3*ps*sdb;
		pC += 3*4*ps;
		pD += 3*4*ps;
		}
	else // bir+n<=8
#endif
#if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	if(bir+n>4) // (n>5)
		{
		i = 0;
		for(; i<m; i+=4)
			{
			kernel_dgemm_tt_4x8_gen_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[0], sdb, &beta, offsetC, &pC[i*sdc], sdc, offsetD, &pD[i*sdd], sdd, 0, m-i, bir, bir+n);
			}
		n -= 2*ps-bir;
		pB += 2*ps*sdb;
		pC += 2*4*ps;
		pD += 2*4*ps;
		}
	else // bir+n<=4
		{
#endif
	i = 0;
	for(; i<m; i+=4)
		{
		kernel_dgemm_tt_4x4_gen_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[0], &beta, offsetC, &pC[i*sdc], sdc, offsetD, &pD[i*sdd], sdd, 0, m-i, bir, bir+n);
		}
	n -= 1*ps-bir;
	pB += 1*ps*sdb;
	pC += 1*4*ps;
	pD += 1*4*ps;
#if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	// nothing more to do
		}
#endif
	goto select_loop;



	// main loop aligned
	loop_00:
	j = 0;
#if defined(TARGET_X64_INTEL_HASWELL) //| defined(TARGET_ARMV8A_ARM_CORTEX_A53)
	for(; j<n-11; j+=12)
		{
		i = 0;
		for(; i<m-3; i+=4)
			{
			kernel_dgemm_tt_4x12_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
			}
		if(i<m)
			{
			kernel_dgemm_tt_4x12_vs_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
			}
		}
	if(n>j)
		{
		if(n-j<=4)
			{
			goto left_4;
			}
		else if(n-j<=8)
			{
			goto left_8;
			}
		else
			{
			goto left_12;
			}
		}
#elif 0//defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	for(; i<m-12 | i==m-8; i+=8)
		{
		j = 0;
		for(; j<n-3; j+=4)
			{
			kernel_dgemm_nn_8x4_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
			}
		if(j<n)
			{
			kernel_dgemm_nn_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
			}
		}
	if(m>i)
		{
		if(m-i<=4)
			{
			goto left_4;
			}
		else if(m-i<=8)
			{
			goto left_8;
			}
		else
			{
			goto left_12;
			}
		}
#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) //| defined(TARGET_ARMV8A_ARM_CORTEX_A57)
	for(; j<n-7; j+=8)
		{
		i = 0;
		for(; i<m-3; i+=4)
			{
			kernel_dgemm_tt_4x8_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
			}
		if(i<m)
			{
			kernel_dgemm_tt_4x8_vs_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
			}
		}
	if(n>j)
		{
		if(n-j<=4)
			{
			goto left_4;
			}
		else
			{
			goto left_8;
			}
		}
#elif 0//defined(TARGET_X86_AMD_BARCELONA)
	for(; i<m-3; i+=4)
		{
		j = 0;
		for(; j<n-1; j+=2)
			{
			kernel_dgemm_nn_4x2_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
			}
		if(j<n)
			{
			kernel_dgemm_nn_4x2_vs_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
			}
		}
	if(m>i)
		{
		goto left_4;
		}
#else // all others
	for(; j<n-3; j+=4)
		{
		i = 0;
		for(; i<m-3; i+=4)
			{
			kernel_dgemm_tt_4x4_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
			}
		if(i<m)
			{
			kernel_dgemm_tt_4x4_vs_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
			}
		}
	if(n>j)
		{
		goto left_4;
		}
#endif
2103	// common return if j==n
2104 return;
2105
2106
2107
2108 // main loop C, D not aligned
2109 loop_CD:
2110 j = 0;
2111 #if defined(TARGET_X64_INTEL_HASWELL)
2112 for(; j<n-8; j+=12)
2113 {
2114 i = 0;
2115 for(; i<m; i+=4)
2116 {
2117 kernel_dgemm_tt_4x12_gen_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sdb], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
2118 }
2119 }
2120 if(n>j)
2121 {
2122 if(n-j<=4)
2123 {
2124 goto left_4_g;
2125 }
2126 else
2127 {
2128 goto left_8_g;
2129 }
2130 }
2131 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
2132 for(; j<n-4; j+=8)
2133 {
2134 i = 0;
2135 for(; i<m; i+=4)
2136 {
2137 kernel_dgemm_tt_4x8_gen_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sdb], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
2138 }
2139 }
2140 if(n>j)
2141 {
2142 goto left_4_g;
2143 }
2144 #else
2145 for(; j<n; j+=4)
2146 {
2147 i = 0;
2148 for(; i<m; i+=4)
2149 {
2150 kernel_dgemm_tt_4x4_gen_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
2151 }
2152 }
2153 #endif
2154	// common return if j==n
2155 return;
2156
2157
2158
2159 // clean up loops definitions
2160
2161 #if defined(TARGET_X64_INTEL_HASWELL)
2162 left_12_g:
2163 i = 0;
2164 for(; i<m; i+=4)
2165 {
2166 kernel_dgemm_tt_4x12_gen_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sdb], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
2167 }
2168 return;
2169 #endif
2170
2171
2172
2173 #if defined(TARGET_X64_INTEL_HASWELL) //| defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
2174 left_12:
2175 i = 0;
2176 for(; i<m; i+=4)
2177 {
2178 kernel_dgemm_tt_4x12_vs_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
2179 }
2180 return;
2181 #endif
2182
2183
2184
2185 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE)
2186 left_8_g:
2187 i = 0;
2188 for(; i<m; i+=4)
2189 {
2190 kernel_dgemm_tt_4x8_gen_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sdb], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
2191 }
2192 return;
2193 #endif
2194
2195
2196
2197 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
2198 left_8:
2199 i = 0;
2200 for(; i<m; i+=4)
2201 {
2202 kernel_dgemm_tt_4x8_vs_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
2203 }
2204 return;
2205 #endif
2206
2207
2208
2209 left_4_g:
2210 i = 0;
2211 for(; i<m; i+=4)
2212 {
2213 kernel_dgemm_tt_4x4_gen_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
2214 }
2215 return;
2216
2217
2218
2219 #if defined(TARGET_X64_INTEL_HASWELL)
2220 left_4:
2221 i = 0;
2222 for(; i<m-8; i+=12)
2223 {
2224		kernel_dgemm_tt_12x4_vs_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
2225 }
2226 if(i<m-4)
2227 {
2228 kernel_dgemm_tt_8x4_vs_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
2229 }
2230 else if(i<m)
2231 {
2232		kernel_dgemm_tt_4x4_vs_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
2233 }
2234 return;
2235 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
2236 left_4:
2237 i = 0;
2238 for(; i<m-4; i+=8)
2239 {
2240		kernel_dgemm_tt_8x4_vs_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
2241 }
2242 if(i<m)
2243 {
2244		kernel_dgemm_tt_4x4_vs_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
2245 }
2246 return;
2247 #elif 0//defined(TARGET_X86_AMD_BARCELONA)
2248 left_4:
2249 j = 0;
2250 for(; j<n; j+=2)
2251 {
2252 kernel_dgemm_nn_4x2_vs_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
2253 }
2254 return;
2255 #else // all others
2256 left_4:
2257 i = 0;
2258 for(; i<m; i+=4)
2259 {
2260 kernel_dgemm_tt_4x4_vs_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
2261 }
2262 return;
2263 #endif
2264
2265 return;
2266
2267 }
2268
2269 #else
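// dgemm_tt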
2270 void blasfeo_dgemm_tt(int m, int n, int k, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, double beta, struct blasfeo_dmat *sC, int ci, int cj, struct blasfeo_dmat *sD, int di, int dj)
2271 {
2272 if(m<=0 || n<=0)
2273 return;
2274
2275 // invalidate stored inverse diagonal of result matrix
2276 sD->use_dA = 0;
2277
2278 const int ps = 4;
2279
2280 int sda = sA->cn;
2281 int sdb = sB->cn;
2282 int sdc = sC->cn;
2283 int sdd = sD->cn;
2284
2285 int air = ai & (ps-1);
2286 int bir = bi & (ps-1);
2287 int cir = ci & (ps-1);
2288 int dir = di & (ps-1);
2289
2290 double *pA = sA->pA + aj*ps + (ai-air)*sda;
2291 double *pB = sB->pA + bj*ps + (bi-bir)*sdb;
2292 double *pC = sC->pA + cj*ps + (ci-cir)*sdc;
2293 double *pD = sD->pA + dj*ps + (di-dir)*sdd;
2294
2295 int offsetA = air;
2296 int offsetB = bir;
2297 int offsetC = cir;
2298 int offsetD = dir;
2299
2300 // TODO visual studio alignment
2301 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
2302 ALIGNED( double pU[3*4*K_MAX_STACK], 64 );
2303 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57)
2304 ALIGNED( double pU[2*4*K_MAX_STACK], 64 );
2305 #elif defined(TARGET_GENERIC)
2306 double pU[1*4*K_MAX_STACK];
2307 #else
2308 ALIGNED( double pU[1*4*K_MAX_STACK], 64 );
2309 #endif
2310 int sdu = (k+3)/4*4;
2311 sdu = sdu<K_MAX_STACK ? sdu : K_MAX_STACK;
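	// pU is an on-stack scratch buffer holding one to three packed 4-row
	// panels of A^T, depending on the target (panel stride sdu, capped at
	// K_MAX_STACK); when k exceeds K_MAX_STACK the heap-based variants
	// loop_00_1 / loop_CD_1 are used instead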
2312
2313 struct blasfeo_dmat sAt;
2314 int sdat;
2315 int sAt_size;
2316 void *mem;
2317 char *mem_align;
2318 double *pAt;
2319
2320 int ii, jj;
2321
2322 int idxB;
2323
2324
2325
2326 // algorithm scheme
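	// four paths: the *_0 variants use the on-stack buffer pU, the *_1
	// variants a heap buffer; loop_00 assumes C and D panel-aligned, while
	// loop_CD handles their in-panel offsets with masked "gen" kernels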
2327 if(offsetC==0 & offsetD==0)
2328 {
2329 if(k>K_MAX_STACK)
2330 {
2331 goto loop_00_1;
2332 }
2333 else
2334 {
2335 goto loop_00_0;
2336 }
2337 }
2338 else
2339 {
2340 if(k>K_MAX_STACK)
2341 {
2342 goto loop_CD_1;
2343 }
2344 else
2345 {
2346 goto loop_CD_0;
2347 }
2348 }
2349 // should never get here
2350 return;
2351
2352
2353
2354 // main loop aligned
2355 loop_00_0:
2356 ii = 0;
2357 #if defined(TARGET_X64_INTEL_HASWELL)
2358 for(; ii<m-11; ii+=12)
2359 {
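		// pack 12 columns of A (i.e. 12 rows of op(A)=A^T) into pU as three
		// 4-row panels, so that nt kernels can be reused for the tt product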
2360 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pU+0*sdu);
2361 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pU+4*sdu);
2362 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+8)*ps, sda, pU+8*sdu);
2363 jj = 0;
2364 idxB = 0;
2365 // clean up at the beginning
2366 if(bir!=0)
2367 {
2368 kernel_dgemm_nt_12x4_gen_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
2369 jj += ps-bir;
2370 idxB += 4;
2371 }
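		// the gen kernel above anchors C and D one partial panel early
		// (-bir*ps) and masks columns [bir, bir+n-jj), so that only the
		// valid leading columns are stored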
2372 // main loop
2373 for(; jj<n-3; jj+=4, idxB+=4)
2374 {
2375 kernel_dgemm_nt_12x4_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd);
2376 }
2377 if(jj<n)
2378 {
2379 kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
2380 }
2381 }
2382 if(ii<m)
2383 {
2384 if(m-ii<=4)
2385 {
2386 goto left_4_0;
2387 }
2388 if(m-ii<=8)
2389 {
2390 goto left_8_0;
2391 }
2392 else
2393 {
2394 goto left_12_0;
2395 }
2396 }
2397 #elif defined(TARGET_ARMV8A_ARM_CORTEX_A53)
2398 for(; ii<m-11; ii+=12)
2399 {
2400 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pU+0*sdu);
2401 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pU+4*sdu);
2402 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+8)*ps, sda, pU+8*sdu);
2403 jj = 0;
2404 idxB = 0;
2405 // clean up at the beginning
2406 if(bir!=0)
2407 {
2408 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU+0*sdu, pB+idxB*sdb, &beta, 0, pC+(ii+0)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+0)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+0), bir, bir+n-jj);
2409 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU+4*sdu, pB+idxB*sdb, &beta, 0, pC+(ii+4)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+4)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+4), bir, bir+n-jj);
2410 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU+8*sdu, pB+idxB*sdb, &beta, 0, pC+(ii+8)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+8)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+8), bir, bir+n-jj);
2411 jj += ps-bir;
2412 idxB += 4;
2413 }
2414 // main loop
2415 for(; jj<n-3; jj+=4, idxB+=4)
2416 {
2417 kernel_dgemm_nt_12x4_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd);
2418 }
2419 if(jj<n)
2420 {
2421 kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
2422 }
2423 }
2424 if(ii<m)
2425 {
2426 if(m-ii<=4)
2427 {
2428 goto left_4_0;
2429 }
2430 if(m-ii<=8)
2431 {
2432 goto left_8_0;
2433 }
2434 else
2435 {
2436 goto left_12_0;
2437 }
2438 }
2439 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57)
2440 for(; ii<m-7; ii+=8)
2441 {
2442 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pU+0*sdu);
2443 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pU+4*sdu);
2444 jj = 0;
2445 idxB = 0;
2446 // clean up at the beginning
2447 if(bir!=0)
2448 {
2449 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
2450 kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
2451 #else
2452 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU+0*sdu, pB+idxB*sdb, &beta, 0, pC+(ii+0)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+0)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+0), bir, bir+n-jj);
2453 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU+4*sdu, pB+idxB*sdb, &beta, 0, pC+(ii+4)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+4)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+4), bir, bir+n-jj);
2454 #endif
2455 jj += ps-bir;
2456 idxB += 4;
2457 }
2458 // main loop
2459 for(; jj<n-3; jj+=4, idxB+=4)
2460 {
2461 kernel_dgemm_nt_8x4_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd);
2462 }
2463 if(jj<n)
2464 {
2465 kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
2466 }
2467 }
2468 if(ii<m)
2469 {
2470 if(m-ii<=4)
2471 {
2472 goto left_4_0;
2473 }
2474 else
2475 {
2476 goto left_8_0;
2477 }
2478 }
2479 #elif defined(TARGET_X86_AMD_BARCELONA)
2480 for(; ii<m-3; ii+=4)
2481 {
2482 kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pU);
2483 jj = 0;
2484 idxB = 0;
2485 // clean up at the beginning
2486 if(bir!=0)
2487 {
2488 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
2489 jj += ps-bir;
2490 idxB += 4;
2491 }
2492 // main loop
2493 for(; jj<n-3; jj+=4, idxB+=4)
2494 {
2495 kernel_dgemm_nt_4x2_lib4(k, &alpha, pU, pB+idxB*sdb+0, &beta, pC+ii*sdc+(jj+0)*ps, pD+ii*sdd+(jj+0)*ps);
2496 kernel_dgemm_nt_4x2_lib4(k, &alpha, pU, pB+idxB*sdb+2, &beta, pC+ii*sdc+(jj+2)*ps, pD+ii*sdd+(jj+2)*ps);
2497 }
2498 if(jj<n-2)
2499 {
2500 kernel_dgemm_nt_4x2_lib4(k, &alpha, pU, pB+idxB*sdb+0, &beta, pC+ii*sdc+(jj+0)*ps, pD+ii*sdd+(jj+0)*ps);
2501 kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, pU, pB+idxB*sdb+2, &beta, pC+ii*sdc+(jj+2)*ps, pD+ii*sdd+(jj+2)*ps, m-ii, n-(jj+2));
2502 }
2503 else if(jj<n)
2504 {
2505 kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, pU, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
2506 }
2507 }
2508 if(ii<m)
2509 {
2510 goto left_4_0;
2511 }
2512 #else
2513 for(; ii<m-3; ii+=4)
2514 {
2515 kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pU);
2516 jj = 0;
2517 idxB = 0;
2518 // clean up at the beginning
2519 if(bir!=0)
2520 {
2521 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
2522 jj += ps-bir;
2523 idxB += 4;
2524 }
2525 // main loop
2526 for(; jj<n-3; jj+=4, idxB+=4)
2527 {
2528 kernel_dgemm_nt_4x4_lib4(k, &alpha, pU, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps);
2529 }
2530 if(jj<n)
2531 {
2532 kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, pU, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
2533 }
2534 }
2535 if(ii<m)
2536 {
2537 goto left_4_0;
2538 }
2539 #endif
2540 goto tt_0_return;
2541
2542
2543
2544 // main loop C, D not aligned
2545 loop_CD_0:
2546 ii = 0;
2547 #if defined(TARGET_X64_INTEL_HASWELL)
2548 for(; ii<m-8; ii+=12)
2549 {
2550 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pU+0*sdu);
2551 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pU+4*sdu);
2552 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+8)*ps, sda, pU+8*sdu);
2553 jj = 0;
2554 idxB = 0;
2555 // clean up at the beginning
2556 if(bir!=0)
2557 {
2558 kernel_dgemm_nt_12x4_gen_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps-bir*ps, sdc, offsetD, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
2559 jj += ps-bir;
2560 idxB += 4;
2561 }
2562 // main loop
2563 for(; jj<n; jj+=4, idxB+=4)
2564 {
2565 kernel_dgemm_nt_12x4_gen_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps, sdc, offsetD, pD+ii*sdd+jj*ps, sdd, 0, m-ii, 0, n-jj);
2566 }
2567 }
2568 if(m>ii)
2569 {
2570 if(m-ii<=4)
2571 {
2572 goto left_4_0_g;
2573 }
2574 else
2575 {
2576 goto left_8_0_g;
2577 }
2578 }
2579 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
2580 for(; ii<m-4; ii+=8)
2581 {
2582 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pU+0*sdu);
2583 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pU+4*sdu);
2584 jj = 0;
2585 idxB = 0;
2586 // clean up at the beginning
2587 if(bir!=0)
2588 {
2589 kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps-bir*ps, sdc, offsetD, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
2590 jj += ps-bir;
2591 idxB += 4;
2592 }
2593 // main loop
2594 for(; jj<n; jj+=4, idxB+=4)
2595 {
2596 kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps, sdc, offsetD, pD+ii*sdd+jj*ps, sdd, 0, m-ii, 0, n-jj);
2597 }
2598 }
2599 if(m>ii)
2600 {
2601 goto left_4_0_g;
2602 }
2603 #else
2604 for(; ii<m; ii+=4)
2605 {
2606 kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pU);
2607 jj = 0;
2608 idxB = 0;
2609 // clean up at the beginning
2610 if(bir!=0)
2611 {
2612 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps-bir*ps, sdc, offsetD, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
2613 jj += ps-bir;
2614 idxB += 4;
2615 }
2616 // main loop
2617 for(; jj<n; jj+=4, idxB+=4)
2618 {
2619 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps, sdc, offsetD, pD+ii*sdd+jj*ps, sdd, 0, m-ii, 0, n-jj);
2620 }
2621 }
2622 #endif
2623 // common return if i==m
2624 goto tt_0_return;
2625
2626
2627
2628 #if defined(TARGET_X64_INTEL_HASWELL)
2629 left_12_0:
2630 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pU+0*sdu);
2631 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pU+4*sdu);
2632 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+8)*ps, sda, pU+8*sdu);
2633 jj = 0;
2634 idxB = 0;
2635 // clean up at the beginning
2636 if(bir!=0)
2637 {
2638 kernel_dgemm_nt_12x4_gen_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
2639 jj += ps-bir;
2640 idxB += 4;
2641 }
2642 // main loop
2643 for(; jj<n; jj+=4, idxB+=4)
2644 {
2645 kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
2646 }
2647 goto tt_0_return;
2648 #elif defined(TARGET_ARMV8A_ARM_CORTEX_A53)
2649 left_12_0:
2650 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pU+0*sdu);
2651 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pU+4*sdu);
2652 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+8)*ps, sda, pU+8*sdu);
2653 jj = 0;
2654 idxB = 0;
2655 // clean up at the beginning
2656 if(bir!=0)
2657 {
2658 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU+0*sdu, pB+idxB*sdb, &beta, 0, pC+(ii+0)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+0)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+0), bir, bir+n-jj);
2659 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU+4*sdu, pB+idxB*sdb, &beta, 0, pC+(ii+4)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+4)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+4), bir, bir+n-jj);
2660 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU+8*sdu, pB+idxB*sdb, &beta, 0, pC+(ii+8)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+8)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+8), bir, bir+n-jj);
2661 jj += ps-bir;
2662 idxB += 4;
2663 }
2664 // main loop
2665 for(; jj<n; jj+=4, idxB+=4)
2666 {
2667 kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
2668 }
2669 goto tt_0_return;
2670 #endif
2671
2672
2673
2674 #if defined(TARGET_X64_INTEL_HASWELL)
2675 left_8_0:
2676 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pU+0*sdu);
2677 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pU+4*sdu);
2678 jj = 0;
2679 idxB = 0;
2680 // clean up at the beginning
2681 if(bir!=0)
2682 {
2683 kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
2684 jj += ps-bir;
2685 idxB += 4;
2686 }
2687 // main loop
2688 for(; jj<n-8; jj+=12, idxB+=12)
2689 {
2690 kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, pU, sdu, pB+(idxB+0)*sdb, sdb, &beta, pC+ii*sdc+(jj+0)*ps, sdc, pD+ii*sdd+(jj+0)*ps, sdd, m-ii, n-(jj+0));
2691 kernel_dgemm_nt_8x8u_vs_lib4(k, &alpha, pU, sdu, pB+(idxB+4)*sdb, sdb, &beta, pC+ii*sdc+(jj+4)*ps, sdc, pD+ii*sdd+(jj+4)*ps, sdd, m-ii, n-(jj+4));
2692 }
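	// the fused 8x8 lower/upper kernel pair above covers 12 columns of D per
	// iteration while reusing the same packed pU panels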
2693 if(jj<n)
2694 {
2695 if(n-jj<=4)
2696 {
2697 kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
2698 }
2699 else
2700 {
2701 kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, pU, sdu, pB+(idxB+0)*sdb, sdb, &beta, pC+ii*sdc+(jj+0)*ps, sdc, pD+ii*sdd+(jj+0)*ps, sdd, m-ii, n-(jj+0));
2702 kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, pU, pB+(idxB+4)*sdb, &beta, pC+ii*sdc+(jj+4)*ps, pD+ii*sdd+(jj+4)*ps, m-ii, n-(jj+4));
2703 }
2704 }
2705 goto tt_0_return;
2706 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
2707 left_8_0:
2708 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pU+0*sdu);
2709 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pU+4*sdu);
2710 jj = 0;
2711 idxB = 0;
2712 // clean up at the beginning
2713 if(bir!=0)
2714 {
2715 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
2716 kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
2717 #else
2718 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU+0*sdu, pB+idxB*sdb, &beta, 0, pC+(ii+0)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+0)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+0), bir, bir+n-jj);
2719 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU+4*sdu, pB+idxB*sdb, &beta, 0, pC+(ii+4)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+4)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+4), bir, bir+n-jj);
2720 #endif
2721 jj += ps-bir;
2722 idxB += 4;
2723 }
2724 // main loop
2725 for(; jj<n; jj+=4, idxB+=4)
2726 {
2727 kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
2728 }
2729 goto tt_0_return;
2730 #endif
2731
2732
2733
2734 #if defined(TARGET_X64_INTEL_HASWELL)
2735 left_8_0_g:
2736 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pU+0*sdu);
2737 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pU+4*sdu);
2738 jj = 0;
2739 idxB = 0;
2740 // clean up at the beginning
2741 if(bir!=0)
2742 {
2743 kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps-bir*ps, sdc, offsetD, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
2744 jj += ps-bir;
2745 idxB += 4;
2746 }
2747 // main loop
2748 for(; jj<n; jj+=4, idxB+=4)
2749 {
2750 kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps, sdc, offsetD, pD+ii*sdd+jj*ps, sdd, 0, m-ii, 0, n-jj);
2751 }
2752 goto tt_0_return;
2753 #endif
2754
2755
2756
2757 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
2758 left_4_0:
2759 kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pU);
2760 jj = 0;
2761 idxB = 0;
2762 // clean up at the beginning
2763 if(bir!=0)
2764 {
2765 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
2766 jj += ps-bir;
2767 idxB += 4;
2768 }
2769 // main loop
2770 for(; jj<n-8; jj+=12, idxB+=12)
2771 {
2772 kernel_dgemm_nt_4x12_vs_lib4(k, &alpha, pU, pB+idxB*sdb, sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
2773 }
2774 if(jj<n)
2775 {
2776 if(n-jj<=4)
2777 {
2778 kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, pU, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
2779 }
2780 else
2781 {
2782 kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, pU, pB+idxB*sdb, sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
2783 }
2784 }
2785 goto tt_0_return;
2786 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
2787 left_4_0:
2788 kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pU);
2789 jj = 0;
2790 idxB = 0;
2791 // clean up at the beginning
2792 if(bir!=0)
2793 {
2794 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
2795 jj += ps-bir;
2796 idxB += 4;
2797 }
2798 // main loop
2799 for(; jj<n-4; jj+=8, idxB+=8)
2800 {
2801 kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, pU, pB+idxB*sdb, sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
2802 }
2803 if(jj<n)
2804 {
2805 kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, pU, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
2806 }
2807 goto tt_0_return;
2808 #elif defined(TARGET_X86_AMD_BARCELONA)
2809 left_4_0:
2810 kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pU);
2811 jj = 0;
2812 idxB = 0;
2813 // clean up at the beginning
2814 if(bir!=0)
2815 {
2816 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
2817 jj += ps-bir;
2818 idxB += 4;
2819 }
2820 // main loop
2821 for(; jj<n-2; jj+=4, idxB+=4)
2822 {
2823 kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, pU, pB+idxB*sdb+0, &beta, pC+ii*sdc+(jj+0)*ps, pD+ii*sdd+(jj+0)*ps, m-ii, n-(jj+0));
2824 kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, pU, pB+idxB*sdb+2, &beta, pC+ii*sdc+(jj+2)*ps, pD+ii*sdd+(jj+2)*ps, m-ii, n-(jj+2));
2825 }
2826 if(jj<n)
2827 {
2828 kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, pU, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
2829 }
2830 goto tt_0_return;
2831 #else
2832 left_4_0:
2833 kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pU);
2834 jj = 0;
2835 idxB = 0;
2836 // clean up at the beginning
2837 if(bir!=0)
2838 {
2839 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
2840 jj += ps-bir;
2841 idxB += 4;
2842 }
2843 // main loop
2844 for(; jj<n; jj+=4, idxB+=4)
2845 {
2846 kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, pU, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
2847 }
2848 goto tt_0_return;
2849 #endif
2850
2851
2852
2853 left_4_0_g:
2854 kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pU);
2855 jj = 0;
2856 idxB = 0;
2857 // clean up at the beginning
2858 if(bir!=0)
2859 {
2860 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps-bir*ps, sdc, offsetD, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
2861 jj += ps-bir;
2862 idxB += 4;
2863 }
2864 // main loop
2865 for(; jj<n; jj+=4, idxB+=4)
2866 {
2867 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps, sdc, offsetD, pD+ii*sdd+jj*ps, sdd, 0, m-ii, 0, n-jj);
2868 }
2869 goto tt_0_return;
2870
2871
2872
2873 tt_0_return:
2874 return;
2875
2876
2877
2878 loop_00_1:
2879 sAt_size = blasfeo_memsize_dmat(12, k);
2880 mem = malloc(sAt_size+64);
2881 blasfeo_align_64_byte(mem, (void **) &mem_align);
2882 blasfeo_create_dmat(12, k, &sAt, (void *) mem_align);
2883 pAt = sAt.pA;
2884 sdat = sAt.cn;
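	// k exceeds K_MAX_STACK: pack A^T into a 12 x k blasfeo_dmat allocated
	// on the heap and aligned to 64 bytes, then run the same scheme as
	// loop_00_0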
2885
2886 ii = 0;
2887 #if defined(TARGET_X64_INTEL_HASWELL)
2888 for(; ii<m-11; ii+=12)
2889 {
2890 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pAt+0*sdat);
2891 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pAt+4*sdat);
2892 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+8)*ps, sda, pAt+8*sdat);
2893 jj = 0;
2894 idxB = 0;
2895 // clean up at the beginning
2896 if(bir!=0)
2897 {
2898 kernel_dgemm_nt_12x4_gen_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
2899 jj += ps-bir;
2900 idxB += 4;
2901 }
2902 // main loop
2903 for(; jj<n-3; jj+=4, idxB+=4)
2904 {
2905 kernel_dgemm_nt_12x4_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd);
2906 }
2907 if(jj<n)
2908 {
2909		kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
2910 }
2911 }
2912 if(ii<m)
2913 {
2914 if(m-ii<=4)
2915 {
2916 goto left_4_1;
2917 }
2918 if(m-ii<=8)
2919 {
2920 goto left_8_1;
2921 }
2922 else
2923 {
2924 goto left_12_1;
2925 }
2926 }
2927 #elif defined(TARGET_ARMV8A_ARM_CORTEX_A53)
2928 for(; ii<m-11; ii+=12)
2929 {
2930 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pAt+0*sdat);
2931 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pAt+4*sdat);
2932 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+8)*ps, sda, pAt+8*sdat);
2933 jj = 0;
2934 idxB = 0;
2935 // clean up at the beginning
2936 if(bir!=0)
2937 {
2938 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt+0*sdat, pB+idxB*sdb, &beta, 0, pC+(ii+0)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+0)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+0), bir, bir+n-jj);
2939 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt+4*sdat, pB+idxB*sdb, &beta, 0, pC+(ii+4)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+4)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+4), bir, bir+n-jj);
2940 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt+8*sdat, pB+idxB*sdb, &beta, 0, pC+(ii+8)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+8)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+8), bir, bir+n-jj);
2941 jj += ps-bir;
2942 idxB += 4;
2943 }
2944 // main loop
2945 for(; jj<n-3; jj+=4, idxB+=4)
2946 {
2947 kernel_dgemm_nt_12x4_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd);
2948 }
2949 if(jj<n)
2950 {
2951		kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
2952 }
2953 }
2954 if(ii<m)
2955 {
2956 if(m-ii<=4)
2957 {
2958 goto left_4_1;
2959 }
2960 if(m-ii<=8)
2961 {
2962 goto left_8_1;
2963 }
2964 else
2965 {
2966 goto left_12_1;
2967 }
2968 }
2969 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57)
2970 for(; ii<m-7; ii+=8)
2971 {
2972 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pAt+0*sdat);
2973 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pAt+4*sdat);
2974 jj = 0;
2975 idxB = 0;
2976 // clean up at the beginning
2977 if(bir!=0)
2978 {
2979 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
2980 kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
2981 #else
2982 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt+0*sdat, pB+idxB*sdb, &beta, 0, pC+(ii+0)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+0)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+0), bir, bir+n-jj);
2983 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt+4*sdat, pB+idxB*sdb, &beta, 0, pC+(ii+4)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+4)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+4), bir, bir+n-jj);
2984 #endif
2985 jj += ps-bir;
2986 idxB += 4;
2987 }
2988 // main loop
2989 for(; jj<n-3; jj+=4, idxB+=4)
2990 {
2991 kernel_dgemm_nt_8x4_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd);
2992 }
2993 if(jj<n)
2994 {
2995		kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
2996 }
2997 }
2998 if(ii<m)
2999 {
3000 if(m-ii<=4)
3001 {
3002 goto left_4_1;
3003 }
3004 else
3005 {
3006 goto left_8_1;
3007 }
3008 }
3009 #elif defined(TARGET_X86_AMD_BARCELONA)
3010 for(; ii<m-3; ii+=4)
3011 {
3012 kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pAt);
3013 jj = 0;
3014 idxB = 0;
3015 // clean up at the beginning
3016 if(bir!=0)
3017 {
3018 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
3019 jj += ps-bir;
3020 idxB += 4;
3021 }
3022 // main loop
3023 for(; jj<n-3; jj+=4, idxB+=4)
3024 {
3025 kernel_dgemm_nt_4x2_lib4(k, &alpha, pAt, pB+idxB*sdb+0, &beta, pC+ii*sdc+(jj+0)*ps, pD+ii*sdd+(jj+0)*ps);
3026 kernel_dgemm_nt_4x2_lib4(k, &alpha, pAt, pB+idxB*sdb+2, &beta, pC+ii*sdc+(jj+2)*ps, pD+ii*sdd+(jj+2)*ps);
3027 }
3028 if(jj<n-2)
3029 {
3030 kernel_dgemm_nt_4x2_lib4(k, &alpha, pAt, pB+idxB*sdb+0, &beta, pC+ii*sdc+(jj+0)*ps, pD+ii*sdd+(jj+0)*ps);
3031 kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, pAt, pB+idxB*sdb+2, &beta, pC+ii*sdc+(jj+2)*ps, pD+ii*sdd+(jj+2)*ps, m-ii, n-(jj+2));
3032 }
3033 else if(jj<n)
3034 {
3035 kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, pAt, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
3036 }
3037 }
3038 if(ii<m)
3039 {
3040 goto left_4_1;
3041 }
3042 #else
3043 for(; ii<m-3; ii+=4)
3044 {
3045 kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pAt);
3046 jj = 0;
3047 idxB = 0;
3048 // clean up at the beginning
3049 if(bir!=0)
3050 {
3051 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
3052 jj += ps-bir;
3053 idxB += 4;
3054 }
3055 // main loop
3056 for(; jj<n-3; jj+=4, idxB+=4)
3057 {
3058 kernel_dgemm_nt_4x4_lib4(k, &alpha, pAt, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps);
3059 }
3060 if(jj<n)
3061 {
3062 kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, pAt, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
3063 }
3064 }
3065 if(ii<m)
3066 {
3067 goto left_4_1;
3068 }
3069 #endif
3070 goto tt_1_return;
3071
3072
3073
3074 // main loop C, D not aligned
3075 loop_CD_1:
3076 sAt_size = blasfeo_memsize_dmat(12, k);
3077 mem = malloc(sAt_size+64);
3078 blasfeo_align_64_byte(mem, (void **) &mem_align);
3079 blasfeo_create_dmat(12, k, &sAt, (void *) mem_align);
3080 pAt = sAt.pA;
3081 sdat = sAt.cn;
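	// same heap-allocated A^T workspace as in loop_00_1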
3082
3083 ii = 0;
3084 #if defined(TARGET_X64_INTEL_HASWELL)
3085 for(; ii<m-8; ii+=12)
3086 {
3087 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pAt+0*sdat);
3088 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pAt+4*sdat);
3089 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+8)*ps, sda, pAt+8*sdat);
3090 jj = 0;
3091 idxB = 0;
3092 // clean up at the beginning
3093 if(bir!=0)
3094 {
3095 kernel_dgemm_nt_12x4_gen_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps-bir*ps, sdc, offsetD, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
3096 jj += ps-bir;
3097 idxB += 4;
3098 }
3099 // main loop
3100 for(; jj<n; jj+=4, idxB+=4)
3101 {
3102 kernel_dgemm_nt_12x4_gen_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps, sdc, offsetD, pD+ii*sdd+jj*ps, sdd, 0, m-ii, 0, n-jj);
3103 }
3104 }
3105 if(m>ii)
3106 {
3107 if(m-ii<=4)
3108 {
3109 goto left_4_1_g;
3110 }
3111 else
3112 {
3113 goto left_8_1_g;
3114 }
3115 }
3116 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
3117 for(; ii<m-4; ii+=8)
3118 {
3119 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pAt+0*sdat);
3120 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pAt+4*sdat);
3121 jj = 0;
3122 idxB = 0;
3123 // clean up at the beginning
3124 if(bir!=0)
3125 {
3126 kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps-bir*ps, sdc, offsetD, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
3127 jj += ps-bir;
3128 idxB += 4;
3129 }
3130 // main loop
3131 for(; jj<n; jj+=4, idxB+=4)
3132 {
3133 kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps, sdc, offsetD, pD+ii*sdd+jj*ps, sdd, 0, m-ii, 0, n-jj);
3134 }
3135 }
3136 if(m>ii)
3137 {
3138 goto left_4_1_g;
3139 }
3140 #else
3141 for(; ii<m; ii+=4)
3142 {
3143 kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pAt);
3144 jj = 0;
3145 idxB = 0;
3146 // clean up at the beginning
3147 if(bir!=0)
3148 {
3149 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps-bir*ps, sdc, offsetD, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
3150 jj += ps-bir;
3151 idxB += 4;
3152 }
3153 // main loop
3154 for(; jj<n; jj+=4, idxB+=4)
3155 {
3156 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps, sdc, offsetD, pD+ii*sdd+jj*ps, sdd, 0, m-ii, 0, n-jj);
3157 }
3158 }
3159 #endif
3160 // common return if i==m
3161 goto tt_1_return;
3162
3163
3164
3165 #if defined(TARGET_X64_INTEL_HASWELL)
3166 left_12_1:
3167 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pAt+0*sdat);
3168 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pAt+4*sdat);
3169 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+8)*ps, sda, pAt+8*sdat);
3170 jj = 0;
3171 idxB = 0;
3172 // clean up at the beginning
3173 if(bir!=0)
3174 {
3175 kernel_dgemm_nt_12x4_gen_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
3176 jj += ps-bir;
3177 idxB += 4;
3178 }
3179 // main loop
3180 for(; jj<n; jj+=4, idxB+=4)
3181 {
3182 kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
3183 }
3184 goto tt_1_return;
3185 #elif defined(TARGET_ARMV8A_ARM_CORTEX_A53)
3186 left_12_1:
3187 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pAt+0*sdat);
3188 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pAt+4*sdat);
3189 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+8)*ps, sda, pAt+8*sdat);
3190 jj = 0;
3191 idxB = 0;
3192 // clean up at the beginning
3193 if(bir!=0)
3194 {
3195 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt+0*sdat, pB+idxB*sdb, &beta, 0, pC+(ii+0)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+0)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+0), bir, bir+n-jj);
3196 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt+4*sdat, pB+idxB*sdb, &beta, 0, pC+(ii+4)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+4)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+4), bir, bir+n-jj);
3197 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt+8*sdat, pB+idxB*sdb, &beta, 0, pC+(ii+8)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+8)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+8), bir, bir+n-jj);
3198 jj += ps-bir;
3199 idxB += 4;
3200 }
3201 // main loop
3202 for(; jj<n; jj+=4, idxB+=4)
3203 {
3204 kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
3205 }
3206 goto tt_1_return;
3207 #endif
3208
3209
3210
3211 #if defined(TARGET_X64_INTEL_HASWELL)
3212 left_8_1:
3213 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pAt+0*sdat);
3214 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pAt+4*sdat);
3215 jj = 0;
3216 idxB = 0;
3217 // clean up at the beginning
3218 if(bir!=0)
3219 {
3220 kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
3221 jj += ps-bir;
3222 idxB += 4;
3223 }
3224 // main loop
3225 for(; jj<n-8; jj+=12, idxB+=12)
3226 {
3227 kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, pAt, sdat, pB+(idxB+0)*sdb, sdb, &beta, pC+ii*sdc+(jj+0)*ps, sdc, pD+ii*sdd+(jj+0)*ps, sdd, m-ii, n-(jj+0));
3228 kernel_dgemm_nt_8x8u_vs_lib4(k, &alpha, pAt, sdat, pB+(idxB+4)*sdb, sdb, &beta, pC+ii*sdc+(jj+4)*ps, sdc, pD+ii*sdd+(jj+4)*ps, sdd, m-ii, n-(jj+4));
3229 }
3230 if(jj<n)
3231 {
3232 if(n-jj<=4)
3233 {
3234 kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
3235 }
3236 else
3237 {
3238 kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, pAt, sdat, pB+(idxB+0)*sdb, sdb, &beta, pC+ii*sdc+(jj+0)*ps, sdc, pD+ii*sdd+(jj+0)*ps, sdd, m-ii, n-(jj+0));
3239 kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, pAt, pB+(idxB+4)*sdb, &beta, pC+ii*sdc+(jj+4)*ps, pD+ii*sdd+(jj+4)*ps, m-ii, n-(jj+4));
3240 }
3241 }
3242 goto tt_1_return;
3243 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
3244 left_8_1:
3245 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pAt+0*sdat);
3246 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pAt+4*sdat);
3247 jj = 0;
3248 idxB = 0;
3249 // clean up at the beginning
3250 if(bir!=0)
3251 {
3252 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
3253 kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
3254 #else
3255 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt+0*sdat, pB+idxB*sdb, &beta, 0, pC+(ii+0)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+0)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+0), bir, bir+n-jj);
3256 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt+4*sdat, pB+idxB*sdb, &beta, 0, pC+(ii+4)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+4)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+4), bir, bir+n-jj);
3257 #endif
3258 jj += ps-bir;
3259 idxB += 4;
3260 }
3261 // main loop
3262 for(; jj<n; jj+=4, idxB+=4)
3263 {
3264 kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
3265 }
3266 goto tt_1_return;
3267 #endif
3268
3269
3270
3271 #if defined(TARGET_X64_INTEL_HASWELL)
3272 left_8_1_g:
3273 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pAt+0*sdat);
3274 kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pAt+4*sdat);
3275 jj = 0;
3276 idxB = 0;
3277 // clean up at the beginning
3278 if(bir!=0)
3279 {
3280 kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps-bir*ps, sdc, offsetD, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
3281 jj += ps-bir;
3282 idxB += 4;
3283 }
3284 // main loop
3285 for(; jj<n; jj+=4, idxB+=4)
3286 {
3287 kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps, sdc, offsetD, pD+ii*sdd+jj*ps, sdd, 0, m-ii, 0, n-jj);
3288 }
3289 goto tt_1_return;
3290 #endif
3291
3292
3293
3294 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
3295 left_4_1:
3296 kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pAt);
3297 jj = 0;
3298 idxB = 0;
3299 // clean up at the beginning
3300 if(bir!=0)
3301 {
3302 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
3303 jj += ps-bir;
3304 idxB += 4;
3305 }
3306 // main loop
3307 for(; jj<n-8; jj+=12, idxB+=12)
3308 {
3309 kernel_dgemm_nt_4x12_vs_lib4(k, &alpha, pAt, pB+idxB*sdb, sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
3310 }
3311 if(jj<n)
3312 {
3313 if(n-jj<=4)
3314 {
3315 kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, pAt, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
3316 }
3317 else
3318 {
3319 kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, pAt, pB+idxB*sdb, sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
3320 }
3321 }
3322 goto tt_1_return;
3323 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
3324 left_4_1:
3325 kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pAt);
3326 jj = 0;
3327 idxB = 0;
3328 // clean up at the beginning
3329 if(bir!=0)
3330 {
3331 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
3332 jj += ps-bir;
3333 idxB += 4;
3334 }
3335 // main loop
3336 for(; jj<n-4; jj+=8, idxB+=8)
3337 {
3338 kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, pAt, pB+idxB*sdb, sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
3339 }
3340 if(jj<n)
3341 {
3342 kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, pAt, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
3343 }
3344 goto tt_1_return;
3345 #elif defined(TARGET_X86_AMD_BARCELONA)
3346 left_4_1:
3347 kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pAt);
3348 jj = 0;
3349 idxB = 0;
3350 // clean up at the beginning
3351 if(bir!=0)
3352 {
3353 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
3354 jj += ps-bir;
3355 idxB += 4;
3356 }
3357 // main loop
3358 for(; jj<n-2; jj+=4, idxB+=4)
3359 {
3360 kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, pAt, pB+idxB*sdb+0, &beta, pC+ii*sdc+(jj+0)*ps, pD+ii*sdd+(jj+0)*ps, m-ii, n-(jj+0));
3361 kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, pAt, pB+idxB*sdb+2, &beta, pC+ii*sdc+(jj+2)*ps, pD+ii*sdd+(jj+2)*ps, m-ii, n-(jj+2));
3362 }
3363 if(jj<n)
3364 {
3365 kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, pAt, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
3366 }
3367 goto tt_1_return;
3368 #else
3369 left_4_1:
3370 kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pAt);
3371 jj = 0;
3372 idxB = 0;
3373 // clean up at the beginning
3374 if(bir!=0)
3375 {
3376 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
3377 jj += ps-bir;
3378 idxB += 4;
3379 }
3380 // main loop
3381 for(; jj<n; jj+=4, idxB+=4)
3382 {
3383 kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, pAt, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
3384 }
3385 goto tt_1_return;
3386 #endif
3387
3388
3389
3390 left_4_1_g:
3391 kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pAt);
3392 jj = 0;
3393 idxB = 0;
3394 // clean up at the beginning
3395 if(bir!=0)
3396 {
3397 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps-bir*ps, sdc, offsetD, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
3398 jj += ps-bir;
3399 idxB += 4;
3400 }
3401 // main loop
3402 for(; jj<n; jj+=4, idxB+=4)
3403 {
3404 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps, sdc, offsetD, pD+ii*sdd+jj*ps, sdd, 0, m-ii, 0, n-jj);
3405 }
3406 goto tt_1_return;
3407
3408
3409
3410 tt_1_return:
3411 free(mem);
3412 return;
3413
3414 }
3415 #endif
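// usage sketch for blasfeo_dgemm_tt (illustrative sizes; assumes the standard
// BLASFEO aux API for allocation and packing):
//
//   struct blasfeo_dmat sA, sB, sC, sD;
//   blasfeo_allocate_dmat(8, 6, &sA); // A is k x m, so op(A) = A^T is m x k
//   blasfeo_allocate_dmat(5, 8, &sB); // B is n x k, so op(B) = B^T is k x n
//   blasfeo_allocate_dmat(6, 5, &sC);
//   blasfeo_allocate_dmat(6, 5, &sD);
//   // ... fill sA, sB, sC (e.g. with blasfeo_pack_dmat) ...
//   // D <= 1.0 * A^T * B^T + 0.0 * C, with m=6, n=5, k=8
//   blasfeo_dgemm_tt(6, 5, 8, 1.0, &sA, 0, 0, &sB, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
//   blasfeo_free_dmat(&sA); // and likewise for sB, sC, sD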
3416
3417
3418
3419 // dtrsm_llnn
3420 void blasfeo_dtrsm_llnn(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, struct blasfeo_dmat *sD, int di, int dj)
3421 {
3422 // invalidate stored inverse diagonal of result matrix
3423 sD->use_dA = 0;
3424
3425 if(ai!=0 | bi!=0 | di!=0)
3426 {
3427 printf("\nblasfeo_dtrsm_llnn: feature not implemented yet: ai=%d, bi=%d, di=%d\n", ai, bi, di);
3428 exit(1);
3429 }
3430
3431 const int ps = 4;
3432
3433 // TODO alpha
3434 int sda = sA->cn;
3435 int sdb = sB->cn;
3436 int sdd = sD->cn;
3437 double *pA = sA->pA + aj*ps;
3438 double *pB = sB->pA + bj*ps;
3439 double *pD = sD->pA + dj*ps;
3440 double *dA = sA->dA;
3441
3442 if(m<=0 || n<=0)
3443 return;
3444
3445 int i, j;
3446
3447 if(ai==0 & aj==0)
3448 {
3449 // recompute diagonal if size of operation grows
3450 if(sA->use_dA<m)
3451 {
3452 ddiaex_lib(m, 1.0, ai, pA, sda, dA);
3453 for(i=0; i<m; i++)
3454 dA[i] = 1.0 / dA[i];
3455 sA->use_dA = m;
3456 }
3457 }
3458 // if submatrix recompute diagonal
3459 else
3460 {
3461 ddiaex_lib(m, 1.0, ai, pA, sda, dA);
3462 for(i=0; i<m; i++)
3463 dA[i] = 1.0 / dA[i];
3464 sA->use_dA = 0;
3465 }
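	// dA now caches the reciprocals of A's diagonal so that the _inv_
	// kernels can multiply instead of divide; sA->use_dA records for how
	// many entries the cache is valid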
3466
3467 i = 0;
3468 #if defined(TARGET_X64_INTEL_HASWELL)
3469 for( ; i<m-11; i+=12)
3470 {
3471 j = 0;
3472 for( ; j<n-3; j+=4)
3473 {
3474 kernel_dtrsm_nn_ll_inv_12x4_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, &alpha, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda, dA+i);
3475 }
3476 if(j<n)
3477 {
3478 kernel_dtrsm_nn_ll_inv_12x4_vs_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, &alpha, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda, dA+i, m-i, n-j);
3479 }
3480 }
3481 if(i<m)
3482 {
3483 if(m-i<=4)
3484 {
3485 goto left_4;
3486 }
3487 if(m-i<=8)
3488 {
3489 goto left_8;
3490 }
3491 else
3492 {
3493 goto left_12;
3494 }
3495 }
3496 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
3497 for( ; i<m-7; i+=8)
3498 {
3499 j = 0;
3500 for( ; j<n-3; j+=4)
3501 {
3502 kernel_dtrsm_nn_ll_inv_8x4_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, &alpha, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda, dA+i);
3503 }
3504 if(j<n)
3505 {
3506 kernel_dtrsm_nn_ll_inv_8x4_vs_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, &alpha, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda, dA+i, m-i, n-j);
3507 }
3508 }
3509 if(i<m)
3510 {
3511 if(m-i<=4)
3512 {
3513 goto left_4;
3514 }
3515 else
3516 {
3517 goto left_8;
3518 }
3519 }
3520 #else
3521 for( ; i<m-3; i+=4)
3522 {
3523 j = 0;
3524 for( ; j<n-3; j+=4)
3525 {
3526 kernel_dtrsm_nn_ll_inv_4x4_lib4(i, pA+i*sda, pD+j*ps, sdd, &alpha, pB+i*sdb+j*ps, pD+i*sdd+j*ps, pA+i*sda+i*ps, dA+i);
3527 }
3528 if(j<n)
3529 {
3530 kernel_dtrsm_nn_ll_inv_4x4_vs_lib4(i, pA+i*sda, pD+j*ps, sdd, &alpha, pB+i*sdb+j*ps, pD+i*sdd+j*ps, pA+i*sda+i*ps, dA+i, m-i, n-j);
3531 }
3532 }
3533 if(i<m)
3534 {
3535 goto left_4;
3536 }
3537 #endif
3538 // common return
3539 return;
3540
3541 #if defined(TARGET_X64_INTEL_HASWELL)
3542 left_12:
3543 j = 0;
3544 for( ; j<n; j+=4)
3545 {
3546 kernel_dtrsm_nn_ll_inv_12x4_vs_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, &alpha, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda, dA+i, m-i, n-j);
3547 }
3548 return;
3549 #endif
3550
3551 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
3552 left_8:
3553 j = 0;
3554 for( ; j<n; j+=4)
3555 {
3556 kernel_dtrsm_nn_ll_inv_8x4_vs_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, &alpha, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda, dA+i, m-i, n-j);
3557 }
3558 return;
3559 #endif
3560
3561 left_4:
3562 j = 0;
3563 for( ; j<n; j+=4)
3564 {
3565 kernel_dtrsm_nn_ll_inv_4x4_vs_lib4(i, pA+i*sda, pD+j*ps, sdd, &alpha, pB+i*sdb+j*ps, pD+i*sdd+j*ps, pA+i*sda+i*ps, dA+i, m-i, n-j);
3566 }
3567 return;
3568
3569 }
3570
3571
3572
3573 // dtrsm_llnu
3574 void blasfeo_dtrsm_llnu(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, struct blasfeo_dmat *sD, int di, int dj)
3575 {
3576 // invalidate stored inverse diagonal of result matrix
3577 sD->use_dA = 0;
3578
3579 if(ai!=0 | bi!=0 | di!=0)
3580 {
3581 printf("\nblasfeo_dtrsm_llnu: feature not implemented yet: ai=%d, bi=%d, di=%d\n", ai, bi, di);
3582 exit(1);
3583 }
3584
3585 const int ps = 4;
3586
3587 // TODO alpha
3588 int sda = sA->cn;
3589 int sdb = sB->cn;
3590 int sdd = sD->cn;
3591 double *pA = sA->pA + aj*ps;
3592 double *pB = sB->pA + bj*ps;
3593 double *pD = sD->pA + dj*ps;
3594
3595 if(m<=0 || n<=0)
3596 return;
3597
3598 int i, j;
3599
3600 i = 0;
3601 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
3602 for( ; i<m-11; i+=12)
3603 {
3604 j = 0;
3605 for( ; j<n-3; j+=4)
3606 {
3607 kernel_dtrsm_nn_ll_one_12x4_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, &alpha, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda);
3608 }
3609 if(j<n)
3610 {
3611 kernel_dtrsm_nn_ll_one_12x4_vs_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, &alpha, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda, m-i, n-j);
3612 }
3613 }
3614 if(i<m)
3615 {
3616 if(m-i<=4)
3617 {
3618 goto left_4;
3619 }
3620 if(m-i<=8)
3621 {
3622 goto left_8;
3623 }
3624 else
3625 {
3626 goto left_12;
3627 }
3628 }
3629 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57)
3630 for( ; i<m-7; i+=8)
3631 {
3632 j = 0;
3633 for( ; j<n-3; j+=4)
3634 {
3635 kernel_dtrsm_nn_ll_one_8x4_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, &alpha, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda);
3636 }
3637 if(j<n)
3638 {
3639 kernel_dtrsm_nn_ll_one_8x4_vs_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, &alpha, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda, m-i, n-j);
3640 }
3641 }
3642 if(i<m)
3643 {
3644 if(m-i<=4)
3645 {
3646 goto left_4;
3647 }
3648 else
3649 {
3650 goto left_8;
3651 }
3652 }
3653 #else
3654 for( ; i<m-3; i+=4)
3655 {
3656 j = 0;
3657 for( ; j<n-3; j+=4)
3658 {
3659 kernel_dtrsm_nn_ll_one_4x4_lib4(i, pA+i*sda, pD+j*ps, sdd, &alpha, pB+i*sdb+j*ps, pD+i*sdd+j*ps, pA+i*sda+i*ps);
3660 }
3661 if(j<n)
3662 {
3663 kernel_dtrsm_nn_ll_one_4x4_vs_lib4(i, pA+i*sda, pD+j*ps, sdd, &alpha, pB+i*sdb+j*ps, pD+i*sdd+j*ps, pA+i*sda+i*ps, m-i, n-j);
3664 }
3665 }
3666 if(i<m)
3667 {
3668 goto left_4;
3669 }
3670 #endif
3671 // common return
3672 return;
3673
3674 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
3675 left_12:
3676 j = 0;
3677 for( ; j<n; j+=4)
3678 {
3679 kernel_dtrsm_nn_ll_one_12x4_vs_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, &alpha, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda, m-i, n-j);
3680 }
3681 return;
3682 #endif
3683
3684 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A57) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
3685 left_8:
3686 j = 0;
3687 for( ; j<n; j+=4)
3688 {
3689 kernel_dtrsm_nn_ll_one_8x4_vs_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, &alpha, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda, m-i, n-j);
3690 }
3691 return;
3692 #endif
3693
3694 left_4:
3695 j = 0;
3696 for( ; j<n; j+=4)
3697 {
3698 kernel_dtrsm_nn_ll_one_4x4_vs_lib4(i, pA+i*sda, pD+j*ps, sdd, &alpha, pB+i*sdb+j*ps, pD+i*sdd+j*ps, pA+i*sda+i*ps, m-i, n-j);
3699 }
3700 return;
3701
3702 }
3703
3704
3705
3706 // dtrsm_lltn
3707 void blasfeo_dtrsm_lltn(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, struct blasfeo_dmat *sD, int di, int dj)
3708 {
3709 #ifndef BENCHMARKS_MODE
3710 printf("\nblasfeo_dtrsm_lltn: feature not implemented yet\n");
3711 exit(1);
3712 #endif
3713 return;
3714 }
3715
3716
3717
3718 // dtrsm_lltu
3719 void blasfeo_dtrsm_lltu(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, struct blasfeo_dmat *sD, int di, int dj)
3720 {
3721 #ifndef BENCHMARKS_MODE
3722 printf("\nblasfeo_dtrsm_lltu: feature not implemented yet\n");
3723 exit(1);
3724 #endif
3725 return;
3726 }
3727
3728
3729
3730 // dtrsm_lunn
3731 void blasfeo_dtrsm_lunn(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, struct blasfeo_dmat *sD, int di, int dj)
3732 {
3733
3734 if(m<=0 || n<=0)
3735 return;
3736
3737 if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
3738 {
3739 printf("\nblasfeo_dtrsm_lunn: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
3740 exit(1);
3741 }
3742
3743 // invalidate stored inverse diagonal of result matrix
3744 sD->use_dA = 0;
3745
3746 const int ps = 4;
3747 // TODO alpha
3748 int sda = sA->cn;
3749 int sdb = sB->cn;
3750 int sdd = sD->cn;
3751 double *pA = sA->pA + aj*ps;
3752 double *pB = sB->pA + bj*ps;
3753 double *pD = sD->pA + dj*ps;
3754 double *dA = sA->dA;
3755 int ii;
3756
3757 int i, j, idx;
3758 // double *dummy;
3759
3760 if(ai==0 & aj==0)
3761 {
3762 // recompute diagonal if size of operation grows
3763 if(sA->use_dA<m)
3764 {
3765 ddiaex_lib(m, 1.0, ai, pA, sda, dA);
3766 for(ii=0; ii<m; ii++)
3767 dA[ii] = 1.0 / dA[ii];
3768 sA->use_dA = m;
3769 }
3770 }
3771 // if submatrix recompute diagonal
3772 else
3773 {
3774 ddiaex_lib(m, 1.0, ai, pA, sda, dA);
3775 for(ii=0; ii<m; ii++)
3776 dA[ii] = 1.0 / dA[ii];
3777 sA->use_dA = 0;
3778 }
3779
3780 i = 0;
3781 int rm = m%4;
3782 if(rm>0)
3783 {
3784		// TODO code explicitly the final case
3785 idx = m-rm; // position of the part to do
3786 j = 0;
3787 for( ; j<n; j+=4)
3788 {
3789 // kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(0, dummy, dummy, 0, pB+idx*sdb+j*ps, pD+idx*sdd+j*ps, pA+idx*sda+idx*ps, dA+idx, rm, n-j);
3790 // XXX pA & pD are dummy and should not be used internally !!!
3791 kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(0, pA, pD, sdd, pB+idx*sdb+j*ps, pD+idx*sdd+j*ps, pA+idx*sda+idx*ps, dA+idx, rm, n-j);
3792 }
3793 // TODO
3794 i += rm;
3795 }
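	// the upper-triangular solve proceeds bottom-up: the m%4 remainder rows
	// are solved first above, and each following block of rows uses the
	// already solved rows below it (the first kernel argument is their count)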
3796 // int em = m-rm;
3797 #if defined(TARGET_X64_INTEL_HASWELL)
3798 for( ; i<m-8; i+=12)
3799 {
3800 idx = m-i; // position of already done part
3801 j = 0;
3802 for( ; j<n-3; j+=4)
3803 {
3804 kernel_dtrsm_nn_lu_inv_12x4_lib4(i, pA+(idx-12)*sda+idx*ps, sda, pD+idx*sdd+j*ps, sdd, pB+(idx-12)*sdb+j*ps, sdb, pD+(idx-12)*sdd+j*ps, sdd, pA+(idx-12)*sda+(idx-12)*ps, sda, dA+(idx-12));
3805 }
3806 if(j<n)
3807 {
3808 kernel_dtrsm_nn_lu_inv_12x4_vs_lib4(i, pA+(idx-12)*sda+idx*ps, sda, pD+idx*sdd+j*ps, sdd, pB+(idx-12)*sdb+j*ps, sdb, pD+(idx-12)*sdd+j*ps, sdd, pA+(idx-12)*sda+(idx-12)*ps, sda, dA+(idx-12), 12, n-j);
3809 // kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(i, pA+(idx-4)*sda+idx*ps, pD+idx*sdd+j*ps, sdd, pB+(idx-4)*sdb+j*ps, pD+(idx-4)*sdd+j*ps, pA+(idx-4)*sda+(idx-4)*ps, dA+(idx-4), 4, n-j);
3810 // kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(i+4, pA+(idx-8)*sda+(idx-4)*ps, pD+(idx-4)*sdd+j*ps, sdd, pB+(idx-8)*sdb+j*ps, pD+(idx-8)*sdd+j*ps, pA+(idx-8)*sda+(idx-8)*ps, dA+(idx-8), 4, n-j);
3811 // kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(i+8, pA+(idx-12)*sda+(idx-8)*ps, pD+(idx-8)*sdd+j*ps, sdd, pB+(idx-12)*sdb+j*ps, pD+(idx-12)*sdd+j*ps, pA+(idx-12)*sda+(idx-12)*ps, dA+(idx-12), 4, n-j);
3812 }
3813 }
3814 #endif
3815 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
3816 for( ; i<m-4; i+=8)
3817 {
3818 idx = m-i; // position of already done part
3819 j = 0;
3820 for( ; j<n-3; j+=4)
3821 {
3822 kernel_dtrsm_nn_lu_inv_8x4_lib4(i, pA+(idx-8)*sda+idx*ps, sda, pD+idx*sdd+j*ps, sdd, pB+(idx-8)*sdb+j*ps, sdb, pD+(idx-8)*sdd+j*ps, sdd, pA+(idx-8)*sda+(idx-8)*ps, sda, dA+(idx-8));
3823 }
3824 if(j<n)
3825 {
3826 kernel_dtrsm_nn_lu_inv_8x4_vs_lib4(i, pA+(idx-8)*sda+idx*ps, sda, pD+idx*sdd+j*ps, sdd, pB+(idx-8)*sdb+j*ps, sdb, pD+(idx-8)*sdd+j*ps, sdd, pA+(idx-8)*sda+(idx-8)*ps, sda, dA+(idx-8), 8, n-j);
3827 // kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(i, pA+(idx-4)*sda+idx*ps, pD+idx*sdd+j*ps, sdd, pB+(idx-4)*sdb+j*ps, pD+(idx-4)*sdd+j*ps, pA+(idx-4)*sda+(idx-4)*ps, dA+(idx-4), 4, n-j);
3828 // kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(i+4, pA+(idx-8)*sda+(idx-4)*ps, pD+(idx-4)*sdd+j*ps, sdd, pB+(idx-8)*sdb+j*ps, pD+(idx-8)*sdd+j*ps, pA+(idx-8)*sda+(idx-8)*ps, dA+(idx-8), 4, n-j);
3829 }
3830 }
3831 #endif
3832 for( ; i<m; i+=4)
3833 {
3834 idx = m-i; // position of already done part
3835 j = 0;
3836 for( ; j<n-3; j+=4)
3837 {
3838 kernel_dtrsm_nn_lu_inv_4x4_lib4(i, pA+(idx-4)*sda+idx*ps, pD+idx*sdd+j*ps, sdd, pB+(idx-4)*sdb+j*ps, pD+(idx-4)*sdd+j*ps, pA+(idx-4)*sda+(idx-4)*ps, dA+(idx-4));
3839 }
3840 if(j<n)
3841 {
3842 kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(i, pA+(idx-4)*sda+idx*ps, pD+idx*sdd+j*ps, sdd, pB+(idx-4)*sdb+j*ps, pD+(idx-4)*sdd+j*ps, pA+(idx-4)*sda+(idx-4)*ps, dA+(idx-4), 4, n-j);
3843 }
3844 }
3845
3846 // common return
3847 return;
3848
3849 }
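// usage sketch (a minimal example with hypothetical variable names, assuming sA, sB,
// sD were created with blasfeo_allocate_dmat and filled with blasfeo_pack_dmat):
// solve A * D = B with A m-by-m upper triangular and non-unit diagonal, B and D
// m-by-n; note that only alpha==1.0 and zero offsets are supported above:
// blasfeo_dtrsm_lunn(m, n, 1.0, &sA, 0, 0, &sB, 0, 0, &sD, 0, 0);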
3850
3851
3852
3853 // dtrsm_lunu
3854 void blasfeo_dtrsm_lunu(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, struct blasfeo_dmat *sD, int di, int dj)
3855 {
3856
3857 if(m<=0 || n<=0)
3858 return;
3859
3860 if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
3861 {
3862 printf("\nblasfeo_dtrsm_lunu: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
3863 exit(1);
3864 }
3865
3866 // invalidate stored inverse diagonal of result matrix
3867 sD->use_dA = 0;
3868
3869 const int ps = 4;
3870 // TODO alpha
3871 int sda = sA->cn;
3872 int sdb = sB->cn;
3873 int sdd = sD->cn;
3874 double *pA = sA->pA + aj*ps;
3875 double *pB = sB->pA + bj*ps;
3876 double *pD = sD->pA + dj*ps;
3877 int ii;
3878
3879 int i, j, idx;
3880 // double *dummy;
3881
3882 i = 0;
3883 int rm = m%4;
3884 if(rm>0)
3885 {
3886 // TODO code explicitly the final case
3887 idx = m-rm; // position of the part to do
3888 j = 0;
3889 for( ; j<n; j+=4)
3890 {
3891 // kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(0, dummy, dummy, 0, pB+idx*sdb+j*ps, pD+idx*sdd+j*ps, pA+idx*sda+idx*ps, dA+idx, rm, n-j);
3892 // XXX pA & pD are dummy and should not be used internally !!!
3893 kernel_dtrsm_nn_lu_one_4x4_vs_lib4(0, pA, pD, sdd, pB+idx*sdb+j*ps, pD+idx*sdd+j*ps, pA+idx*sda+idx*ps, rm, n-j);
3894 }
3895 // TODO
3896 i += rm;
3897 }
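// unit-diagonal variant: the diagonal of A is implicitly 1.0 and never read, so no
// inverse diagonal is extracted and the _one_ kernels are used instead of _inv_ ones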
3898 // int em = m-rm;
3899 #if 0//defined(TARGET_X64_INTEL_HASWELL)
3900 for( ; i<m-8; i+=12)
3901 {
3902 idx = m-i; // position of already done part
3903 j = 0;
3904 for( ; j<n-3; j+=4)
3905 {
3906 kernel_dtrsm_nn_lu_one_12x4_lib4(i, pA+(idx-12)*sda+idx*ps, sda, pD+idx*sdd+j*ps, sdd, pB+(idx-12)*sdb+j*ps, sdb, pD+(idx-12)*sdd+j*ps, sdd, pA+(idx-12)*sda+(idx-12)*ps, sda);
3907 }
3908 if(j<n)
3909 {
3910 kernel_dtrsm_nn_lu_one_12x4_vs_lib4(i, pA+(idx-12)*sda+idx*ps, sda, pD+idx*sdd+j*ps, sdd, pB+(idx-12)*sdb+j*ps, sdb, pD+(idx-12)*sdd+j*ps, sdd, pA+(idx-12)*sda+(idx-12)*ps, sda, 12, n-j);
3911 // kernel_dtrsm_nn_lu_one_4x4_vs_lib4(i, pA+(idx-4)*sda+idx*ps, pD+idx*sdd+j*ps, sdd, pB+(idx-4)*sdb+j*ps, pD+(idx-4)*sdd+j*ps, pA+(idx-4)*sda+(idx-4)*ps, 4, n-j);
3912 // kernel_dtrsm_nn_lu_one_4x4_vs_lib4(i+4, pA+(idx-8)*sda+(idx-4)*ps, pD+(idx-4)*sdd+j*ps, sdd, pB+(idx-8)*sdb+j*ps, pD+(idx-8)*sdd+j*ps, pA+(idx-8)*sda+(idx-8)*ps, 4, n-j);
3913 // kernel_dtrsm_nn_lu_one_4x4_vs_lib4(i+8, pA+(idx-12)*sda+(idx-8)*ps, pD+(idx-8)*sdd+j*ps, sdd, pB+(idx-12)*sdb+j*ps, pD+(idx-12)*sdd+j*ps, pA+(idx-12)*sda+(idx-12)*ps, 4, n-j);
3914 }
3915 }
3916 #endif
3917 #if 0//defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
3918 for( ; i<m-4; i+=8)
3919 {
3920 idx = m-i; // position of already done part
3921 j = 0;
3922 for( ; j<n-3; j+=4)
3923 {
3924 kernel_dtrsm_nn_lu_one_8x4_lib4(i, pA+(idx-8)*sda+idx*ps, sda, pD+idx*sdd+j*ps, sdd, pB+(idx-8)*sdb+j*ps, sdb, pD+(idx-8)*sdd+j*ps, sdd, pA+(idx-8)*sda+(idx-8)*ps, sda);
3925 }
3926 if(j<n)
3927 {
3928 kernel_dtrsm_nn_lu_one_8x4_vs_lib4(i, pA+(idx-8)*sda+idx*ps, sda, pD+idx*sdd+j*ps, sdd, pB+(idx-8)*sdb+j*ps, sdb, pD+(idx-8)*sdd+j*ps, sdd, pA+(idx-8)*sda+(idx-8)*ps, sda, 8, n-j);
3929 // kernel_dtrsm_nn_lu_one_4x4_vs_lib4(i, pA+(idx-4)*sda+idx*ps, pD+idx*sdd+j*ps, sdd, pB+(idx-4)*sdb+j*ps, pD+(idx-4)*sdd+j*ps, pA+(idx-4)*sda+(idx-4)*ps, 4, n-j);
3930 // kernel_dtrsm_nn_lu_one_4x4_vs_lib4(i+4, pA+(idx-8)*sda+(idx-4)*ps, pD+(idx-4)*sdd+j*ps, sdd, pB+(idx-8)*sdb+j*ps, pD+(idx-8)*sdd+j*ps, pA+(idx-8)*sda+(idx-8)*ps, 4, n-j);
3931 }
3932 }
3933 #endif
3934 for( ; i<m; i+=4)
3935 {
3936 idx = m-i; // position of already done part
3937 j = 0;
3938 for( ; j<n-3; j+=4)
3939 {
3940 kernel_dtrsm_nn_lu_one_4x4_lib4(i, pA+(idx-4)*sda+idx*ps, pD+idx*sdd+j*ps, sdd, pB+(idx-4)*sdb+j*ps, pD+(idx-4)*sdd+j*ps, pA+(idx-4)*sda+(idx-4)*ps);
3941 }
3942 if(j<n)
3943 {
3944 kernel_dtrsm_nn_lu_one_4x4_vs_lib4(i, pA+(idx-4)*sda+idx*ps, pD+idx*sdd+j*ps, sdd, pB+(idx-4)*sdb+j*ps, pD+(idx-4)*sdd+j*ps, pA+(idx-4)*sda+(idx-4)*ps, 4, n-j);
3945 }
3946 }
3947
3948 // common return
3949 return;
3950
3951 }
3952
3953
3954
3955 // dtrsm_lutn
3956 void blasfeo_dtrsm_lutn(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, struct blasfeo_dmat *sD, int di, int dj)
3957 {
3958 #ifndef BENCHMARKS_MODE
3959 printf("\nblasfeo_dtrsm_lutn: feature not implemented yet\n");
3960 exit(1);
3961 #endif
3962 return;
3963 }
3964
3965
3966
3967 // dtrsm_lutu
3968 void blasfeo_dtrsm_lutu(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, struct blasfeo_dmat *sD, int di, int dj)
3969 {
3970 #ifndef BENCHMARKS_MODE
3971 printf("\nblasfeo_dtrsm_lutu: feature not implemented yet\n");
3972 exit(1);
3973 #endif
3974 return;
3975 }
3976
3977
3978
3979 // dtrsm_rlnn
3980 void blasfeo_dtrsm_rlnn(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, struct blasfeo_dmat *sD, int di, int dj)
3981 {
3982 #ifndef BENCHMARKS_MODE
3983 printf("\nblasfeo_dtrsm_rlnn: feature not implemented yet\n");
3984 exit(1);
3985 #endif
3986 return;
3987 }
3988
3989
3990
3991 // dtrsm_rlnu
3992 void blasfeo_dtrsm_rlnu(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, struct blasfeo_dmat *sD, int di, int dj)
3993 {
3994 #ifndef BENCHMARKS_MODE
3995 printf("\nblasfeo_dtrsm_rlnu: feature not implemented yet\n");
3996 exit(1);
3997 #endif
3998 return;
3999 }
4000
4001
4002
4003 // dtrsm_right_lower_transposed_notunit
4004 void blasfeo_dtrsm_rltn(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, struct blasfeo_dmat *sD, int di, int dj)
4005 {
4006
4007 if(m<=0 || n<=0)
4008 return;
4009
4010 const int ps = 4;
4011
4012 // invalidate stored inverse diagonal of result matrix
4013 sD->use_dA = 0;
4014
4015 // TODO alpha !!!!!
4016
4017 int sda = sA->cn;
4018 int sdb = sB->cn;
4019 int sdd = sD->cn;
4020 int bir = bi & (ps-1);
4021 int dir = di & (ps-1);
4022 double *pA = sA->pA + aj*ps;
4023 double *pB = sB->pA + bj*ps + (bi-bir)*sdb;
4024 double *pD = sD->pA + dj*ps + (di-dir)*sdd;
4025 double *dA = sA->dA;
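// ps is a power of two, so bi & (ps-1) == bi % ps is the row offset of B within its
// 4-row panel, and (bi-bir)*sdb points pB at the enclosing panel boundary (same for D)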
4026
4027 if(ai!=0 | bir!=0 | dir!=0 | alpha!=1.0)
4028 {
4029 printf("\nblasfeo_dtrsm_rltn: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
4030 exit(1);
4031 }
4032
4033 int i, j;
4034
4035 // TODO to avoid touching A, better to temporarily use sD->dA?
4036 if(ai==0 & aj==0)
4037 {
4038 if(sA->use_dA<n)
4039 {
4040 ddiaex_lib(n, 1.0, ai, pA, sda, dA);
4041 for(i=0; i<n; i++)
4042 dA[i] = 1.0 / dA[i];
4043 sA->use_dA = n;
4044 }
4045 }
4046 else
4047 {
4048 ddiaex_lib(n, 1.0, ai, pA, sda, dA);
4049 for(i=0; i<n; i++)
4050 dA[i] = 1.0 / dA[i];
4051 sA->use_dA = 0;
4052 }
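// with A lower triangular and used transposed, D = B * A^{-T} is a forward
// substitution over block-columns: for each row panel of B, the kernel at column j
// first subtracts the contribution D[:,0:j] * A[j:j+4,0:j]^T of the already-solved
// columns, then solves against the 4x4 diagonal block using the cached inverse diagonal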
4053
4054 i = 0;
4055 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
4056 for(; i<m-11; i+=12)
4057 {
4058 j = 0;
4059 for(; j<n-3; j+=4)
4060 {
4061 kernel_dtrsm_nt_rl_inv_12x4_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &alpha, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j]);
4062 }
4063 if(j<n)
4064 {
4065 kernel_dtrsm_nt_rl_inv_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &alpha, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j], m-i, n-j);
4066 }
4067 }
4068 if(m>i)
4069 {
4070 if(m-i<=4)
4071 {
4072 goto left_4;
4073 }
4074 else if(m-i<=8)
4075 {
4076 goto left_8;
4077 }
4078 else
4079 {
4080 goto left_12;
4081 }
4082 }
4083 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57)
4084 for(; i<m-7; i+=8)
4085 {
4086 j = 0;
4087 for(; j<n-3; j+=4)
4088 {
4089 kernel_dtrsm_nt_rl_inv_8x4_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &alpha, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j]);
4090 }
4091 if(j<n)
4092 {
4093 kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &alpha, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j], m-i, n-j);
4094 }
4095 }
4096 if(m>i)
4097 {
4098 if(m-i<=4)
4099 {
4100 goto left_4;
4101 }
4102 else
4103 {
4104 goto left_8;
4105 }
4106 }
4107 #elif defined(TARGET_X86_AMD_BARCELONA)
4108 for(; i<m-3; i+=4)
4109 {
4110 j = 0;
4111 for(; j<n-3; j+=4)
4112 {
4113 kernel_dtrsm_nt_rl_inv_4x2_lib4(j, &pD[i*sdd], &pA[j*sda], &alpha, &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], &dA[j]);
4114 kernel_dtrsm_nt_rl_inv_4x2_lib4(j+2, &pD[i*sdd], &pA[j*sda+2], &alpha, &pB[(j+2)*ps+i*sdb], &pD[(j+2)*ps+i*sdd], &pA[(j+2)*ps+j*sda+2], &dA[j+2]);
4115 }
4116 if(j<n)
4117 {
4118 kernel_dtrsm_nt_rl_inv_4x2_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &alpha, &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], &dA[j], m-i, n-j);
4119 if(j<n-2)
4120 kernel_dtrsm_nt_rl_inv_4x2_vs_lib4(j+2, &pD[i*sdd], &pA[j*sda+2], &alpha, &pB[(j+2)*ps+i*sdb], &pD[(j+2)*ps+i*sdd], &pA[(j+2)*ps+j*sda+2], &dA[j+2], m-i, n-(j+2));
4121 }
4122 }
4123 if(m>i)
4124 {
4125 goto left_4;
4126 }
4127 #else
4128 for(; i<m-3; i+=4)
4129 {
4130 j = 0;
4131 for(; j<n-3; j+=4)
4132 {
4133 kernel_dtrsm_nt_rl_inv_4x4_lib4(j, &pD[i*sdd], &pA[j*sda], &alpha, &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], &dA[j]);
4134 }
4135 if(j<n)
4136 {
4137 kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &alpha, &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], &dA[j], m-i, n-j);
4138 }
4139 }
4140 if(m>i)
4141 {
4142 goto left_4;
4143 }
4144 #endif
4145
4146 // common return if i==m
4147 return;
4148
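// cleanup labels: left_12 / left_8 / left_4 handle the last row panel when m is not a
// multiple of the panel height, using the _vs_ (variable-size) kernels that take the
// actual residual sizes m-i and n-j as extra arguments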
4149 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
4150 left_12:
4151 j = 0;
4152 for(; j<n; j+=4)
4153 {
4154 kernel_dtrsm_nt_rl_inv_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &alpha, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j], m-i, n-j);
4155 }
4156 return;
4157 #endif
4158
4159 #if defined(TARGET_X64_INTEL_HASWELL)
4160 left_8:
4161 j = 0;
4162 for(; j<n-8; j+=12)
4163 {
4164 kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], sda, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], sda, &dA[j], m-i, n-j);
4165 kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4((j+4), &pD[i*sdd], sdd, &pA[(j+4)*sda], sda, &pB[(j+4)*ps+i*sdb], sdb, &pD[(j+4)*ps+i*sdd], sdd, &pA[(j+4)*ps+(j+4)*sda], sda, &dA[(j+4)], m-i, n-(j+4));
4166 }
4167 if(j<n-4)
4168 {
4169 kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], sda, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], sda, &dA[j], m-i, n-j);
4170 kernel_dtrsm_nt_rl_inv_4x4_vs_lib4((j+4), &pD[i*sdd], &pA[(j+4)*sda], &alpha, &pB[(j+4)*ps+i*sdb], &pD[(j+4)*ps+i*sdd], &pA[(j+4)*ps+(j+4)*sda], &dA[(j+4)], m-i, n-(j+4));
4171 j += 8;
4172 }
4173 else if(j<n)
4174 {
4175 kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &alpha, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j], m-i, n-j);
4176 j += 4;
4177 }
4178 return;
4179 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
4180 left_8:
4181 j = 0;
4182 for(; j<n; j+=4)
4183 {
4184 kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &alpha, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j], m-i, n-j);
4185 }
4186 return;
4187 #endif
4188
4189 #if defined(TARGET_X64_INTEL_HASWELL)
4190 left_4:
4191 j = 0;
4192 for(; j<n-8; j+=12)
4193 {
4194 kernel_dtrsm_nt_rl_inv_4x12_vs_lib4(j, &pD[i*sdd], &pA[j*sda], sda, &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], sda, &dA[j], m-i, n-j);
4195 }
4196 if(j<n-4)
4197 {
4198 kernel_dtrsm_nt_rl_inv_4x8_vs_lib4(j, &pD[i*sdd], &pA[j*sda], sda, &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], sda, &dA[j], m-i, n-j);
4199 j += 8;
4200 }
4201 else if(j<n)
4202 {
4203 kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &alpha, &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], &dA[j], m-i, n-j);
4204 j += 4;
4205 }
4206 return;
4207 #elif defined(TARGET_X86_AMD_BARCELONA)
4208 left_4:
4209 j = 0;
4210 for(; j<n; j+=4)
4211 {
4212 kernel_dtrsm_nt_rl_inv_4x2_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &alpha, &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], &dA[j], m-i, n-j);
4213 if(j<n-2)
4214 {
4215 kernel_dtrsm_nt_rl_inv_4x2_vs_lib4(j+2, &pD[i*sdd], &pA[j*sda+2], &alpha, &pB[(j+2)*ps+i*sdb], &pD[(j+2)*ps+i*sdd], &pA[(j+2)*ps+j*sda+2], &dA[j+2], m-i, n-(j+2));
4216 }
4217 }
4218 return;
4219 #else
4220 left_4:
4221 j = 0;
4222 for(; j<n; j+=4)
4223 {
4224 kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &alpha, &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], &dA[j], m-i, n-j);
4225 }
4226 return;
4227 #endif
4228 }
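// usage sketch (a minimal example with hypothetical variable names): with sA holding
// an n-by-n lower triangular factor L, compute D = B * L^{-T}, i.e. solve D * L^T = B,
// as it appears e.g. in Cholesky-based linear solvers and Riccati recursions:
// blasfeo_dtrsm_rltn(m, n, 1.0, &sA, 0, 0, &sB, 0, 0, &sD, 0, 0);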
4229
4230
4231
4232 // dtrsm_right_lower_transposed_unit
4233 void blasfeo_dtrsm_rltu(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, struct blasfeo_dmat *sD, int di, int dj)
4234 {
4235
4236 if(m<=0 || n<=0)
4237 return;
4238
4239 if(ai!=0 | bi!=0 | di!=0)
4240 {
4241 printf("\nblasfeo_dtrsm_rltu: feature not implemented yet: ai=%d, bi=%d, di=%d\n", ai, bi, di);
4242 exit(1);
4243 }
4244
4245 // invalidate stored inverse diagonal of result matrix
4246 sD->use_dA = 0;
4247
4248 const int ps = 4;
4249
4250 int sda = sA->cn;
4251 int sdb = sB->cn;
4252 int sdd = sD->cn;
4253 double *pA = sA->pA + aj*ps;
4254 double *pB = sB->pA + bj*ps;
4255 double *pD = sD->pA + dj*ps;
4256
4257 int i, j;
4258
4259 i = 0;
4260
4261 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
4262 for(; i<m-11; i+=12)
4263 {
4264 j = 0;
4265 for(; j<n-3; j+=4)
4266 {
4267 kernel_dtrsm_nt_rl_one_12x4_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &alpha, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda]);
4268 }
4269 if(j<n)
4270 {
4271 kernel_dtrsm_nt_rl_one_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &alpha, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], m-i, n-j);
4272 }
4273 }
4274 if(m>i)
4275 {
4276 if(m-i<=4)
4277 {
4278 goto left_4;
4279 }
4280 else if(m-i<=8)
4281 {
4282 goto left_8;
4283 }
4284 else
4285 {
4286 goto left_12;
4287 }
4288 }
4289 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57)
4290 for(; i<m-7; i+=8)
4291 {
4292 j = 0;
4293 for(; j<n-3; j+=4)
4294 {
4295 kernel_dtrsm_nt_rl_one_8x4_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &alpha, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda]);
4296 }
4297 if(j<n)
4298 {
4299 kernel_dtrsm_nt_rl_one_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &alpha, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], m-i, n-j);
4300 }
4301 }
4302 if(m>i)
4303 {
4304 if(m-i<=4)
4305 {
4306 goto left_4;
4307 }
4308 else
4309 {
4310 goto left_8;
4311 }
4312 }
4313 #else
4314 for(; i<m-3; i+=4)
4315 {
4316 j = 0;
4317 for(; j<n-3; j+=4)
4318 {
4319 kernel_dtrsm_nt_rl_one_4x4_lib4(j, &pD[i*sdd], &pA[j*sda], &alpha, &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda]);
4320 }
4321 if(j<n)
4322 {
4323 kernel_dtrsm_nt_rl_one_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &alpha, &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], m-i, n-j);
4324 }
4325 }
4326 if(m>i)
4327 {
4328 goto left_4;
4329 }
4330 #endif
4331
4332 // common return if i==m
4333 return;
4334
4335 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
4336 left_12:
4337 j = 0;
4338 for(; j<n; j+=4)
4339 {
4340 kernel_dtrsm_nt_rl_one_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &alpha, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], m-i, n-j);
4341 }
4342 return;
4343 #endif
4344
4345 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A57) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
4346 left_8:
4347 j = 0;
4348 for(; j<n; j+=4)
4349 {
4350 kernel_dtrsm_nt_rl_one_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &alpha, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], m-i, n-j);
4351 }
4352 return;
4353 #endif
4354
4355 left_4:
4356 j = 0;
4357 for(; j<n; j+=4)
4358 {
4359 kernel_dtrsm_nt_rl_one_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &alpha, &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], m-i, n-j);
4360 }
4361
4362 return;
4363
4364 }
4365
4366
4367
4368 // dtrsm_runn
4369 void blasfeo_dtrsm_runn(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, struct blasfeo_dmat *sD, int di, int dj)
4370 {
4371 #ifndef BENCHMARKS_MODE
4372 printf("\nblasfeo_dtrsm_runn: feature not implemented yet\n");
4373 exit(1);
4374 #endif
4375 return;
4376 }
4377
4378
4379
4380 // dtrsm_runu
4381 void blasfeo_dtrsm_runu(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, struct blasfeo_dmat *sD, int di, int dj)
4382 {
4383 #ifndef BENCHMARKS_MODE
4384 printf("\nblasfeo_dtrsm_runu: feature not implemented yet\n");
4385 exit(1);
4386 #endif
4387 return;
4388 }
4389
4390
4391
4392 // dtrsm_right_upper_transposed_notunit
4393 void blasfeo_dtrsm_rutn(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, struct blasfeo_dmat *sD, int di, int dj)
4394 {
4395 if(ai!=0 | bi!=0 | di!=0)
4396 {
4397 printf("\nblasfeo_dtrsm_rutn: feature not implemented yet: ai=%d, bi=%d, di=%d\n", ai, bi, di);
4398 exit(1);
4399 }
4400
4401 // invalidate stored inverse diagonal of result matrix
4402 sD->use_dA = 0;
4403
4404 const int ps = 4;
4405
4406 int sda = sA->cn;
4407 int sdb = sB->cn;
4408 int sdd = sD->cn;
4409 double *pA = sA->pA + aj*ps;
4410 double *pB = sB->pA + bj*ps;
4411 double *pD = sD->pA + dj*ps;
4412 double *dA = sA->dA;
4413
4414 int ii;
4415
4416 if(ai==0 & aj==0)
4417 {
4418 if(sA->use_dA<n)
4419 {
4420 ddiaex_lib(n, 1.0, ai, pA, sda, dA);
4421 for(ii=0; ii<n; ii++)
4422 dA[ii] = 1.0 / dA[ii];
4423 sA->use_dA = n;
4424 }
4425 }
4426 else
4427 {
4428 ddiaex_lib(n, 1.0, ai, pA, sda, dA);
4429 for(ii=0; ii<n; ii++)
4430 dA[ii] = 1.0 / dA[ii];
4431 sA->use_dA = 0;
4432 }
4433 // dtrsm_nt_ru_inv_lib(m, n, pA, sda, dA, pB, sdb, pD, sdd);
4434
4435 if(m<=0 || n<=0)
4436 return;
4437
4438 int i, j, idx;
4439
4440 int rn = n%4;
4441
4442 double *dummy = NULL;
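// A is upper triangular and used transposed, so the column sweep runs right to left
// (idx = n-j-4); the rn = n%4 remainder columns are solved first with update length 0,
// where dummy is passed for the panel arguments that are never dereferenced when k==0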
4443
4444 i = 0;
4445
4446 #if defined(TARGET_X64_INTEL_HASWELL)
4447 for(; i<m-11; i+=12)
4448 {
4449 j = 0;
4450 // clean at the end
4451 if(rn>0)
4452 {
4453 idx = n-rn;
4454 kernel_dtrsm_nt_ru_inv_12x4_vs_lib4(0, dummy, 0, dummy, &alpha, &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &dA[idx], m-i, rn);
4455 j += rn;
4456 }
4457 for(; j<n; j+=4)
4458 {
4459 idx = n-j-4;
4460 kernel_dtrsm_nt_ru_inv_12x4_lib4(j, &pD[i*sdd+(idx+4)*ps], sdd, &pA[idx*sda+(idx+4)*ps], &alpha, &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &dA[idx]);
4461 }
4462 }
4463 if(m>i)
4464 {
4465 if(m-i<=4)
4466 {
4467 goto left_4;
4468 }
4469 else if(m-i<=8)
4470 {
4471 goto left_8;
4472 }
4473 else
4474 {
4475 goto left_12;
4476 }
4477 }
4478 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
4479 for(; i<m-7; i+=8)
4480 {
4481 j = 0;
4482 // clean at the end
4483 if(rn>0)
4484 {
4485 idx = n-rn;
4486 kernel_dtrsm_nt_ru_inv_8x4_vs_lib4(0, dummy, 0, dummy, &alpha, &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &dA[idx], m-i, rn);
4487 j += rn;
4488 }
4489 for(; j<n; j+=4)
4490 {
4491 idx = n-j-4;
4492 kernel_dtrsm_nt_ru_inv_8x4_lib4(j, &pD[i*sdd+(idx+4)*ps], sdd, &pA[idx*sda+(idx+4)*ps], &alpha, &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &dA[idx]);
4493 }
4494 }
4495 if(m>i)
4496 {
4497 if(m-i<=4)
4498 {
4499 goto left_4;
4500 }
4501 else
4502 {
4503 goto left_8;
4504 }
4505 }
4506 #else
4507 for(; i<m-3; i+=4)
4508 {
4509 j = 0;
4510 // clean at the end
4511 if(rn>0)
4512 {
4513 idx = n-rn;
4514 kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(0, dummy, dummy, &alpha, &pB[i*sdb+idx*ps], &pD[i*sdd+idx*ps], &pA[idx*sda+idx*ps], &dA[idx], m-i, rn);
4515 j += rn;
4516 }
4517 for(; j<n; j+=4)
4518 {
4519 idx = n-j-4;
4520 kernel_dtrsm_nt_ru_inv_4x4_lib4(j, &pD[i*sdd+(idx+4)*ps], &pA[idx*sda+(idx+4)*ps], &alpha, &pB[i*sdb+idx*ps], &pD[i*sdd+idx*ps], &pA[idx*sda+idx*ps], &dA[idx]);
4521 }
4522 }
4523 if(m>i)
4524 {
4525 goto left_4;
4526 }
4527 #endif
4528 // common return if i==m
4529 return;
4530
4531 #if defined(TARGET_X64_INTEL_HASWELL)
4532 left_12:
4533 j = 0;
4534 // TODO
4535 // clean at the end
4536 if(rn>0)
4537 {
4538 idx = n-rn;
4539 kernel_dtrsm_nt_ru_inv_12x4_vs_lib4(0, dummy, 0, dummy, &alpha, &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &dA[idx], m-i, rn);
4540 j += rn;
4541 }
4542 for(; j<n; j+=4)
4543 {
4544 idx = n-j-4;
4545 kernel_dtrsm_nt_ru_inv_12x4_vs_lib4(j, &pD[i*sdd+(idx+4)*ps], sdd, &pA[idx*sda+(idx+4)*ps], &alpha, &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &dA[idx], m-i, 4);
4546 }
4547 return;
4548 #endif
4549
4550 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
4551 left_8:
4552 j = 0;
4553 // TODO
4554 // clean at the end
4555 if(rn>0)
4556 {
4557 idx = n-rn;
4558 kernel_dtrsm_nt_ru_inv_8x4_vs_lib4(0, dummy, 0, dummy, &alpha, &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &dA[idx], m-i, rn);
4559 j += rn;
4560 }
4561 for(; j<n; j+=4)
4562 {
4563 idx = n-j-4;
4564 kernel_dtrsm_nt_ru_inv_8x4_vs_lib4(j, &pD[i*sdd+(idx+4)*ps], sdd, &pA[idx*sda+(idx+4)*ps], &alpha, &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &dA[idx], m-i, 4);
4565 }
4566 return;
4567 #endif
4568
4569 left_4:
4570 j = 0;
4571 // TODO
4572 // clean at the end
4573 if(rn>0)
4574 {
4575 idx = n-rn;
4576 kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(0, dummy, dummy, &alpha, &pB[i*sdb+idx*ps], &pD[i*sdd+idx*ps], &pA[idx*sda+idx*ps], &dA[idx], m-i, rn);
4577 j += rn;
4578 }
4579 for(; j<n; j+=4)
4580 {
4581 idx = n-j-4;
4582 kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(j, &pD[i*sdd+(idx+4)*ps], &pA[idx*sda+(idx+4)*ps], &alpha, &pB[i*sdb+idx*ps], &pD[i*sdd+idx*ps], &pA[idx*sda+idx*ps], &dA[idx], m-i, 4);
4583 }
4584 return;
4585 }
4586
4587
4588
4589 // dtrsm_rutu
4590 void blasfeo_dtrsm_rutu(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, struct blasfeo_dmat *sD, int di, int dj)
4591 {
4592 #ifndef BENCHMARKS_MODE
4593 printf("\nblasfeo_dtrsm_rutu: feature not implemented yet\n");
4594 exit(1);
4595 #endif
4596 return;
4597 }
4598
4599
4600
4601 // dtrmm_right_upper_transposed_notunit (B, i.e. the first matrix, is triangular !!!)
4602 void blasfeo_dtrmm_rutn(int m, int n, double alpha, struct blasfeo_dmat *sB, int bi, int bj, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sD, int di, int dj)
4603 {
4604 // invalidate stored inverse diagonal of result matrix
4605 sD->use_dA = 0;
4606
4607 if(ai!=0 | bi!=0 | di!=0)
4608 {
4609 printf("\nblasfeo_dtrmm_rutn: feature not implemented yet: ai=%d, bi=%d, di=%d\n", ai, bi, di);
4610 exit(1);
4611 }
4612
4613 if(m<=0 || n<=0)
4614 return;
4615
4616 const int ps = 4;
4617
4618 int sda = sA->cn;
4619 int sdb = sB->cn;
4620 int sdd = sD->cn;
4621 double *pA = sA->pA + aj*ps;
4622 double *pB = sB->pA + bj*ps;
4623 double *pD = sD->pA + dj*ps;
4624
4625 int i, j;
4626
4627 i = 0;
4628 #if defined(TARGET_X64_INTEL_HASWELL)
4629 // XXX there is a bug here !!!!!!
4630 for(; i<m-11; i+=12)
4631 {
4632 j = 0;
4633 for(; j<n-3; j+=4)
4634 {
4635 kernel_dtrmm_nt_ru_12x4_lib4(n-j, &alpha, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], &pD[j*ps+i*sdd], sdd);
4636 }
4637 if(j<n)
4638 {
4639 kernel_dtrmm_nt_ru_12x4_vs_lib4(n-j, &alpha, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], &pD[j*ps+i*sdd], sdd, m-i, n-j);
4640 }
4641 }
4642 if(i<m)
4643 {
4644 if(m-i<5)
4645 {
4646 goto left_4;
4647 }
4648 else if(m-i<9)
4649 {
4650 goto left_8;
4651 }
4652 else
4653 {
4654 goto left_12;
4655 }
4656 }
4657
4658 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
4659 for(; i<m-7; i+=8)
4660 {
4661 j = 0;
4662 for(; j<n-3; j+=4)
4663 {
4664 kernel_dtrmm_nt_ru_8x4_lib4(n-j, &alpha, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], &pD[j*ps+i*sdd], sdd);
4665 }
4666 if(j<n)
4667 {
4668 kernel_dtrmm_nt_ru_8x4_vs_lib4(n-j, &alpha, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], &pD[j*ps+i*sdd], sdd, m-i, n-j);
4669 }
4670 }
4671 if(i<m)
4672 {
4673 if(m-i<5)
4674 {
4675 goto left_4;
4676 }
4677 else
4678 {
4679 goto left_8;
4680 }
4681 }
4682
4683 #else
4684 for(; i<m-3; i+=4)
4685 {
4686 j = 0;
4687 for(; j<n-3; j+=4)
4688 {
4689 kernel_dtrmm_nt_ru_4x4_lib4(n-j, &alpha, &pA[j*ps+i*sda], &pB[j*ps+j*sdb], &pD[j*ps+i*sdd]);
4690 }
4691 if(j<n)
4692 {
4693 kernel_dtrmm_nt_ru_4x4_vs_lib4(n-j, &alpha, &pA[j*ps+i*sda], &pB[j*ps+j*sdb], &pD[j*ps+i*sdd], m-i, n-j);
4694 }
4695 }
4696 if(i<m)
4697 {
4698 goto left_4;
4699 }
4700 #endif
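// note: the kernels receive n-j as the inner size: B is upper triangular and used
// transposed, so only columns j..n-1 of A and B contribute to output column block j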
4701
4702 // common return
4703 return;
4704
4705 #if defined(TARGET_X64_INTEL_HASWELL)
4706 // clean up
4707 left_12:
4708 j = 0;
4709 for(; j<n; j+=4)
4710 {
4711 kernel_dtrmm_nt_ru_12x4_vs_lib4(n-j, &alpha, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], &pD[j*ps+i*sdd], sdd, m-i, n-j);
4712 }
4713 return;
4714 #endif
4715
4716 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
4717 // clean up
4718 left_8:
4719 j = 0;
4720 for(; j<n; j+=4)
4721 {
4722 kernel_dtrmm_nt_ru_8x4_vs_lib4(n-j, &alpha, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], &pD[j*ps+i*sdd], sdd, m-i, n-j);
4723 }
4724 return;
4725 #endif
4726
4727 left_4:
4728 j = 0;
4729 for(; j<n; j+=4)
4730 {
4731 kernel_dtrmm_nt_ru_4x4_vs_lib4(n-j, &alpha, &pA[j*ps+i*sda], &pB[j*ps+j*sdb], &pD[j*ps+i*sdd], m-i, n-j);
4732 }
4733 return;
4734
4735 }
4736
4737
4738
4739 // dtrmm_right_lower_nottransposed_notunit (B, i.e. the first matrix, is triangular !!!)
4740 void blasfeo_dtrmm_rlnn(int m, int n, double alpha, struct blasfeo_dmat *sB, int bi, int bj, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sD, int di, int dj)
4741 {
4742
4743 const int ps = 4;
4744
4745 int sda = sA->cn;
4746 int sdb = sB->cn;
4747 int sdd = sD->cn;
4748 int air = ai & (ps-1);
4749 int bir = bi & (ps-1);
4750 double *pA = sA->pA + aj*ps + (ai-air)*sda;
4751 double *pB = sB->pA + bj*ps + (bi-bir)*sdb;
4752 double *pD = sD->pA + dj*ps;
4753
4754 int offsetB = bir;
4755
4756 int di0 = di-air;
4757 int offsetD;
4758
4759 // invalidate stored inverse diagonal of result matrix
4760 sD->use_dA = 0;
4761
4762 if(di0>=0)
4763 {
4764 pD += di0/ps*ps*sdd;
4765 offsetD = di0%ps;
4766 }
4767 else
4768 {
4769 pD += -4*sdd;
4770 offsetD = ps+di0;
4771 }
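// di0 = di-air can be negative when the output row offset di is smaller than the row
// offset of A within its panel; in that case pD is moved back by one full panel and
// offsetD wraps into [0,ps), so the _gen_ kernels always see a valid non-negative offset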
4772
4773 int ii, jj;
4774
4775 if(air!=0)
4776 {
4777 jj = 0;
4778 for(; jj<n; jj+=4)
4779 {
4780 kernel_dtrmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, offsetD, &pD[jj*ps], sdd, air, air+m, 0, n-jj);
4781 }
4782 m -= ps-air;
4783 pA += ps*sda;
4784 pD += ps*sdd;
4785 }
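// when A is not panel-aligned (air!=0), the top ps-air rows are computed first with
// the 4x4 _gen_ kernel, which supports arbitrary row offsets on input and output;
// m, pA and pD are then advanced so the main loops below run on panel-aligned data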
4786 ii = 0;
4787 if(offsetD==0)
4788 {
4789 #if defined(TARGET_X64_INTEL_HASWELL)
4790 for(; ii<m-11; ii+=12)
4791 {
4792 jj = 0;
4793 for(; jj<n-5; jj+=4)
4794 {
4795 kernel_dtrmm_nn_rl_12x4_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], sdd); // n-j>=6 !!!!!
4796 }
4797 for(; jj<n; jj+=4)
4798 {
4799 kernel_dtrmm_nn_rl_12x4_vs_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], sdd, 12, n-jj);
4800 }
4801 }
4802 if(ii<m)
4803 {
4804 if(ii<m-8)
4805 goto left_12;
4806 else if(ii<m-4)
4807 goto left_8;
4808 else
4809 goto left_4;
4810 }
4811 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
4812 for(; ii<m-7; ii+=8)
4813 {
4814 jj = 0;
4815 for(; jj<n-5; jj+=4)
4816 {
4817 kernel_dtrmm_nn_rl_8x4_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], sdd);
4818 }
4819 for(; jj<n; jj+=4)
4820 {
4821 kernel_dtrmm_nn_rl_8x4_vs_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], sdd, 8, n-jj);
4822 }
4823 }
4824 if(ii<m)
4825 {
4826 if(ii<m-4)
4827 goto left_8;
4828 else
4829 goto left_4;
4830 }
4831 #elif defined(TARGET_X86_AMD_BARCELONA)
4832 for(; ii<m-3; ii+=4)
4833 {
4834 jj = 0;
4835 for(; jj<n-3; jj+=4)
4836 {
4837 kernel_dtrmm_nn_rl_4x2_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps]);
4838 if(offsetB+2<4)
4839 kernel_dtrmm_nn_rl_4x2_lib4(n-(jj+2), &alpha, &pA[ii*sda+(jj+2)*ps], offsetB+2, &pB[jj*sdb+(jj+2)*ps], sdb, &pD[ii*sdd+(jj+2)*ps]);
4840 else
4841 kernel_dtrmm_nn_rl_4x2_lib4(n-(jj+2), &alpha, &pA[ii*sda+(jj+2)*ps], offsetB+2-ps, &pB[(jj+ps)*sdb+(jj+2)*ps], sdb, &pD[ii*sdd+(jj+2)*ps]);
4842 }
4843 for(; jj<n; jj+=4)
4844 {
4845 kernel_dtrmm_nn_rl_4x2_vs_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], 4, n-jj);
4846 if(jj<n-2)
4847 {
4848 if(offsetB+2<4)
4849 kernel_dtrmm_nn_rl_4x2_vs_lib4(n-(jj+2), &alpha, &pA[ii*sda+(jj+2)*ps], offsetB+2, &pB[jj*sdb+(jj+2)*ps], sdb, &pD[ii*sdd+(jj+2)*ps], 4, n-(jj+2));
4850 else
4851 kernel_dtrmm_nn_rl_4x2_vs_lib4(n-(jj+2), &alpha, &pA[ii*sda+(jj+2)*ps], offsetB+2-ps, &pB[(jj+ps)*sdb+(jj+2)*ps], sdb, &pD[ii*sdd+(jj+2)*ps], 4, n-(jj+2));
4852 }
4853 }
4854 }
4855 if(ii<m)
4856 {
4857 goto left_4;
4858 }
4859 #else
4860 for(; ii<m-3; ii+=4)
4861 {
4862 jj = 0;
4863 for(; jj<n-5; jj+=4)
4864 {
4865 kernel_dtrmm_nn_rl_4x4_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps]);
4866 }
4867 for(; jj<n; jj+=4)
4868 {
4869 kernel_dtrmm_nn_rl_4x4_vs_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], 4, n-jj);
4870 }
4871 }
4872 if(ii<m)
4873 {
4874 goto left_4;
4875 }
4876 #endif
4877 }
4878 else
4879 {
4880 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
4881 for(; ii<m-4; ii+=8)
4882 {
4883 jj = 0;
4884 for(; jj<n; jj+=4)
4885 {
4886 kernel_dtrmm_nn_rl_8x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, offsetD, &pD[ii*sdd+jj*ps], sdd, 0, m-ii, 0, n-jj);
4887 }
4888 }
4889 if(ii<m)
4890 {
4891 goto left_4_gen;
4892 }
4893 #else
4894 for(; ii<m; ii+=4)
4895 {
4896 jj = 0;
4897 for(; jj<n; jj+=4)
4898 {
4899 kernel_dtrmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, offsetD, &pD[ii*sdd+jj*ps], sdd, 0, m-ii, 0, n-jj);
4900 }
4901 }
4902 #endif
4903 }
4904
4905 // common return if i==m
4906 return;
4907
4908 // clean up loops definitions
4909
4910 #if defined(TARGET_X64_INTEL_HASWELL)
4911 left_12:
4912 jj = 0;
4913 for(; jj<n; jj+=4)
4914 {
4915 kernel_dtrmm_nn_rl_12x4_vs_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], sdd, m-ii, n-jj);
4916 }
4917 return;
4918 #endif
4919
4920 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
4921 left_8:
4922 jj = 0;
4923 for(; jj<n; jj+=4)
4924 {
4925 kernel_dtrmm_nn_rl_8x4_vs_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], sdd, m-ii, n-jj);
4926 }
4927 return;
4928 #endif
4929
4930 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
4931 left_8_gen:
4932 jj = 0;
4933 for(; jj<n; jj+=4)
4934 {
4935 kernel_dtrmm_nn_rl_8x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, offsetD, &pD[ii*sdd+jj*ps], sdd, 0, m-ii, 0, n-jj);
4936 }
4937 return;
4938 #endif
4939
4940 #if defined(TARGET_X86_AMD_BARCELONA)
4941 left_4:
4942 jj = 0;
4943 for(; jj<n; jj+=4)
4944 {
4945 kernel_dtrmm_nn_rl_4x2_vs_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], m-ii, n-jj);
4946 if(jj<n-2)
4947 {
4948 if(offsetB+2<4)
4949 kernel_dtrmm_nn_rl_4x2_vs_lib4(n-(jj+2), &alpha, &pA[ii*sda+(jj+2)*ps], offsetB+2, &pB[jj*sdb+(jj+2)*ps], sdb, &pD[ii*sdd+(jj+2)*ps], m-ii, n-(jj+2));
4950 else
4951 kernel_dtrmm_nn_rl_4x2_vs_lib4(n-(jj+2), &alpha, &pA[ii*sda+(jj+2)*ps], offsetB+2-ps, &pB[(jj+ps)*sdb+(jj+2)*ps], sdb, &pD[ii*sdd+(jj+2)*ps], m-ii, n-(jj+2));
4952 }
4953 }
4954 return;
4955 #else
4956 left_4:
4957 jj = 0;
4958 for(; jj<n; jj+=4)
4959 {
4960 kernel_dtrmm_nn_rl_4x4_vs_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], m-ii, n-jj);
4961 }
4962 return;
4963 #endif
4964
4965 left_4_gen:
4966 jj = 0;
4967 for(; jj<n; jj+=4)
4968 {
4969 kernel_dtrmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, offsetD, &pD[ii*sdd+jj*ps], sdd, 0, m-ii, 0, n-jj);
4970 }
4971 return;
4972 }
4973
4974
4975
4976 void blasfeo_dsyrk_ln(int m, int k, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, double beta, struct blasfeo_dmat *sC, int ci, int cj, struct blasfeo_dmat *sD, int di, int dj)
4977 {
4978
4979 // fast return
4980 if(m<=0)
4981 return;
4982
4983 // invalidate stored inverse diagonal of result matrix
4984 sD->use_dA = 0;
4985
4986 const int ps = 4;
4987
4988 int sda = sA->cn;
4989 int sdb = sB->cn;
4990 int sdc = sC->cn;
4991 int sdd = sD->cn;
4992 int air = ai & (ps-1);
4993 int bir = bi & (ps-1);
4994 double *pA = sA->pA + aj*ps + (ai-air)*sda;
4995 double *pB = sB->pA + bj*ps + (bi-bir)*sdb;
4996 double *pC = sC->pA + cj*ps;
4997 double *pD = sD->pA + dj*ps;
4998
4999 int ci0 = ci;//-air;
5000 int di0 = di;//-air;
5001 int offsetC;
5002 int offsetD;
5003 if(ci0>=0)
5004 {
5005 pC += ci0/ps*ps*sdc;
5006 offsetC = ci0%ps;
5007 }
5008 else
5009 {
5010 pC += -4*sdc;
5011 offsetC = ps+ci0;
5012 }
5013 if(di0>=0)
5014 {
5015 pD += di0/ps*ps*sdd;
5016 offsetD = di0%ps;
5017 }
5018 else
5019 {
5020 pD += -4*sdd;
5021 offsetD = ps+di0;
5022 }
5023
5024 void *mem;
5025 double *pU, *pA2;
5026 int sdu, sda2;
5027
5028 // TODO visual studio alignment
5029 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
5030 ALIGNED( double pU0[3*4*K_MAX_STACK], 64 );
5031 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57)
5032 ALIGNED( double pU0[2*4*K_MAX_STACK], 64 );
5033 #elif defined(TARGET_GENERIC)
5034 double pU0[1*4*K_MAX_STACK];
5035 #else
5036 ALIGNED( double pU0[1*4*K_MAX_STACK], 64 );
5037 #endif
5038 int sdu0 = (k+3)/4*4;
5039 sdu0 = sdu0<K_MAX_STACK ? sdu0 : K_MAX_STACK;
5040
5041 // allocate memory
5042 if(k>K_MAX_STACK)
5043 {
5044 sdu = (k+ps-1)/ps*ps;
5045 mem = malloc(12*sdu*sizeof(double)+63);
5046 blasfeo_align_64_byte(mem, (void **) &pU);
5047 }
5048 else
5049 {
5050 pU = pU0;
5051 sdu = sdu0;
5052 }
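// pU is scratch storage used to repack rows of A when ai is not panel-aligned
// (air!=0): 3, 2 or 1 panels of 4 rows depending on the target's kernel height; it
// lives on the stack for k up to K_MAX_STACK and is heap-allocated with 64-byte
// alignment (hence the extra 63 bytes in the malloc) beyond that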
5053
5054
5055 int i, j, n1;
5056
5057 int idxB;
5058
5059
5060
5061 // algorithm scheme
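// four variants of the main loop: loop_000 for the fully aligned case, loop_B00 when
// only B carries a row offset (bir!=0), loop_0CD when only C/D are offset, and
// loop_BCD when both are; the offset cases rely on _gen_ kernels with explicit offsets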
5062 if(offsetC==0 & offsetD==0)
5063 {
5064 if(bir==0)
5065 {
5066 // printf("\n000\n");
5067 goto loop_000;
5068 }
5069 else
5070 {
5071 // printf("\nB00\n");
5072 goto loop_B00;
5073 }
5074 }
5075 else
5076 {
5077 if(bir==0)
5078 {
5079 // printf("\n0CD\n");
5080 goto loop_0CD;
5081 }
5082 else
5083 {
5084 // printf("\nBCD\n");
5085 goto loop_BCD;
5086 }
5087 }
5088 // should never get here
5089 goto end;
5090
5091
5092
5093 // main loop aligned
5094 loop_000:
5095 i = 0;
5096 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
5097 for(; i<m-11; i+=12)
5098 {
5099 if(air==0)
5100 {
5101 pA2 = pA+i*sda;
5102 sda2 = sda;
5103 }
5104 else
5105 {
5106 #if defined(TARGET_X64_INTEL_HASWELL)
5107 kernel_dpacp_nn_12_lib4(k, air, pA+i*sda, sda, pU, sdu);
5108 #else
5109 kernel_dpacp_nn_4_lib4(k, air, pA+(i+0)*sda, sda, pU+0*sdu);
5110 kernel_dpacp_nn_4_lib4(k, air, pA+(i+4)*sda, sda, pU+4*sdu);
5111 kernel_dpacp_nn_4_lib4(k, air, pA+(i+8)*sda, sda, pU+8*sdu);
5112 #endif
5113 pA2 = pU;
5114 sda2 = sdu;
5115 }
5116 j = 0;
5117 // main loop
5118 for(; j<i; j+=4)
5119 {
5120 kernel_dgemm_nt_12x4_lib4(k, &alpha, pA2, sda2, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
5121 }
5122 kernel_dsyrk_nt_l_12x4_lib4(k, &alpha, pA2, sda2, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
5123 #if defined(TARGET_X64_INTEL_HASWELL)
5124 kernel_dsyrk_nt_l_8x8_lib4(k, &alpha, pA2+4*sda2, sda2, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd);
5125 #else
5126 kernel_dsyrk_nt_l_8x4_lib4(k, &alpha, pA2+4*sda2, sda2, &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd);
5127 kernel_dsyrk_nt_l_4x4_lib4(k, &alpha, pA2+8*sda2, &pB[(j+8)*sdb], &beta, &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd]);
5128 #endif
5129 }
5130 if(m>i)
5131 {
5132 if(m-i<=4)
5133 {
5134 goto left_4;
5135 }
5136 else if(m-i<=8)
5137 {
5138 goto left_8;
5139 }
5140 else
5141 {
5142 goto left_12;
5143 }
5144 }
5145 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57)
5146 for(; i<m-7; i+=8)
5147 {
5148 if(air==0)
5149 {
5150 pA2 = pA+i*sda;
5151 sda2 = sda;
5152 }
5153 else
5154 {
5155 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
5156 kernel_dpacp_nn_8_lib4(k, air, pA+i*sda, sda, pU, sdu);
5157 #else
5158 kernel_dpacp_nn_4_lib4(k, air, pA+(i+0)*sda, sda, pU+0*sdu);
5159 kernel_dpacp_nn_4_lib4(k, air, pA+(i+4)*sda, sda, pU+4*sdu);
5160 #endif
5161 pA2 = pU;
5162 sda2 = sdu;
5163 }
5164 j = 0;
5165 // main loop
5166 for(; j<i; j+=4)
5167 {
5168 kernel_dgemm_nt_8x4_lib4(k, &alpha, pA2, sda2, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
5169 }
5170 kernel_dsyrk_nt_l_8x4_lib4(k, &alpha, pA2, sda2, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
5171 kernel_dsyrk_nt_l_4x4_lib4(k, &alpha, pA2+4*sda2, &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd]);
5172 }
5173 if(m>i)
5174 {
5175 if(m-i<=4)
5176 {
5177 goto left_4;
5178 }
5179 else
5180 {
5181 goto left_8;
5182 }
5183 }
5184 #else
5185 for(; i<m-3; i+=4)
5186 {
5187 if(air==0)
5188 {
5189 pA2 = pA+i*sda;
5190 sda2 = sda;
5191 }
5192 else
5193 {
5194 kernel_dpacp_nn_4_lib4(k, air, pA+i*sda, sda, pU);
5195 pA2 = pU;
5196 sda2 = sdu;
5197 }
5198 j = 0;
5199 // main loop
5200 for(; j<i; j+=4)
5201 {
5202 kernel_dgemm_nt_4x4_lib4(k, &alpha, pA2, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
5203 }
5204 kernel_dsyrk_nt_l_4x4_lib4(k, &alpha, pA2, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
5205 }
5206 if(m>i)
5207 {
5208 goto left_4;
5209 }
5210 #endif
5211 // common return if i==m
5212 goto end;
5213
5214
5215
5216 // main loop aligned
5217 loop_B00:
5218 i = 0;
5219 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_X64_INTEL_HASWELL)
5220 for(; i<m-7; i+=8)
5221 {
5222 if(air==0)
5223 {
5224 pA2 = pA+i*sda;
5225 sda2 = sda;
5226 }
5227 else
5228 {
5229 kernel_dpacp_nn_8_vs_lib4(k, air, pA+i*sda, sda, pU, sdu, m-i);
5230 pA2 = pU;
5231 sda2 = sdu;
5232 }
5233 j = 0;
5234 idxB = 0;
5235 if(j<i)
5236 {
5237 kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pA2, sda2, &pB[idxB*sdb], &beta, 0, &pC[j*ps+i*sdc]-bir*ps, sdc, 0, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, m-j);
5238 j += ps-bir;
5239 idxB += 4;
5240 // main loop
5241 for(; j<i+(ps-bir)-ps; j+=4, idxB+=4)
5242 {
5243 kernel_dgemm_nt_8x4_lib4(k, &alpha, pA2, sda2, &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
5244 }
5245 kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, pA2, sda2, &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, bir);
5246 j += bir;
5247 }
5248 kernel_dsyrk_nt_l_8x4_gen_lib4(k, &alpha, pA2, sda2, &pB[j*sdb], &beta, 0, &pC[j*ps+i*sdc]-bir*ps, sdc, 0, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+m-j);
5249 kernel_dsyrk_nt_l_8x4_gen_lib4(k, &alpha, pA2, sda2, &pB[(j+4)*sdb], &beta, 0, &pC[j*ps+i*sdc]+(ps-bir)*ps, sdc, 0, &pD[j*ps+i*sdd]+(ps-bir)*ps, sdd, ps-bir, m-i, 0, m-j);
5250 kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, pA2+4*sda2, &pB[(j+8)*sdb], &beta, 0, &pC[(j+4)*ps+(i+4)*sdc]+(ps-bir)*ps, sdc, 0, &pD[(j+4)*ps+(i+4)*sdd]+(ps-bir)*ps, sdd, ps-bir, m-(i+4), 0, m-(j+4));
5251 }
5252 if(m>i)
5253 {
5254 if(m-i<=4)
5255 {
5256 goto left_4_g;
5257 }
5258 else
5259 {
5260 goto left_8_g;
5261 }
5262 }
5263 #else
5264 for(; i<m-3; i+=4)
5265 {
5266 if(air==0)
5267 {
5268 pA2 = pA+i*sda;
5269 sda2 = sda;
5270 }
5271 else
5272 {
5273 kernel_dpacp_nn_4_lib4(k, air, pA+i*sda, sda, pU);
5274 pA2 = pU;
5275 sda2 = sdu;
5276 }
5277 j = 0;
5278 idxB = 0;
5279 if(j<i)
5280 {
5281 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pA2, &pB[idxB*sdb], &beta, 0, &pC[j*ps+i*sdc]-bir*ps, sdc, 0, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, 4);
5282 j += ps-bir;
5283 idxB += 4;
5284 // main loop
5285 for(; j<i+(ps-bir)-ps; j+=4, idxB+=4)
5286 {
5287 kernel_dgemm_nt_4x4_lib4(k, &alpha, pA2, &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
5288 }
5289 kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, pA2, &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, bir);
5290 j += bir;
5291 }
5292 kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, pA2, &pB[j*sdb], &beta, 0, &pC[j*ps+i*sdc]-bir*ps, sdc, 0, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, 4);
5293 kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, pA2, &pB[(j+4)*sdb], &beta, 0, &pC[j*ps+i*sdc]+(ps-bir)*ps, sdc, 0, &pD[j*ps+i*sdd]+(ps-bir)*ps, sdd, ps-bir, m-i, 0, 4);
5294 }
5295 if(m>i)
5296 {
5297 goto left_4_g;
5298 }
5299 #endif
5300 // common return if i==m
5301 goto end;
5302
5303
5304
5305 // main loop C, D not aligned
5306 loop_0CD:
5307 i = 0;
5308 #if 0//defined(TARGET_X64_INTEL_HASWELL)
5309 for(; i<m-8; i+=12)
5310 {
5311 if(air==0)
5312 {
5313 pA2 = pA+i*sda;
5314 sda2 = sda;
5315 }
5316 else
5317 {
5318 kernel_dpacp_nn_12_lib4(k, air, pA+i*sda, sda, pU, sdu);
5319 pA2 = pU;
5320 sda2 = sdu;
5321 }
5322 j = 0;
5323 // main loop
5324 for(; j<i; j+=4)
5325 {
5326 kernel_dgemm_nt_12x4_gen_lib4(k, &alpha, pA2, sda2, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
5327 }
5328 kernel_dsyrk_nt_l_12x4_gen_lib4(k, &alpha, pA2, sda2, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
5329 kernel_dsyrk_nt_l_8x8_gen_lib4(k, &alpha, pA2+4*sda2, sda2, &pB[(j+4)*sdb], sdb, &beta, offsetC, &pC[(j+4)*ps+(i+4)*sdc], sdc, offsetD, &pD[(j+4)*ps+(i+4)*sdd], sdd, 0, m-i-4, 0, m-j-4);
5330 }
5331 if(m>i)
5332 {
5333 if(m-i<=4)
5334 {
5335 goto left_4_g;
5336 }
5337 else
5338 {
5339 goto left_8_g;
5340 }
5341 }
5342 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_X64_INTEL_HASWELL)
5343 for(; i<m-4; i+=8)
5344 {
5345 if(air==0)
5346 {
5347 pA2 = pA+i*sda;
5348 sda2 = sda;
5349 }
5350 else
5351 {
5352 kernel_dpacp_nn_8_lib4(k, air, pA+i*sda, sda, pU, sdu);
5353 pA2 = pU;
5354 sda2 = sdu;
5355 }
5356 j = 0;
5357 // main loop
5358 for(; j<i; j+=4)
5359 {
5360 kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pA2, sda2, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
5361 }
5362 kernel_dsyrk_nt_l_8x4_gen_lib4(k, &alpha, pA2, sda2, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
5363 kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, pA2+4*sda2, &pB[(j+4)*sdb], &beta, offsetC, &pC[(j+4)*ps+(i+4)*sdc], sdc, offsetD, &pD[(j+4)*ps+(i+4)*sdd], sdd, 0, m-i-4, 0, m-j-4);
5364 }
5365 if(m>i)
5366 {
5367 goto left_4_g;
5368 }
5369 #else
5370 for(; i<m; i+=4)
5371 {
5372 if(air==0)
5373 {
5374 pA2 = pA+i*sda;
5375 sda2 = sda;
5376 }
5377 else
5378 {
5379 kernel_dpacp_nn_4_vs_lib4(k, air, pA+i*sda, sda, pU, m-i);
5380 pA2 = pU;
5381 sda2 = sdu;
5382 }
5383 j = 0;
5384 // main loop
5385 for(; j<i; j+=4)
5386 {
5387 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pA2, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
5388 }
5389 kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, pA2, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
5390 }
5391 #endif
5392 // common return if i==m
5393 goto end;
5394
5395
5396
5397 // main loop aligned
5398 loop_BCD:
5399 i = 0;
5400 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_X64_INTEL_HASWELL)
5401 for(; i<m-4; i+=8)
5402 {
5403 if(air==0)
5404 {
5405 pA2 = pA+i*sda;
5406 sda2 = sda;
5407 }
5408 else
5409 {
5410 kernel_dpacp_nn_8_vs_lib4(k, air, pA+i*sda, sda, pU, sdu, m-i);
5411 pA2 = pU;
5412 sda2 = sdu;
5413 }
5414 j = 0;
5415 idxB = 0;
5416 if(j<i)
5417 {
5418 kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pA2, sda2, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, m-j);
5419 j += ps-bir;
5420 idxB += 4;
5421 // main loop
5422 for(; j<i+(ps-bir)-ps; j+=4, idxB+=4)
5423 {
5424 kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pA2, sda2, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
5425 }
5426 kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pA2, sda2, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, bir); // XXX n1
5427 j += bir;
5428 }
5429 kernel_dsyrk_nt_l_8x4_gen_lib4(k, &alpha, pA2, sda2, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+m-j);
5430 kernel_dsyrk_nt_l_8x4_gen_lib4(k, &alpha, pA2, sda2, &pB[(j+4)*sdb], &beta, offsetC, &pC[j*ps+i*sdc]+(ps-bir)*ps, sdc, offsetD, &pD[j*ps+i*sdd]+(ps-bir)*ps, sdd, ps-bir, m-i, 0, m-j);
5431 kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, pA2+4*sda2, &pB[(j+8)*sdb], &beta, offsetC, &pC[(j+4)*ps+(i+4)*sdc]+(ps-bir)*ps, sdc, offsetD, &pD[(j+4)*ps+(i+4)*sdd]+(ps-bir)*ps, sdd, ps-bir, m-(i+4), 0, m-(j+4));
5432 }
5433 if(m>i)
5434 {
5435 goto left_4_g;
5436 }
5437 #else
5438 for(; i<m; i+=4)
5439 {
5440 if(air==0)
5441 {
5442 pA2 = pA+i*sda;
5443 sda2 = sda;
5444 }
5445 else
5446 {
5447 kernel_dpacp_nn_4_vs_lib4(k, air, pA+i*sda, sda, pU, m-i);
5448 pA2 = pU;
5449 sda2 = sdu;
5450 }
5451 j = 0;
5452 idxB = 0;
5453 if(j<i)
5454 {
5455 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pA2, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, m-j);
5456 j += ps-bir;
5457 idxB += 4;
5458 // main loop
5459 for(; j<i+(ps-bir)-ps; j+=4, idxB+=4)
5460 {
5461 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pA2, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
5462 }
5463 kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pA2, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, bir); // XXX n1
5464 j += bir;
5465 }
5466 kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, pA2, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+m-j);
5467 kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, pA2, &pB[(j+4)*sdb], &beta, offsetC, &pC[j*ps+i*sdc]+(ps-bir)*ps, sdc, offsetD, &pD[j*ps+i*sdd]+(ps-bir)*ps, sdd, ps-bir, m-i, 0, m-j);
5468 }
5469 #endif
5470 // common return if i==m
5471 goto end;
5472
5473
5474
5475 // clean up loops definitions
5476
5477 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
5478 left_12:
5479 if(air==0)
5480 {
5481 pA2 = pA+i*sda;
5482 sda2 = sda;
5483 }
5484 else
5485 {
5486 #if defined(TARGET_X64_INTEL_HASWELL)
5487 kernel_dpacp_nn_12_lib4(k, air, pA+i*sda, sda, pU, sdu);
5488 #else
5489 kernel_dpacp_nn_4_lib4(k, air, pA+(i+0)*sda, sda, pU+0*sdu);
5490 kernel_dpacp_nn_4_lib4(k, air, pA+(i+4)*sda, sda, pU+4*sdu);
5491 kernel_dpacp_nn_4_lib4(k, air, pA+(i+8)*sda, sda, pU+8*sdu);
5492 #endif
5493 pA2 = pU;
5494 sda2 = sdu;
5495 }
5496 j = 0;
5497 // main loop
5498 for(; j<i; j+=4)
5499 {
5500 kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, pA2, sda2, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
5501 }
5502 kernel_dsyrk_nt_l_12x4_vs_lib4(k, &alpha, pA2, sda2, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
5503 #if defined(TARGET_X64_INTEL_HASWELL)
5504 kernel_dsyrk_nt_l_8x8_vs_lib4(k, &alpha, pA2+4*sda2, sda2, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, m-i-4, m-j-4);
5505 #else
5506 kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, pA2+4*sda2, sda2, &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, m-i-4, m-j-4);
5507 kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, pA2+8*sda2, &pB[(j+8)*sdb], &beta, &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd], m-i-8, m-j-8);
5508 #endif
5509 goto end;
5510 #endif
5511
5512
5513
5514 #if defined(TARGET_X64_INTEL_HASWELL)
5515 left_8:
5516 if(air==0)
5517 {
5518 pA2 = pA+i*sda;
5519 sda2 = sda;
5520 }
5521 else
5522 {
5523 kernel_dpacp_nn_8_lib4(k, air, pA+i*sda, sda, pU, sdu);
5524 pA2 = pU;
5525 sda2 = sdu;
5526 }
5527 j = 0;
5528 // main loop
5529 for(; j<i-8; j+=12)
5530 {
5531 kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, pA2, sda2, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
5532 kernel_dgemm_nt_8x8u_vs_lib4(k, &alpha, pA2, sda2, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+i*sdc], sdc, &pD[(j+4)*ps+i*sdd], sdd, m-i, m-(j+4));
5533 }
5534 if(j<i-4)
5535 {
5536 kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, pA2, sda2, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
5537 kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, pA2, &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+i*sdc], &pD[(j+4)*ps+i*sdd], m-i, m-(j+4));
5538 j += 8;
5539 }
5540 else if(j<i)
5541 {
5542 kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, pA2, sda2, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
5543 j += 4;
5544 }
5545 kernel_dsyrk_nt_l_8x8_vs_lib4(k, &alpha, pA2, sda2, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
5546 goto end;
5547 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
5548 left_8:
5549 if(air==0)
5550 {
5551 pA2 = pA+i*sda;
5552 sda2 = sda;
5553 }
5554 else
5555 {
5556 kernel_dpacp_nn_8_lib4(k, air, pA+i*sda, sda, pU, sdu);
5557 pA2 = pU;
5558 sda2 = sdu;
5559 }
5560 j = 0;
5561 // main loop
5562 for(; j<i; j+=4)
5563 {
5564 kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, pA2, sda2, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
5565 }
5566 kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, pA2, sda2, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
5567 kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, pA2+4*sda2, &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, m-j-4);
5568 goto end;
5569 #elif defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
5570 left_8:
5571 if(air==0)
5572 {
5573 pA2 = pA+i*sda;
5574 sda2 = sda;
5575 }
5576 else
5577 {
5578 kernel_dpacp_nn_4_lib4(k, air, pA+(i+0)*sda, sda, pU+0*sdu);
5579 kernel_dpacp_nn_4_lib4(k, air, pA+(i+4)*sda, sda, pU+4*sdu);
5580 pA2 = pU;
5581 sda2 = sdu;
5582 }
5583 j = 0;
5584 // main loop
5585 for(; j<i; j+=4)
5586 {
5587 kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, pA2, sda2, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
5588 }
5589 kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, pA2, sda2, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
5590 kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, pA2+4*sda2, &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, m-j-4);
5591 goto end;
5592 #endif
5593
5594
5595
5596 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
	left_8_g:
	if(air==0)
		{
		pA2 = pA+i*sda;
		sda2 = sda;
		}
	else
		{
		kernel_dpacp_nn_8_vs_lib4(k, air, pA+i*sda, sda, pU, sdu, m-i);
		pA2 = pU;
		sda2 = sdu;
		}
	j = 0;
	idxB = 0;
	if(j<i)
		{
		kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pA2, sda2, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, m-j);
		j += ps-bir;
		idxB += 4;
		// main loop
		for(; j<i+(ps-bir)-ps; j+=4, idxB+=4)
			{
			kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pA2, sda2, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
			}
		kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pA2, sda2, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, bir); // XXX n1
		j += bir;
		}
	kernel_dsyrk_nt_l_8x4_gen_lib4(k, &alpha, pA2, sda2, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+m-j);
	kernel_dsyrk_nt_l_8x4_gen_lib4(k, &alpha, pA2, sda2, &pB[(j+4)*sdb], &beta, offsetC, &pC[j*ps+i*sdc]+(ps-bir)*ps, sdc, offsetD, &pD[j*ps+i*sdd]+(ps-bir)*ps, sdd, ps-bir, m-i, 0, m-j);
	kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, pA2+4*sda2, &pB[(j+8)*sdb], &beta, offsetC, &pC[(j+4)*ps+(i+4)*sdc]+(ps-bir)*ps, sdc, offsetD, &pD[(j+4)*ps+(i+4)*sdd]+(ps-bir)*ps, sdd, ps-bir, m-(i+4), 0, m-(j+4));
	goto end;
#endif



#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
	left_4:
	if(air==0)
		{
		pA2 = pA+i*sda;
		sda2 = sda;
		}
	else
		{
		kernel_dpacp_nn_4_vs_lib4(k, air, pA+i*sda, sda, pU, m-i);
		pA2 = pU;
		sda2 = sdu;
		}
	j = 0;
	// main loop
	for(; j<i-8; j+=12)
		{
		kernel_dgemm_nt_4x12_vs_lib4(k, &alpha, pA2, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
		}
	if(j<i-4)
		{
		kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, pA2, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
		j += 8;
		}
	else if(j<i)
		{
		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, pA2, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
		j += 4;
		}
	kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, pA2, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
	goto end;
#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
	left_4:
	if(air==0)
		{
		pA2 = pA+i*sda;
		sda2 = sda;
		}
	else
		{
		kernel_dpacp_nn_4_vs_lib4(k, air, pA+i*sda, sda, pU, m-i);
		pA2 = pU;
		sda2 = sdu;
		}
	j = 0;
	// main loop
	for(; j<i-4; j+=8)
		{
		kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, pA2, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
		}
	if(j<i)
		{
		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, pA2, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
		j += 4;
		}
	kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, pA2, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
	goto end;
#else
	left_4:
	if(air==0)
		{
		pA2 = pA+i*sda;
		sda2 = sda;
		}
	else
		{
		kernel_dpacp_nn_4_vs_lib4(k, air, pA+i*sda, sda, pU, m-i);
		pA2 = pU;
		sda2 = sdu;
		}
	j = 0;
	// main loop
	for(; j<i; j+=4)
		{
		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, pA2, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
		}
	kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, pA2, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
	goto end;
#endif


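	// generalized 4-row clean-up: same scheme as left_8_g, one 4-row panel at a time,
	// with a separate fast path when B is panel-aligned (bir==0)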
	left_4_g:
	j = 0;
	if(air==0)
		{
		pA2 = pA+i*sda;
		sda2 = sda;
		}
	else
		{
		kernel_dpacp_nn_4_vs_lib4(k, air, pA+i*sda, sda, pU, m-i);
		pA2 = pU;
		sda2 = sdu;
		}
	if(bir!=0)
		{
		idxB = 0;
		if(j<i)
			{
			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pA2, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, m-j);
			j += ps-bir;
			idxB += 4;
			// main loop
			for(; j<i+(ps-bir)-ps; j+=4, idxB+=4)
				{
				kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pA2, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
				}
			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pA2, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, bir); // XXX n1
			j += bir;
			}
		kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, pA2, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+m-j);
		kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, pA2, &pB[(j+4)*sdb], &beta, offsetC, &pC[j*ps+i*sdc]+(ps-bir)*ps, sdc, offsetD, &pD[j*ps+i*sdd]+(ps-bir)*ps, sdd, ps-bir, m-i, 0, m-j);
		}
	else
		{
		// main loop
		for(; j<i; j+=4)
			{
			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pA2, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
			}
		kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, pA2, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
		}
	goto end;



	end:
	if(k>K_MAX_STACK)
		{
		free(mem);
		}
	return;



#if 0
	// main loop
	i = 0;
	if(offsetC==0 & offsetD==0)
		{
#if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
		for(; i<m-11; i+=12)
			{
			j = 0;
			for(; j<i; j+=4)
				{
				kernel_dgemm_nt_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
				}
			kernel_dsyrk_nt_l_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
#if defined(TARGET_X64_INTEL_HASWELL)
			kernel_dsyrk_nt_l_8x8_lib4(k, &alpha, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd);
#else
			kernel_dsyrk_nt_l_8x4_lib4(k, &alpha, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd);
			kernel_dsyrk_nt_l_4x4_lib4(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd]);
#endif
			}
		if(m>i)
			{
			if(m-i<=4)
				{
				goto left_4;
				}
			else if(m-i<=8)
				{
				goto left_8;
				}
			else
				{
				goto left_12;
				}
			}
#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57)
		for(; i<m-7; i+=8)
			{
			j = 0;
			for(; j<i; j+=4)
				{
				kernel_dgemm_nt_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
				}
			kernel_dsyrk_nt_l_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
			kernel_dsyrk_nt_l_4x4_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd]);
			}
		if(m>i)
			{
			if(m-i<=4)
				{
				goto left_4;
				}
			else
				{
				goto left_8;
				}
			}
#elif defined(TARGET_X86_AMD_BARCELONA)
		for(; i<m-3; i+=4)
			{
			j = 0;
			for(; j<i; j+=4)
				{
				kernel_dgemm_nt_4x2_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
				kernel_dgemm_nt_4x2_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb+2], &beta, &pC[(j+2)*ps+i*sdc], &pD[(j+2)*ps+i*sdd]);
				}
			kernel_dsyrk_nt_l_4x2_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
			kernel_dsyrk_nt_l_2x2_lib4(k, &alpha, &pA[i*sda+2], &pB[j*sdb+2], &beta, &pC[(j+2)*ps+i*sdc+2], &pD[(j+2)*ps+i*sdd+2]);
			}
		if(m>i)
			{
			goto left_4;
			}
#else
		for(; i<m-3; i+=4)
			{
			j = 0;
			for(; j<i; j+=4)
				{
				kernel_dgemm_nt_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
				}
			kernel_dsyrk_nt_l_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
			}
		if(m>i)
			{
			goto left_4;
			}
#endif
		}
	else
		{
#if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE)
		for(; i<m-4; i+=8)
			{
			j = 0;
			for(; j<i; j+=4)
				{
				kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
				}
			kernel_dsyrk_nt_l_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
			kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, offsetC, &pC[(j+4)*ps+(i+4)*sdc], sdc, offsetD, &pD[(j+4)*ps+(i+4)*sdd], sdd, 0, m-i-4, 0, m-j-4);
			}
		if(m>i)
			{
			goto left_4_gen;
			}
#else
		for(; i<m; i+=4)
			{
			j = 0;
			for(; j<i; j+=4)
				{
				kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
				}
			kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
			}
#endif
		}

	// common return if i==m
	return;

	// clean up loops definitions

#if defined(TARGET_X64_INTEL_HASWELL)
	left_12:
	j = 0;
	for(; j<i; j+=4)
		{
		kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
		}
	kernel_dsyrk_nt_l_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
	kernel_dsyrk_nt_l_8x8_vs_lib4(k, &alpha, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, m-i-4, m-j-4);
//	kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd], m-i-8, n-j-8);
	return;
#elif defined(TARGET_ARMV8A_ARM_CORTEX_A53)
	left_12:
	j = 0;
	for(; j<i; j+=4)
		{
		kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
		}
	kernel_dsyrk_nt_l_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
	kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, m-i-4, m-j-4);
	kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd], m-i-8, m-j-8);
	return;
#endif

#if defined(TARGET_X64_INTEL_HASWELL)
	left_8:
	j = 0;
	for(; j<i-8; j+=12)
		{
		kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
		kernel_dgemm_nt_8x8u_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+i*sdc], sdc, &pD[(j+4)*ps+i*sdd], sdd, m-i, m-(j+4));
		}
	if(j<i-4)
		{
		kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+i*sdc], &pD[(j+4)*ps+i*sdd], m-i, m-(j+4));
		j += 8;
		}
	else if(j<i)
		{
		kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
		j += 4;
		}
	kernel_dsyrk_nt_l_8x8_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
//	kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, n-j-4);
	return;
#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
	left_8:
	j = 0;
	for(; j<i; j+=4)
		{
		kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
		}
	kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
	kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, m-j-4);
	return;
#endif

#if defined(TARGET_X64_INTEL_HASWELL)
	left_4:
	j = 0;
	for(; j<i-8; j+=12)
		{
		kernel_dgemm_nt_4x12_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
		}
	if(j<i-4)
		{
		kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
		j += 8;
		}
	else if(j<i)
		{
		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
		j += 4;
		}
	kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
	return;
#elif defined(TARGET_X86_AMD_BARCELONA)
	left_4:
	j = 0;
	for(; j<i; j+=4)
		{
		kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
		kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb+2], &beta, &pC[(j+2)*ps+i*sdc], &pD[(j+2)*ps+i*sdd], m-i, m-(j+2));
		}
	kernel_dsyrk_nt_l_4x2_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
	if(j<m-2)
		kernel_dsyrk_nt_l_2x2_vs_lib4(k, &alpha, &pA[i*sda+2], &pB[j*sdb+2], &beta, &pC[(j+2)*ps+i*sdc+2], &pD[(j+2)*ps+i*sdd+2], m-(i+2), m-(j+2));
	return;
#else
	left_4:
	j = 0;
	for(; j<i; j+=4)
		{
		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
		}
	kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
	return;
#endif

	left_4_gen:
	j = 0;
	for(; j<i; j+=4)
		{
		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
		}
	kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
	return;
#endif

	}


void blasfeo_dsyrk_ln_mn(int m, int n, int k, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, double beta, struct blasfeo_dmat *sC, int ci, int cj, struct blasfeo_dmat *sD, int di, int dj)
	{
	if(m<=0 | n<=0)
		return;

	if(ai!=0 | bi!=0)
		{
		printf("\nblasfeo_dsyrk_ln_mn: feature not implemented yet: ai=%d, bi=%d\n", ai, bi);
		exit(1);
		}

	// invalidate stored inverse diagonal of result matrix
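	// (sD->dA caches the inverse of the diagonal of D for use by factorizations;
	// overwriting D leaves that cache stale, so the flag is dropped here)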
	sD->use_dA = 0;

	const int ps = 4;

	int i, j;

	int sda = sA->cn;
	int sdb = sB->cn;
	int sdc = sC->cn;
	int sdd = sD->cn;
	double *pA = sA->pA + aj*ps;
	double *pB = sB->pA + bj*ps;
	double *pC = sC->pA + cj*ps + (ci-(ci&(ps-1)))*sdc;
	double *pD = sD->pA + dj*ps + (di-(di&(ps-1)))*sdd;

	// TODO ai and bi
	int offsetC;
	int offsetD;
	offsetC = ci&(ps-1);
	offsetD = di&(ps-1);
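	// e.g. with ps=4 and ci=6: pC above points to the start of C's second 4-row panel,
	// and offsetC = 6&3 = 2 selects row 2 within that panel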

	// main loop
	i = 0;
	if(offsetC==0 & offsetD==0)
		{
#if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
		for(; i<m-11; i+=12)
			{
			j = 0;
			for(; j<i & j<n-3; j+=4)
				{
				kernel_dgemm_nt_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
				}
			if(j<n)
				{
				if(j<i) // dgemm
					{
					kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
					}
				else // dsyrk
					{
					if(j<n-11)
						{
						kernel_dsyrk_nt_l_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
#if defined(TARGET_X64_INTEL_HASWELL)
						kernel_dsyrk_nt_l_8x8_lib4(k, &alpha, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd);
#else
						kernel_dsyrk_nt_l_8x4_lib4(k, &alpha, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd);
						kernel_dsyrk_nt_l_4x4_lib4(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd]);
#endif
						}
					else
						{
						kernel_dsyrk_nt_l_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
						if(j<n-4)
							{
							kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, m-i-4, n-j-4);
							if(j<n-8)
								{
								kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd], m-i-8, n-j-8);
								}
							}
						}
					}
				}
			}
		if(m>i)
			{
			if(m-i<=4)
				{
				goto left_4;
				}
			else if(m-i<=8)
				{
				goto left_8;
				}
			else
				{
				goto left_12;
				}
			}
#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57)
		for(; i<m-7; i+=8)
			{
			j = 0;
			for(; j<i & j<n-3; j+=4)
				{
				kernel_dgemm_nt_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
				}
			if(j<n)
				{
				if(j<i) // dgemm
					{
					kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
					}
				else // dsyrk
					{
					if(j<n-7)
						{
						kernel_dsyrk_nt_l_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
						kernel_dsyrk_nt_l_4x4_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd]);
						}
					else
						{
						kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
						if(j<n-4)
							{
							kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, n-j-4);
							}
						}
					}
				}
			}
		if(m>i)
			{
			if(m-i<=4)
				{
				goto left_4;
				}
			else
				{
				goto left_8;
				}
			}
#elif defined(TARGET_X86_AMD_BARCELONA)
		for(; i<m-3; i+=4)
			{
			j = 0;
			for(; j<i & j<n-3; j+=4)
				{
				kernel_dgemm_nt_4x2_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
				kernel_dgemm_nt_4x2_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb+2], &beta, &pC[(j+2)*ps+i*sdc], &pD[(j+2)*ps+i*sdd]);
				}
			if(j<n)
				{
				if(j<i) // dgemm
					{
					kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
					if(j<n-2)
						kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb+2], &beta, &pC[(j+2)*ps+i*sdc], &pD[(j+2)*ps+i*sdd], m-i, n-(j+2));
					}
				else // dsyrk
					{
					if(j<n-3)
						{
						kernel_dsyrk_nt_l_4x2_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
						kernel_dsyrk_nt_l_2x2_lib4(k, &alpha, &pA[i*sda+2], &pB[j*sdb+2], &beta, &pC[(j+2)*ps+i*sdc+2], &pD[(j+2)*ps+i*sdd+2]);
						}
					else
						{
						kernel_dsyrk_nt_l_4x2_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
						if(j<n-2)
							kernel_dsyrk_nt_l_2x2_vs_lib4(k, &alpha, &pA[i*sda+2], &pB[j*sdb+2], &beta, &pC[(j+2)*ps+i*sdc+2], &pD[(j+2)*ps+i*sdd+2], m-(i+2), n-(j+2));
						}
					}
				}
			}
		if(m>i)
			{
			goto left_4;
			}
#else
		for(; i<m-3; i+=4)
			{
			j = 0;
			for(; j<i & j<n-3; j+=4)
				{
				kernel_dgemm_nt_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
				}
			if(j<n)
				{
				if(j<i) // dgemm
					{
					kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
					}
				else // dsyrk
					{
					if(j<n-3)
						{
						kernel_dsyrk_nt_l_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
						}
					else
						{
						kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
						}
					}
				}
			}
		if(m>i)
			{
			goto left_4;
			}
#endif
		}
	else
		{
#if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE)
		for(; i<m-4; i+=8)
			{
			j = 0;
			for(; j<i & j<n; j+=4)
				{
				kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
				}
			if(j<n)
				{
				kernel_dsyrk_nt_l_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
				if(j<n-4)
					{
					kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, offsetC, &pC[(j+4)*ps+(i+4)*sdc], sdc, offsetD, &pD[(j+4)*ps+(i+4)*sdd], sdd, 0, m-i-4, 0, n-j-4);
					}
				}
			}
		if(m>i)
			{
			goto left_4_gen;
			}
#else
		for(; i<m; i+=4)
			{
			j = 0;
			for(; j<i & j<n; j+=4)
				{
				kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
				}
			if(j<n)
				{
				kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
				}
			}
#endif
		}

	// common return if i==m
	return;

	// clean up loops definitions

#if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
	left_12:
	j = 0;
	for(; j<i & j<n; j+=4)
		{
		kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
		}
	if(j<n)
		{
		kernel_dsyrk_nt_l_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
		if(j<n-4)
			{
			kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, m-i-4, n-j-4);
			if(j<n-8)
				{
				kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd], m-i-8, n-j-8);
				}
			}
		}
	return;
#endif

#if defined(TARGET_X64_INTEL_HASWELL)
	left_8:
	j = 0;
	for(; j<i-8 & j<n-8; j+=12)
		{
		kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
		kernel_dgemm_nt_8x8u_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+i*sdc], sdc, &pD[(j+4)*ps+i*sdd], sdd, m-i, n-(j+4));
		}
	if(j<i-4 & j<n-4)
		{
		kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+i*sdc], &pD[(j+4)*ps+i*sdd], m-i, n-(j+4));
		j += 8;
		}
	if(j<i & j<n)
		{
		kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
		j += 4;
		}
	if(j<n)
		{
		kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
		if(j<n-4)
			{
			kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, n-j-4);
			}
		}
	return;
#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
	left_8:
	j = 0;
	for(; j<i & j<n; j+=4)
		{
		kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
		}
	if(j<n)
		{
		kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
		if(j<n-4)
			{
			kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, n-j-4);
			}
		}
	return;
#endif

#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
	left_4:
	j = 0;
	for(; j<i-8 & j<n-8; j+=12)
		{
		kernel_dgemm_nt_4x12_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		}
	if(j<i-4 & j<n-4)
		{
		kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		j += 8;
		}
	else if(j<i & j<n)
		{
		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		j += 4;
		}
	if(j<n)
		{
		kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		}
	return;
#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
	left_4:
	j = 0;
	for(; j<i-4 & j<n-4; j+=8)
		{
		kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		}
	if(j<i & j<n)
		{
		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		j += 4;
		}
	if(j<n)
		{
		kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		}
	return;
#elif defined(TARGET_X86_AMD_BARCELONA)
	left_4:
	j = 0;
	for(; j<i & j<n; j+=4)
		{
		kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		if(j<n-2)
			kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb+2], &beta, &pC[(j+2)*ps+i*sdc], &pD[(j+2)*ps+i*sdd], m-i, n-(j+2));
		}
	if(j<n)
		{
		kernel_dsyrk_nt_l_4x2_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		if(j<n-2)
			kernel_dsyrk_nt_l_2x2_vs_lib4(k, &alpha, &pA[i*sda+2], &pB[j*sdb+2], &beta, &pC[(j+2)*ps+i*sdc+2], &pD[(j+2)*ps+i*sdd+2], m-(i+2), n-(j+2));
		}
	return;
#else
	left_4:
	j = 0;
	for(; j<i & j<n; j+=4)
		{
		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		}
	if(j<n)
		{
		kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
		}
	return;
#endif

	left_4_gen:
	j = 0;
	for(; j<i & j<n; j+=4)
		{
		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
		}
	if(j<n)
		{
		kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
		}
	return;

	}
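#if 0
// minimal usage sketch (disabled; example_dsyrk_ln_mn is a hypothetical helper, not part
// of the library): compute D = alpha*A*A^T + beta*C on the lower triangle of an m x n
// result, assuming the BLASFEO aux API (blasfeo_allocate_dmat, blasfeo_pack_dmat,
// blasfeo_dgese, blasfeo_free_dmat) declared in the headers included above
void example_dsyrk_ln_mn(int m, int n, int k, double *A, int lda)
	{
	struct blasfeo_dmat sA, sC, sD;
	blasfeo_allocate_dmat(m, k, &sA); // panel-major storage for the m x k factor
	blasfeo_allocate_dmat(m, n, &sC);
	blasfeo_allocate_dmat(m, n, &sD);
	blasfeo_pack_dmat(m, k, A, lda, &sA, 0, 0); // pack column-major A into sA
	blasfeo_dgese(m, n, 0.0, &sC, 0, 0); // zero C so beta*C contributes nothing
	blasfeo_dsyrk_ln_mn(m, n, k, 1.0, &sA, 0, 0, &sA, 0, 0, 0.0, &sC, 0, 0, &sD, 0, 0);
	blasfeo_free_dmat(&sA);
	blasfeo_free_dmat(&sC);
	blasfeo_free_dmat(&sD);
	}
#endif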



void blasfeo_dsyrk_lt(int m, int k, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, double beta, struct blasfeo_dmat *sC, int ci, int cj, struct blasfeo_dmat *sD, int di, int dj)
	{
#ifndef BENCHMARKS_MODE
	printf("\nblasfeo_dsyrk_lt: feature not implemented yet\n");
	exit(1);
#endif
	return;
	}



void blasfeo_dsyrk_un(int m, int k, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, double beta, struct blasfeo_dmat *sC, int ci, int cj, struct blasfeo_dmat *sD, int di, int dj)
	{
#ifndef BENCHMARKS_MODE
	printf("\nblasfeo_dsyrk_un: feature not implemented yet\n");
	exit(1);
#endif
	return;
	}



void blasfeo_dsyrk_ut(int m, int k, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, double beta, struct blasfeo_dmat *sC, int ci, int cj, struct blasfeo_dmat *sD, int di, int dj)
	{

	// fast return
	if(m<=0)
		return;

	// invalidate stored inverse diagonal of result matrix
	sD->use_dA = 0;

	const int ps = 4;

	int sda = sA->cn;
	int sdb = sB->cn;
	int sdc = sC->cn;
	int sdd = sD->cn;

	int air = ai & (ps-1);
	int bir = bi & (ps-1);
	int cir = ci & (ps-1);
	int dir = di & (ps-1);
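	// air/bir/cir/dir: row offsets of ai/bi/ci/di within their 4-row panels (0..3)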

	double *pA = sA->pA + aj*ps + (ai-air)*sda;
	double *pB = sB->pA + bj*ps + (bi-bir)*sdb;
	double *pC = sC->pA + cj*ps + (ci-cir)*sdc;
	double *pD = sD->pA + dj*ps + (di-dir)*sdd;

	int offsetA = air;
	int offsetB = bir;
	int offsetC = cir;
	int offsetD = dir;

	void *mem;
	double *pU, *pA2;
	int sdu, sda2;

	// TODO visual studio alignment
#if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
	ALIGNED( double pU0[3*4*K_MAX_STACK], 64 );
#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57)
	ALIGNED( double pU0[2*4*K_MAX_STACK], 64 );
#elif defined(TARGET_GENERIC)
	double pU0[1*4*K_MAX_STACK];
#else
	ALIGNED( double pU0[1*4*K_MAX_STACK], 64 );
#endif
	int sdu0 = (k+3)/4*4;
	sdu0 = sdu0<K_MAX_STACK ? sdu0 : K_MAX_STACK;

	// allocate memory
	if(k>K_MAX_STACK)
		{
		sdu = (k+ps-1)/ps*ps;
		mem = malloc(12*sdu*sizeof(double)+63);
		blasfeo_align_64_byte(mem, (void **) &pU);
		}
	else
		{
		pU = pU0;
		sdu = sdu0;
		}
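	// sizing note: 12*sdu doubles is enough for the three 4-row panels of pU used by
	// the widest (12-row) kernels, and the extra 63 bytes let blasfeo_align_64_byte
	// round pU up to the next 64-byte boundary without overrunning the allocation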


	int i, j, n1;

	int idxB;


	if(ci!=0 | di!=0)
		{
		printf("\nblasfeo_dsyrk_ut: feature not implemented yet: ci!=0 | di!=0\n");
		exit(1);
		}

	// algorithm scheme
	goto loop_00;
#if 0
	if(offsetC==0 & offsetD==0)
		{
//		printf("\n00\n");
		goto loop_00;
		}
	else
		{
//		printf("\nCD\n");
		goto loop_CD;
		}
#endif
	// should never get here
	goto end;



	// main loop aligned
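	// loop_00: each 4-column block of A is transposed into the contiguous buffer pU
	// (kernel_dpacp_tn_4_lib4), so the nn upper kernels can then sweep the block row
	// of the result to the right of the diagonal using B in place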
	loop_00:
	i = 0;
#if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
	for(; i<m-11; i+=12)
		{
		kernel_dpacp_tn_4_lib4(k, offsetA, pA+i*ps, sda, pU);
		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(i+4)*ps, sda, pU+4*sdu);
		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(i+8)*ps, sda, pU+8*sdu);
#if defined(TARGET_X64_INTEL_HASWELL)
		kernel_dsyrk_nn_u_8x8_lib4(k, &alpha, pU, sdu, offsetB, pB+i*ps, sdb, &beta, pC+i*sdc+i*ps, sdc, pD+i*sdd+i*ps, sdd);
#else
		kernel_dsyrk_nn_u_4x4_lib4(k, &alpha, pU, offsetB, pB+i*ps, sdb, &beta, pC+i*sdc+i*ps, pD+i*sdd+i*ps);
		kernel_dsyrk_nn_u_8x4_lib4(k, &alpha, pU, sdu, offsetB, pB+(i+4)*ps, sdb, &beta, pC+i*sdc+(i+4)*ps, sdc, pD+i*sdd+(i+4)*ps, sdd);
#endif
		kernel_dsyrk_nn_u_12x4_lib4(k, &alpha, pU, sdu, offsetB, pB+(i+8)*ps, sdb, &beta, pC+i*sdc+(i+8)*ps, sdc, pD+i*sdd+(i+8)*ps, sdd);
		for(j=i+12; j<m-3; j+=4)
			{
			kernel_dgemm_nn_12x4_lib4(k, &alpha, pU, sdu, offsetB, pB+j*ps, sdb, &beta, pC+i*sdc+j*ps, sdc, pD+i*sdd+j*ps, sdd); // sdd, not sdc: D has its own panel stride
			}
		if(j<m)
			{
			kernel_dgemm_nn_12x4_vs_lib4(k, &alpha, pU, sdu, offsetB, pB+j*ps, sdb, &beta, pC+i*sdc+j*ps, sdc, pD+i*sdd+j*ps, sdd, m-i, m-j);
			}
		}
	if(i<m)
		{
		if(m-i<=4)
			{
			goto left_4;
			}
		if(m-i<=8)
			{
			goto left_8;
			}
		else
			{
			goto left_12;
			}
		}
#elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57)
	for(; i<m-7; i+=8)
		{
		kernel_dpacp_tn_4_lib4(k, offsetA, pA+i*ps, sda, pU);
		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(i+4)*ps, sda, pU+4*sdu);
		kernel_dsyrk_nn_u_4x4_lib4(k, &alpha, pU, offsetB, pB+i*ps, sdb, &beta, pC+i*sdc+i*ps, pD+i*sdd+i*ps);
		kernel_dsyrk_nn_u_8x4_lib4(k, &alpha, pU, sdu, offsetB, pB+(i+4)*ps, sdb, &beta, pC+i*sdc+(i+4)*ps, sdc, pD+i*sdd+(i+4)*ps, sdd);
		for(j=i+8; j<m-3; j+=4)
			{
			kernel_dgemm_nn_8x4_lib4(k, &alpha, pU, sdu, offsetB, pB+j*ps, sdb, &beta, pC+i*sdc+j*ps, sdc, pD+i*sdd+j*ps, sdd); // sdd, not sdc: D has its own panel stride
			}
		if(j<m)
			{
			kernel_dgemm_nn_8x4_vs_lib4(k, &alpha, pU, sdu, offsetB, pB+j*ps, sdb, &beta, pC+i*sdc+j*ps, sdc, pD+i*sdd+j*ps, sdd, m-i, m-j);
			}
		}
	if(i<m)
		{
		if(m-i<=4)
			{
			goto left_4;
			}
		else
			{
			goto left_8;
			}
		}
#else
	for(; i<m-3; i+=4)
		{
		kernel_dpacp_tn_4_lib4(k, offsetA, pA+i*ps, sda, pU);
		kernel_dsyrk_nn_u_4x4_lib4(k, &alpha, pU, offsetB, pB+i*ps, sdb, &beta, pC+i*sdc+i*ps, pD+i*sdd+i*ps);
		for(j=i+4; j<m-3; j+=4)
			{
			kernel_dgemm_nn_4x4_lib4(k, &alpha, pU, offsetB, pB+j*ps, sdb, &beta, pC+i*sdc+j*ps, pD+i*sdd+j*ps);
			}
		if(j<m)
			{
			kernel_dgemm_nn_4x4_vs_lib4(k, &alpha, pU, offsetB, pB+j*ps, sdb, &beta, pC+i*sdc+j*ps, pD+i*sdd+j*ps, m-i, m-j);
			}
		}
	if(i<m)
		{
		goto left_4;
		}
#endif
	goto end;

#if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
	left_12:
	kernel_dpacp_tn_4_lib4(k, offsetA, pA+i*ps, sda, pU);
	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(i+4)*ps, sda, pU+4*sdu);
	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(i+8)*ps, sda, pU+8*sdu);
#if defined(TARGET_X64_INTEL_HASWELL)
	kernel_dsyrk_nn_u_8x8_lib4(k, &alpha, pU, sdu, offsetB, pB+i*ps, sdb, &beta, pC+i*sdc+i*ps, sdc, pD+i*sdd+i*ps, sdd);
#else
	kernel_dsyrk_nn_u_4x4_lib4(k, &alpha, pU, offsetB, pB+i*ps, sdb, &beta, pC+i*sdc+i*ps, pD+i*sdd+i*ps);
	kernel_dsyrk_nn_u_8x4_lib4(k, &alpha, pU, sdu, offsetB, pB+(i+4)*ps, sdb, &beta, pC+i*sdc+(i+4)*ps, sdc, pD+i*sdd+(i+4)*ps, sdd);
#endif
	kernel_dsyrk_nn_u_12x4_vs_lib4(k, &alpha, pU, sdu, offsetB, pB+(i+8)*ps, sdb, &beta, pC+i*sdc+(i+8)*ps, sdc, pD+i*sdd+(i+8)*ps, sdd, m-i, m-i-8);
	goto end;
#endif

#if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
	left_8:
	kernel_dpacp_tn_4_lib4(k, offsetA, pA+i*ps, sda, pU);
	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(i+4)*ps, sda, pU+4*sdu);
	kernel_dsyrk_nn_u_4x4_lib4(k, &alpha, pU, offsetB, pB+i*ps, sdb, &beta, pC+i*sdc+i*ps, pD+i*sdd+i*ps);
	kernel_dsyrk_nn_u_8x4_vs_lib4(k, &alpha, pU, sdu, offsetB, pB+(i+4)*ps, sdb, &beta, pC+i*sdc+(i+4)*ps, sdc, pD+i*sdd+(i+4)*ps, sdd, m-i, m-i-4);
	goto end;
#endif

	left_4:
	kernel_dpacp_tn_4_lib4(k, offsetA, pA+i*ps, sda, pU);
	kernel_dsyrk_nn_u_4x4_vs_lib4(k, &alpha, pU, offsetB, pB+i*ps, sdb, &beta, pC+i*sdc+i*ps, pD+i*sdd+i*ps, m-i, m-i);
	goto end;

	end:
	if(k>K_MAX_STACK)
		{
		free(mem);
		}
	return;

	}



#else

#error : wrong LA choice

#endif

