1 /**************************************************************************************************
2 *                                                                                                 *
3 * This file is part of BLASFEO.                                                                   *
4 *                                                                                                 *
5 * BLASFEO -- BLAS for embedded optimization.                                                      *
6 * Copyright (C) 2019 by Gianluca Frison.                                                          *
7 * Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
8 * All rights reserved.                                                                            *
9 *                                                                                                 *
10 * The 2-Clause BSD License                                                                        *
11 *                                                                                                 *
12 * Redistribution and use in source and binary forms, with or without                              *
13 * modification, are permitted provided that the following conditions are met:                     *
14 *                                                                                                 *
15 * 1. Redistributions of source code must retain the above copyright notice, this                  *
16 *    list of conditions and the following disclaimer.                                             *
17 * 2. Redistributions in binary form must reproduce the above copyright notice,                    *
18 *    this list of conditions and the following disclaimer in the documentation                    *
19 *    and/or other materials provided with the distribution.                                       *
20 *                                                                                                 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND                 *
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED                   *
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE                          *
24 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR                 *
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES                  *
26 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;                    *
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND                     *
28 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT                      *
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS                   *
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                                    *
31 *                                                                                                 *
32 * Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de                             *
33 *                                                                                                 *
34 **************************************************************************************************/
35 
36 #include <stdlib.h>
37 #include <stdio.h>
38 
39 #include "../include/blasfeo_common.h"
40 #include "../include/blasfeo_d_kernel.h"
41 #include "../include/blasfeo_d_aux.h"
42 #include "../include/blasfeo_d_blasfeo_api.h"
43 
44 
45 
46 /****************************
47 * old interface
48 ****************************/
49 
50 #if 0
51 void dlauum_blk_nt_l_lib(int m, int n, int nv, int *rv, int *cv, double *pA, int sda, double *pB, int sdb, int alg, double *pC, int sdc, double *pD, int sdd)
52 	{
53 
54 	if(m<=0 || n<=0)
55 		return;
56 
57 	// TODO remove
58 	double alpha, beta;
59 	if(alg==0)
60 		{
61 		alpha = 1.0;
62 		beta = 0.0;
63 		}
64 	else if(alg==1)
65 		{
66 		alpha = 1.0;
67 		beta = 1.0;
68 		}
69 	else
70 		{
71 		alpha = -1.0;
72 		beta = 1.0;
73 		}
74 
75 	// TODO remove
76 	int k = cv[nv-1];
77 
78 	const int ps = 4;
79 
80 	int i, j, l;
81 	int ii, iii, jj, kii, kiii, kjj, k0, k1;
82 
83 	i = 0;
84 	ii = 0;
85 	iii = 0;
86 
87 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
88 	for(; i<m-7; i+=8)
89 		{
90 
91 		while(ii<nv && rv[ii]<i+8)
92 			ii++;
93 		if(ii<nv)
94 			kii = cv[ii];
95 		else
96 			kii = cv[ii-1];
97 
98 		j = 0;
99 		jj = 0;
100 		for(; j<i && j<n-3; j+=4)
101 			{
102 
103 			while(jj<nv && rv[jj]<j+4)
104 				jj++;
105 			if(jj<nv)
106 				kjj = cv[jj];
107 			else
108 				kjj = cv[jj-1];
109 			k0 = kii<kjj ? kii : kjj;
110 
111 			kernel_dgemm_nt_8x4_lib4(k0, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
112 			}
113 		if(j<n)
114 			{
115 
116 			while(jj<nv && rv[jj]<j+4)
117 				jj++;
118 			if(jj<nv)
119 				kjj = cv[jj];
120 			else
121 				kjj = cv[jj-1];
122 			k0 = kii<kjj ? kii : kjj;
123 
124 			if(j<i) // dgemm
125 				{
126 				kernel_dgemm_nt_8x4_vs_lib4(k0, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, 8, n-j);
127 				}
128 			else // dsyrk
129 				{
130 				kernel_dsyrk_nt_l_8x4_vs_lib4(k0, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, 8, n-j);
131 				if(j<n-4)
132 					{
133 					kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], 4, n-j-4); // TODO
134 					}
135 				}
136 			}
137 		}
138 	if(m>i)
139 		{
140 		if(m-i<=4)
141 			{
142 			goto left_4;
143 			}
144 		else
145 			{
146 			goto left_8;
147 			}
148 		}
149 #else
150 	for(; i<m-3; i+=4)
151 		{
152 
153 		while(ii<nv && rv[ii]<i+4)
154 			ii++;
155 		if(ii<nv)
156 			kii = cv[ii];
157 		else
158 			kii = cv[ii-1];
159 //		k0 = kii;
160 //		printf("\nii %d %d %d %d %d\n", i, ii, rv[ii], cv[ii], kii);
161 
162 		j = 0;
163 		jj = 0;
164 		for(; j<i && j<n-3; j+=4)
165 			{
166 
167 			while(jj<nv && rv[jj]<j+4)
168 				jj++;
169 			if(jj<nv)
170 				kjj = cv[jj];
171 			else
172 				kjj = cv[jj-1];
173 			k0 = kii<kjj ? kii : kjj;
174 //			printf("\njj %d %d %d %d %d\n", j, jj, rv[jj], cv[jj], kjj);
175 
176 			kernel_dgemm_nt_4x4_lib4(k0, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
177 			}
178 		if(j<n)
179 			{
180 
181 			while(jj<nv && rv[jj]<j+4)
182 				jj++;
183 			if(jj<nv)
184 				kjj = cv[jj];
185 			else
186 				kjj = cv[jj-1];
187 			k0 = kii<kjj ? kii : kjj;
188 //			printf("\njj %d %d %d %d %d\n", j, jj, rv[jj], cv[jj], kjj);
189 
190 			if(j<i) // dgemm
191 				{
192 				kernel_dgemm_nt_4x4_vs_lib4(k0, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], 4, n-j);
193 				}
194 			else // dsyrk
195 				{
196 				kernel_dsyrk_nt_l_4x4_vs_lib4(k0, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], 4, n-j);
197 				}
198 			}
199 		}
200 	if(m>i)
201 		{
202 		goto left_4;
203 		}
204 #endif
205 
206 	// common return if i==m
207 	return;
208 
209 	// clean up loops definitions
210 
211 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
212 	left_8:
213 
214 	kii = cv[nv-1];
215 
216 	j = 0;
217 	jj = 0;
218 	for(; j<i && j<n-3; j+=4)
219 		{
220 
221 		while(jj<nv && rv[jj]<j+4)
222 			jj++;
223 		if(jj<nv)
224 			kjj = cv[jj];
225 		else
226 			kjj = cv[jj-1];
227 		k0 = kii<kjj ? kii : kjj;
228 
229 		kernel_dgemm_nt_8x4_vs_lib4(k0, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
230 		}
231 	if(j<n)
232 		{
233 
234 		while(jj<nv && rv[jj]<j+4)
235 			jj++;
236 		if(jj<nv)
237 			kjj = cv[jj];
238 		else
239 			kjj = cv[jj-1];
240 		k0 = kii<kjj ? kii : kjj;
241 
242 		if(j<i) // dgemm
243 			{
244 			kernel_dgemm_nt_8x4_vs_lib4(k0, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
245 			}
246 		else // dsyrk
247 			{
248 			kernel_dsyrk_nt_l_8x4_vs_lib4(k0, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
249 			if(j<n-4)
250 				{
251 				kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, n-j-4); // TODO
252 				}
253 			}
254 		}
255 	return;
256 #endif
257 
258 	left_4:
259 
260 	kii = cv[nv-1];
261 
262 	j = 0;
263 	jj = 0;
264 	for(; j<i && j<n-3; j+=4)
265 		{
266 
267 		while(jj<nv && rv[jj]<j+4)
268 			jj++;
269 		if(jj<nv)
270 			kjj = cv[jj];
271 		else
272 			kjj = cv[jj-1];
273 		k0 = kii<kjj ? kii : kjj;
274 
275 		kernel_dgemm_nt_4x4_vs_lib4(k0, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
276 		}
277 	if(j<n)
278 		{
279 
280 		while(jj<nv && rv[jj]<j+4)
281 			jj++;
282 		if(jj<nv)
283 			kjj = cv[jj];
284 		else
285 			kjj = cv[jj-1];
286 		k0 = kii<kjj ? kii : kjj;
287 
288 		if(j<i) // dgemm
289 			{
290 			kernel_dgemm_nt_4x4_vs_lib4(k0, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
291 			}
292 		else // dsyrk
293 			{
294 			kernel_dsyrk_nt_l_4x4_vs_lib4(k0, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
295 			}
296 		}
297 	return;
298 
299 	}
300 #endif
301 
302 
303 
304 /****************************
305 * new interface
306 ****************************/
307 
308 
309 
310 #if defined(LA_HIGH_PERFORMANCE)
311 
312 
313 
314 // dgemm nn
blasfeo_dgemm_nn(int m,int n,int k,double alpha,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dmat * sB,int bi,int bj,double beta,struct blasfeo_dmat * sC,int ci,int cj,struct blasfeo_dmat * sD,int di,int dj)315 void blasfeo_dgemm_nn(int m, int n, int k, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, double beta, struct blasfeo_dmat *sC, int ci, int cj, struct blasfeo_dmat *sD, int di, int dj)
316 	{
317 	if(m<=0 || n<=0)
318 		return;
319 
320 	// invalidate stored inverse diagonal of result matrix
321 	sD->use_dA = 0;
322 
323 	const int ps = 4;
324 
325 	int sda = sA->cn;
326 	int sdb = sB->cn;
327 	int sdc = sC->cn;
328 	int sdd = sD->cn;
329 
330 	int air = ai & (ps-1);
331 	int bir = bi & (ps-1);
332 
333 	// pA, pB point to panels edges
334 	double *pA = sA->pA + aj*ps + (ai-air)*sda;
335 	double *pB = sB->pA + bj*ps + (bi-bir)*sdb;
336 	double *pC = sC->pA + cj*ps;
337 	double *pD = sD->pA + dj*ps;
338 
339 	int offsetB = bir;
340 
341 	int ci0 = ci-air;
342 	int di0 = di-air;
343 	int offsetC;
344 	int offsetD;
345 	if(ci0>=0)
346 		{
347 		pC += ci0/ps*ps*sdd;
348 		offsetC = ci0%ps;
349 		}
350 	else
351 		{
352 		pC += -ps*sdc;
353 		offsetC = ps+ci0;
354 		}
355 
356 	if(di0>=0)
357 		{
358 		pD += di0/ps*ps*sdd;
359 		offsetD = di0%ps;
360 		}
361 	else
362 		{
363 		pD += -ps*sdd;
364 		offsetD = ps+di0;
365 		}
366 
367 	int i, j, l;
368 
369 
370 
371 	// algorithm scheme
372 	if(air!=0)
373 		{
374 		goto clear_air;
375 		}
376 select_loop:
377 	if(offsetC==0 & offsetD==0)
378 		{
379 		goto loop_00;
380 		}
381 	else
382 		{
383 		goto loop_CD;
384 		}
385 	// should never get here
386 	return;
387 
388 
389 
390 	// clean up at the beginning
391 clear_air:
392 #if defined(TARGET_X64_INTEL_HASWELL)
393 	if(air+m>8)
394 		{
395 		j = 0;
396 		for(; j<n; j+=4)
397 			{
398 			kernel_dgemm_nn_12x4_gen_lib4(k, &alpha, &pA[0], sda, offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps], sdc, offsetD, &pD[j*ps], sdd, air, air+m, 0, n-j);
399 			}
400 		m -= 3*ps-air;
401 		pA += 3*ps*sda;
402 		pC += 3*ps*sdc;
403 		pD += 3*ps*sdd;
404 		}
405 	else // air+m<=8
406 #endif
407 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
408 	if(air+m>4) // (m>5)
409 		{
410 		j = 0;
411 		for(; j<n; j+=4)
412 			{
413 			kernel_dgemm_nn_8x4_gen_lib4(k, &alpha, &pA[0], sda, offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps], sdc, offsetD, &pD[j*ps], sdd, air, air+m, 0, n-j);
414 			}
415 		m -= 2*ps-air;
416 		pA += 2*ps*sda;
417 		pC += 2*ps*sdc;
418 		pD += 2*ps*sdd;
419 		}
420 	else // air+m<=4 // m-i<=4
421 		{
422 #endif
423 		j = 0;
424 		for(; j<n; j+=4)
425 			{
426 			kernel_dgemm_nn_4x4_gen_lib4(k, &alpha, &pA[0], offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps], sdc, offsetD, &pD[j*ps], sdd, air, air+m, 0, n-j);
427 			}
428 		m -= 1*ps-air;
429 		pA += 1*ps*sda;
430 		pC += 1*ps*sdc;
431 		pD += 1*ps*sdd;
432 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
433 		// nothing more to do
434 		}
435 #endif
436 	goto select_loop;
437 
438 
439 
440 	// main loop aligned
441 loop_00:
442 	i = 0;
443 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
444 	for(; i<m-11; i+=12)
445 		{
446 		j = 0;
447 		for(; j<n-3; j+=4)
448 			{
449 			kernel_dgemm_nn_12x4_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
450 			}
451 		if(j<n)
452 			{
453 			kernel_dgemm_nn_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
454 			}
455 		}
456 	if(m>i)
457 		{
458 		if(m-i<=4)
459 			{
460 			goto left_4;
461 			}
462 		else if(m-i<=8)
463 			{
464 			goto left_8;
465 			}
466 		else
467 			{
468 			goto left_12;
469 			}
470 		}
471 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
472 	for(; i<m-12 | i==m-8; i+=8)
473 		{
474 		j = 0;
475 		for(; j<n-3; j+=4)
476 			{
477 			kernel_dgemm_nn_8x4_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
478 			}
479 		if(j<n)
480 			{
481 			kernel_dgemm_nn_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
482 			}
483 		}
484 	if(m>i)
485 		{
486 		if(m-i<=4)
487 			{
488 			goto left_4;
489 			}
490 		else if(m-i<=8)
491 			{
492 			goto left_8;
493 			}
494 		else
495 			{
496 			goto left_12;
497 			}
498 		}
499 #elif defined(TARGET_ARMV8A_ARM_CORTEX_A57)
500 	for(; i<m-7; i+=8)
501 		{
502 		j = 0;
503 		for(; j<n-3; j+=4)
504 			{
505 			kernel_dgemm_nn_8x4_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
506 			}
507 		if(j<n)
508 			{
509 			kernel_dgemm_nn_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
510 			}
511 		}
512 	if(m>i)
513 		{
514 		if(m-i<=4)
515 			{
516 			goto left_4;
517 			}
518 		else
519 			{
520 			goto left_8;
521 			}
522 		}
523 #elif defined(TARGET_X86_AMD_BARCELONA)
524 	for(; i<m-3; i+=4)
525 		{
526 		j = 0;
527 		for(; j<n-1; j+=2)
528 			{
529 			kernel_dgemm_nn_4x2_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
530 			}
531 		if(j<n)
532 			{
533 			kernel_dgemm_nn_4x2_vs_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
534 			}
535 		}
536 	if(m>i)
537 		{
538 		goto left_4;
539 		}
540 #else // all others
541 	for(; i<m-3; i+=4)
542 		{
543 		j = 0;
544 		for(; j<n-3; j+=4)
545 			{
546 			kernel_dgemm_nn_4x4_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
547 			}
548 		if(j<n)
549 			{
550 			kernel_dgemm_nn_4x4_vs_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
551 			}
552 		}
553 	if(m>i)
554 		{
555 		goto left_4;
556 		}
557 #endif
558 	// common return if i==m
559 	return;
560 
561 
562 
563 	// main loop C, D not aligned
564 loop_CD:
565 	i = 0;
566 #if defined(TARGET_X64_INTEL_HASWELL)
567 	for(; i<m-8; i+=12)
568 		{
569 		j = 0;
570 		for(; j<n; j+=4)
571 			{
572 			kernel_dgemm_nn_12x4_gen_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
573 			}
574 		}
575 	if(m>i)
576 		{
577 		if(m-i<=4)
578 			{
579 			goto left_4_g;
580 			}
581 		else
582 			{
583 			goto left_8_g;
584 			}
585 		}
586 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
587 	for(; i<m-4; i+=8)
588 		{
589 		j = 0;
590 		for(; j<n; j+=4)
591 			{
592 			kernel_dgemm_nn_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
593 			}
594 		}
595 	if(m>i)
596 		{
597 		goto left_4_g;
598 		}
599 #else
600 	for(; i<m; i+=4)
601 		{
602 		j = 0;
603 		for(; j<n; j+=4)
604 			{
605 			kernel_dgemm_nn_4x4_gen_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
606 			}
607 		}
608 #endif
609 	// common return if i==m
610 	return;
611 
612 
613 
614 	// clean up loops definitions
615 
616 #if defined(TARGET_X64_INTEL_HASWELL)
617 	left_12_g:
618 	j = 0;
619 	for(; j<n; j+=4)
620 		{
621 		kernel_dgemm_nn_12x4_gen_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
622 		}
623 	return;
624 #endif
625 
626 
627 
628 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
629 	left_12:
630 	j = 0;
631 	for(; j<n; j+=4)
632 		{
633 		kernel_dgemm_nn_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
634 		}
635 	return;
636 #endif
637 
638 
639 
640 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_X64_INTEL_HASWELL)
641 	left_8_g:
642 	j = 0;
643 	for(; j<n; j+=4)
644 		{
645 		kernel_dgemm_nn_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
646 		}
647 	return;
648 #endif
649 
650 
651 
652 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
653 	left_8:
654 	j = 0;
655 	for(; j<n; j+=4)
656 		{
657 		kernel_dgemm_nn_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
658 		}
659 	return;
660 #endif
661 
662 
663 
664 	left_4_g:
665 	j = 0;
666 	for(; j<n; j+=4)
667 		{
668 		kernel_dgemm_nn_4x4_gen_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
669 		}
670 	return;
671 
672 
673 
674 #if defined(TARGET_X64_INTEL_HASWELL)
675 	left_4:
676 	j = 0;
677 	for(; j<n-8; j+=12)
678 		{
679 		kernel_dgemm_nn_4x12_vs_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
680 		}
681 	if(j<n-4)
682 		{
683 		kernel_dgemm_nn_4x8_vs_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
684 		}
685 	else if(j<n)
686 		{
687 		kernel_dgemm_nn_4x4_vs_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
688 		}
689 	return;
690 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
691 	left_4:
692 	j = 0;
693 	for(; j<n-4; j+=8)
694 		{
695 		kernel_dgemm_nn_4x8_vs_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
696 		}
697 	if(j<n)
698 		{
699 		kernel_dgemm_nn_4x4_vs_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
700 		}
701 	return;
702 #elif defined(TARGET_X86_AMD_BARCELONA)
703 	left_4:
704 	j = 0;
705 	for(; j<n; j+=2)
706 		{
707 		kernel_dgemm_nn_4x2_vs_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
708 		}
709 	return;
710 #else // all others
711 	left_4:
712 	j = 0;
713 	for(; j<n; j+=4)
714 		{
715 		kernel_dgemm_nn_4x4_vs_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
716 		}
717 	return;
718 #endif
719 
720 	return;
721 
722 	}
723 
724 
725 
726 // dgemm nt
blasfeo_dgemm_nt(int m,int n,int k,double alpha,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dmat * sB,int bi,int bj,double beta,struct blasfeo_dmat * sC,int ci,int cj,struct blasfeo_dmat * sD,int di,int dj)727 void blasfeo_dgemm_nt(int m, int n, int k, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, double beta, struct blasfeo_dmat *sC, int ci, int cj, struct blasfeo_dmat *sD, int di, int dj)
728 	{
729 	if(m<=0 | n<=0)
730 		return;
731 
732 	// invalidate stored inverse diagonal of result matrix
733 	sD->use_dA = 0;
734 
735 	const int ps = 4;
736 
737 	int sda = sA->cn;
738 	int sdb = sB->cn;
739 	int sdc = sC->cn;
740 	int sdd = sD->cn;
741 	int air = ai & (ps-1);
742 	int bir = bi & (ps-1);
743 	double *pA = sA->pA + aj*ps + (ai-air)*sda;
744 	double *pB = sB->pA + bj*ps + (bi-bir)*sdb;
745 	double *pC = sC->pA + cj*ps;
746 	double *pD = sD->pA + dj*ps;
747 
748 	int ci0 = ci-air;
749 	int di0 = di-air;
750 	int offsetC;
751 	int offsetD;
752 	if(ci0>=0)
753 		{
754 		pC += ci0/ps*ps*sdd;
755 		offsetC = ci0%ps;
756 		}
757 	else
758 		{
759 		pC += -4*sdc;
760 		offsetC = ps+ci0;
761 		}
762 	if(di0>=0)
763 		{
764 		pD += di0/ps*ps*sdd;
765 		offsetD = di0%ps;
766 		}
767 	else
768 		{
769 		pD += -4*sdd;
770 		offsetD = ps+di0;
771 		}
772 
773 	int i, j;
774 
775 	int idxB;
776 
777 
778 
779 
780 	// algorithm scheme
781 	if(air!=0)
782 		{
783 		goto clear_air;
784 		// TODO instaed use buffer to align A !!!
785 		}
786 select_loop:
787 	if(offsetC==0 & offsetD==0)
788 		{
789 		goto loop_00;
790 		}
791 	else
792 		{
793 		goto loop_CD;
794 		}
795 	// should never get here
796 	return;
797 
798 
799 
800 	// clean up at the beginning
801 clear_air:
802 #if defined(TARGET_X64_INTEL_HASWELL)
803 	if(air+m>8)
804 		{
805 		j = 0;
806 		idxB = 0;
807 		// clean up at the beginning
808 		if(bir!=0)
809 			{
810 			kernel_dgemm_nt_12x4_gen_lib4(k, &alpha, &pA[0], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps]-bir*ps, sdc, offsetD, &pD[j*ps]-bir*ps, sdd, air, air+m, bir, bir+n-j);
811 			j += ps-bir;
812 			idxB += 4;
813 			}
814 		// main loop
815 		for(; j<n; j+=4, idxB+=4)
816 			{
817 			kernel_dgemm_nt_12x4_gen_lib4(k, &alpha, &pA[0], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps], sdc, offsetD, &pD[j*ps], sdd, air, air+m, 0, n-j);
818 			}
819 		m -= 3*ps-air;
820 		pA += 3*ps*sda;
821 		pC += 3*ps*sdc;
822 		pD += 3*ps*sdd;
823 		}
824 	else // air+m<=8
825 #endif
826 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
827 	if(air+m>4) // (m>5)
828 		{
829 		j = 0;
830 		idxB = 0;
831 		// clean up at the beginning
832 		if(bir!=0)
833 			{
834 			kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[0], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps]-bir*ps, sdc, offsetD, &pD[j*ps]-bir*ps, sdd, air, air+m, bir, bir+n-j);
835 			j += ps-bir;
836 			idxB += 4;
837 			}
838 		// main loop
839 		for(; j<n; j+=4, idxB+=4)
840 			{
841 			kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[0], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps], sdc, offsetD, &pD[j*ps], sdd, air, air+m, 0, n-j);
842 			}
843 		m -= 2*ps-air;
844 		pA += 2*ps*sda;
845 		pC += 2*ps*sdc;
846 		pD += 2*ps*sdd;
847 		}
848 	else // m<=4
849 		{
850 #endif
851 		j = 0;
852 		idxB = 0;
853 		// clean up at the beginning
854 		if(bir!=0)
855 			{
856 			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[0], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps]-bir*ps, sdc, offsetD, &pD[j*ps]-bir*ps, sdd, air, air+m, bir, bir+n-j);
857 			j += ps-bir;
858 			idxB += 4;
859 			}
860 		// main loop
861 		for(; j<n; j+=4, idxB+=4)
862 			{
863 			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[0], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps], sdc, offsetD, &pD[j*ps], sdd, air, air+m, 0, n-j);
864 			}
865 		m -= ps-air;
866 		pA += ps*sda;
867 		pC += ps*sdc;
868 		pD += ps*sdd;
869 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
870 		// nothing more to do
871 		}
872 #endif
873 	goto select_loop;
874 
875 
876 
877 	// main loop aligned
878 loop_00:
879 	i = 0;
880 #if defined(TARGET_X64_INTEL_HASWELL)
881 	for(; i<m-11; i+=12)
882 		{
883 		j = 0;
884 		idxB = 0;
885 		// clean up at the beginning
886 		if(bir!=0)
887 			{
888 			kernel_dgemm_nt_12x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, 0, &pC[j*ps+i*sdc]-bir*ps, sdc, 0, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+n-j);
889 			j += ps-bir;
890 			idxB += 4;
891 			}
892 		// main loop
893 		for(; j<n-3; j+=4, idxB+=4)
894 			{
895 			kernel_dgemm_nt_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
896 			}
897 		if(j<n)
898 			{
899 			kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
900 			}
901 		}
902 	if(m>i)
903 		{
904 		if(m-i<=4)
905 			{
906 			goto left_4;
907 			}
908 		else if(m-i<=8)
909 			{
910 			goto left_8;
911 			}
912 		else
913 			{
914 			goto left_12;
915 			}
916 		}
917 #elif defined(TARGET_ARMV8A_ARM_CORTEX_A53)
918 	for(; i<m-11; i+=12)
919 		{
920 		j = 0;
921 		idxB = 0;
922 		// clean up at the beginning
923 		if(bir!=0)
924 			{
925 			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[(i+0)*sda], &pB[idxB*sdb], &beta, 0, &pC[j*ps+(i+0)*sdc]-bir*ps, sdc, 0, &pD[j*ps+(i+0)*sdd]-bir*ps, sdd, 0, m-(i+0), bir, bir+n-j);
926 			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[(i+4)*sda], &pB[idxB*sdb], &beta, 0, &pC[j*ps+(i+4)*sdc]-bir*ps, sdc, 0, &pD[j*ps+(i+4)*sdd]-bir*ps, sdd, 0, m-(i+4), bir, bir+n-j);
927 			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[(i+8)*sda], &pB[idxB*sdb], &beta, 0, &pC[j*ps+(i+8)*sdc]-bir*ps, sdc, 0, &pD[j*ps+(i+8)*sdd]-bir*ps, sdd, 0, m-(i+8), bir, bir+n-j);
928 			j += ps-bir;
929 			idxB += 4;
930 			}
931 		// main loop
932 		for(; j<n-3; j+=4, idxB+=4)
933 			{
934 			kernel_dgemm_nt_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
935 			}
936 		if(j<n)
937 			{
938 			kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
939 			}
940 		}
941 	if(m>i)
942 		{
943 		if(m-i<=4)
944 			{
945 			goto left_4;
946 			}
947 		else if(m-i<=8)
948 			{
949 			goto left_8;
950 			}
951 		else
952 			{
953 			goto left_12;
954 			}
955 		}
956 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57)
957 	for(; i<m-7; i+=8)
958 		{
959 		j = 0;
960 		idxB = 0;
961 		// clean up at the beginning
962 		if(bir!=0)
963 			{
964 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
965 			kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+n-j);
966 #else
967 			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[(i+0)*sda], &pB[idxB*sdb], &beta, 0, &pC[j*ps+(i+0)*sdc]-bir*ps, sdc, 0, &pD[j*ps+(i+0)*sdd]-bir*ps, sdd, 0, m-(i+0), bir, bir+n-j);
968 			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[(i+4)*sda], &pB[idxB*sdb], &beta, 0, &pC[j*ps+(i+4)*sdc]-bir*ps, sdc, 0, &pD[j*ps+(i+4)*sdd]-bir*ps, sdd, 0, m-(i+4), bir, bir+n-j);
969 #endif
970 			j += ps-bir;
971 			idxB += 4;
972 			}
973 		// main loop
974 		for(; j<n-3; j+=4, idxB+=4)
975 			{
976 			kernel_dgemm_nt_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
977 			}
978 		if(j<n)
979 			{
980 			kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
981 			}
982 		}
983 	if(m>i)
984 		{
985 		if(m-i<=4)
986 			{
987 			goto left_4;
988 			}
989 		else
990 			{
991 			goto left_8;
992 			}
993 		}
994 #elif defined(TARGET_X86_AMD_BARCELONA)
995 	for(; i<m-3; i+=4)
996 		{
997 		j = 0;
998 		idxB = 0;
999 		// clean up at the beginning
1000 		if(bir!=0)
1001 			{
1002 			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+n-j);
1003 			j += ps-bir;
1004 			idxB += 4;
1005 			}
1006 		// main loop
1007 		for(; j<n-3; j+=4, idxB+=4)
1008 			{
1009 			kernel_dgemm_nt_4x2_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb+0], &beta, &pC[(j+0)*ps+i*sdc], &pD[(j+0)*ps+i*sdd]);
1010 			kernel_dgemm_nt_4x2_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb+2], &beta, &pC[(j+2)*ps+i*sdc], &pD[(j+2)*ps+i*sdd]);
1011 			}
1012 		if(j<n-2)
1013 			{
1014 			kernel_dgemm_nt_4x2_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb+0], &beta, &pC[(j+0)*ps+i*sdc], &pD[(j+0)*ps+i*sdd]);
1015 			kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb+2], &beta, &pC[(j+2)*ps+i*sdc], &pD[(j+2)*ps+i*sdd], m-i, n-j-2);
1016 			}
1017 		else if(j<n)
1018 			{
1019 			kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
1020 			}
1021 		}
1022 	if(m>i)
1023 		{
1024 		goto left_4;
1025 		}
1026 #else
1027 	for(; i<m-3; i+=4)
1028 		{
1029 		j = 0;
1030 		idxB = 0;
1031 		// clean up at the beginning
1032 		if(bir!=0)
1033 			{
1034 			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+n-j);
1035 			j += ps-bir;
1036 			idxB += 4;
1037 			}
1038 		// main loop
1039 		for(; j<n-3; j+=4, idxB+=4)
1040 			{
1041 			kernel_dgemm_nt_4x4_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
1042 			}
1043 		if(j<n)
1044 			{
1045 			kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
1046 			}
1047 		}
1048 	if(m>i)
1049 		{
1050 		goto left_4;
1051 		}
1052 #endif
1053 	// common return if i==m
1054 	return;
1055 
1056 
1057 
1058 	// main loop C, D not aligned
1059 loop_CD:
1060 	i = 0;
1061 #if defined(TARGET_X64_INTEL_HASWELL)
1062 	for(; i<m-8; i+=12)
1063 		{
1064 		j = 0;
1065 		idxB = 0;
1066 		// clean up at the beginning
1067 		if(bir!=0)
1068 			{
1069 			kernel_dgemm_nt_12x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+n-j);
1070 			j += ps-bir;
1071 			idxB += 4;
1072 			}
1073 		// main loop
1074 		for(; j<n; j+=4, idxB+=4)
1075 			{
1076 			kernel_dgemm_nt_12x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
1077 			}
1078 		}
1079 	if(m>i)
1080 		{
1081 		if(m-i<=4)
1082 			{
1083 			goto left_4_g;
1084 			}
1085 		else
1086 			{
1087 			goto left_8_g;
1088 			}
1089 		}
1090 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
1091 	for(; i<m-4; i+=8)
1092 		{
1093 		j = 0;
1094 		idxB = 0;
1095 		// clean up at the beginning
1096 		if(bir!=0)
1097 			{
1098 			kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+n-j);
1099 			j += ps-bir;
1100 			idxB += 4;
1101 			}
1102 		// main loop
1103 		for(; j<n; j+=4, idxB+=4)
1104 			{
1105 			kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
1106 			}
1107 		}
1108 	if(m>i)
1109 		{
1110 		goto left_4_g;
1111 		}
1112 #else
1113 	for(; i<m; i+=4)
1114 		{
1115 		j = 0;
1116 		idxB = 0;
1117 		// clean up at the beginning
1118 		if(bir!=0)
1119 			{
1120 			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+n-j);
1121 			j += ps-bir;
1122 			idxB += 4;
1123 			}
1124 		// main loop
1125 		for(; j<n; j+=4, idxB+=4)
1126 			{
1127 			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
1128 			}
1129 		}
1130 #endif
1131 	// common return if i==m
1132 	return;
1133 
1134 
1135 
1136 	// clean up loops definitions
1137 
1138 #if defined(TARGET_X64_INTEL_HASWELL)
1139 	left_12:
1140 	j = 0;
1141 	idxB = 0;
1142 	// clean up at the beginning
1143 	if(bir!=0)
1144 		{
1145 		kernel_dgemm_nt_12x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+n-j);
1146 		j += ps-bir;
1147 		idxB += 4;
1148 		}
1149 	// main loop
1150 	for(; j<n; j+=4, idxB+=4)
1151 		{
1152 		kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
1153 		}
1154 	return;
1155 #elif defined(TARGET_ARMV8A_ARM_CORTEX_A53)
1156 	left_12:
1157 	j = 0;
1158 	idxB = 0;
1159 	// clean up at the beginning
1160 	if(bir!=0)
1161 		{
1162 		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[(i+0)*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+(i+0)*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+(i+0)*sdd]-bir*ps, sdd, 0, m-(i+0), bir, bir+n-j);
1163 		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[(i+4)*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+(i+4)*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+(i+4)*sdd]-bir*ps, sdd, 0, m-(i+4), bir, bir+n-j);
1164 		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[(i+8)*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+(i+8)*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+(i+8)*sdd]-bir*ps, sdd, 0, m-(i+8), bir, bir+n-j);
1165 		j += ps-bir;
1166 		idxB += 4;
1167 		}
1168 	// main loop
1169 	for(; j<n; j+=4, idxB+=4)
1170 		{
1171 		kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
1172 		}
1173 	return;
1174 #endif
1175 
1176 
1177 
1178 #if defined(TARGET_X64_INTEL_HASWELL)
1179 	left_8:
1180 	j = 0;
1181 	idxB = 0;
1182 	// clean up at the beginning
1183 	if(bir!=0)
1184 		{
1185 		kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+n-j);
1186 		j += ps-bir;
1187 		idxB += 4;
1188 		}
1189 	// main loop
1190 	for(; j<n-8; j+=12, idxB+=12)
1191 		{
1192 		kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
1193 		kernel_dgemm_nt_8x8u_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[(idxB+4)*sdb], sdb, &beta, &pC[(j+4)*ps+i*sdc], sdc, &pD[(j+4)*ps+i*sdd], sdd, m-i, n-(j+4));
1194 		}
1195 	if(j<n)
1196 		{
1197 		if(n-j<=4)
1198 			{
1199 			kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
1200 			}
1201 		else
1202 			{
1203 			kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
1204 			kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[(idxB+4)*sdb], &beta, &pC[(j+4)*ps+i*sdc], &pD[(j+4)*ps+i*sdd], m-i, n-(j+4));
1205 			}
1206 		}
1207 	return;
1208 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
1209 	left_8:
1210 	j = 0;
1211 	idxB = 0;
1212 	// clean up at the beginning
1213 	if(bir!=0)
1214 		{
1215 		kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+n-j);
1216 		j += ps-bir;
1217 		idxB += 4;
1218 		}
1219 	// main loop
1220 	for(; j<n; j+=4, idxB+=4)
1221 		{
1222 		kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
1223 		}
1224 	return;
1225 #elif defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
1226 	left_8:
1227 	j = 0;
1228 	idxB = 0;
1229 	// clean up at the beginning
1230 	if(bir!=0)
1231 		{
1232 		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[(i+0)*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+(i+0)*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+(i+0)*sdd]-bir*ps, sdd, 0, m-(i+0), bir, bir+n-j);
1233 		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[(i+4)*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+(i+4)*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+(i+4)*sdd]-bir*ps, sdd, 0, m-(i+4), bir, bir+n-j);
1234 		j += ps-bir;
1235 		idxB += 4;
1236 		}
1237 	// main loop
1238 	for(; j<n; j+=4, idxB+=4)
1239 		{
1240 		kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
1241 		}
1242 	return;
1243 #endif
1244 
1245 
1246 
1247 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
1248 	left_8_g:
1249 	j = 0;
1250 	idxB = 0;
1251 	// clean up at the beginning
1252 	if(bir!=0)
1253 		{
1254 		kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+n-j);
1255 		j += ps-bir;
1256 		idxB += 4;
1257 		}
1258 	// main loop
1259 	for(; j<n; j+=4, idxB+=4)
1260 		{
1261 		kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
1262 		}
1263 	return;
1264 #endif
1265 
1266 
1267 
1268 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
1269 	left_4:
1270 	j = 0;
1271 	idxB = 0;
1272 	// clean up at the beginning
1273 	if(bir!=0)
1274 		{
1275 		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+n-j);
1276 		j += ps-bir;
1277 		idxB += 4;
1278 		}
1279 	// main loop
1280 	for(; j<n-8; j+=12, idxB+=12)
1281 		{
1282 		kernel_dgemm_nt_4x12_vs_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
1283 		}
1284 	if(j<n)
1285 		{
1286 		if(n-j<=4)
1287 			{
1288 			kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
1289 			}
1290 		else
1291 			{
1292 			kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
1293 			}
1294 		}
1295 	return;
1296 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
1297 	left_4:
1298 	j = 0;
1299 	idxB = 0;
1300 	// clean up at the beginning
1301 	if(bir!=0)
1302 		{
1303 		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+n-j);
1304 		j += ps-bir;
1305 		idxB += 4;
1306 		}
1307 	// main loop
1308 	for(; j<n-4; j+=8, idxB+=8)
1309 		{
1310 		kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
1311 		}
1312 	if(j<n)
1313 		{
1314 		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
1315 		}
1316 	return;
1317 #elif defined(TARGET_X86_AMD_BARCELONA)
1318 	left_4:
1319 	j = 0;
1320 	idxB = 0;
1321 	// clean up at the beginning
1322 	if(bir!=0)
1323 		{
1324 		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+n-j);
1325 		j += ps-bir;
1326 		idxB += 4;
1327 		}
1328 	// main loop
1329 	for(; j<n-2; j+=4, idxB+=4)
1330 		{
1331 		kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb+0], &beta, &pC[(j+0)*ps+i*sdc], &pD[(j+0)*ps+i*sdd], m-i, n-j-0);
1332 		kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb+2], &beta, &pC[(j+2)*ps+i*sdc], &pD[(j+2)*ps+i*sdd], m-i, n-j-2);
1333 		}
1334 	if(j<n)
1335 		{
1336 		kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
1337 		}
1338 	return;
1339 #else
1340 	left_4:
1341 	j = 0;
1342 	idxB = 0;
1343 	// clean up at the beginning
1344 	if(bir!=0)
1345 		{
1346 		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+n-j);
1347 		j += ps-bir;
1348 		idxB += 4;
1349 		}
1350 	// main loop
1351 	for(; j<n; j+=4, idxB+=4)
1352 		{
1353 		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
1354 		}
1355 	return;
1356 #endif
1357 
1358 
1359 
1360 	left_4_g:
1361 	j = 0;
1362 	idxB = 0;
1363 	// clean up at the beginning
1364 	if(bir!=0)
1365 		{
1366 		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+n-j);
1367 		j += ps-bir;
1368 		idxB += 4;
1369 		}
1370 	// main loop
1371 	for(; j<n; j+=4, idxB+=4)
1372 		{
1373 		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
1374 		}
1375 	return;
1376 
1377 	}
1378 
1379 
1380 
1381 // dgemm_tn
blasfeo_dgemm_tn(int m,int n,int k,double alpha,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dmat * sB,int bi,int bj,double beta,struct blasfeo_dmat * sC,int ci,int cj,struct blasfeo_dmat * sD,int di,int dj)1382 void blasfeo_dgemm_tn(int m, int n, int k, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, double beta, struct blasfeo_dmat *sC, int ci, int cj, struct blasfeo_dmat *sD, int di, int dj)
1383 	{
1384 	if(m<=0 || n<=0)
1385 		return;
1386 
1387 	// invalidate stored inverse diagonal of result matrix
1388 	sD->use_dA = 0;
1389 
1390 	const int ps = 4;
1391 
1392 	int sda = sA->cn;
1393 	int sdb = sB->cn;
1394 	int sdc = sC->cn;
1395 	int sdd = sD->cn;
1396 
1397 	int air = ai & (ps-1);
1398 	int bir = bi & (ps-1);
1399 	int cir = ci & (ps-1);
1400 	int dir = di & (ps-1);
1401 
1402 	double *pA = sA->pA + aj*ps + (ai-air)*sda;
1403 	double *pB = sB->pA + bj*ps + (bi-bir)*sdb;
1404 	double *pC = sC->pA + cj*ps + (ci-cir)*sdc;
1405 	double *pD = sD->pA + dj*ps + (di-dir)*sdd;
1406 
1407 	int offsetA = air;
1408 	int offsetB = bir;
1409 	int offsetC = cir;
1410 	int offsetD = dir;
1411 
1412 // TODO visual studio alignment
1413 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
1414 	ALIGNED( double pU0[3*4*K_MAX_STACK], 64 );
1415 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57)
1416 	ALIGNED( double pU0[2*4*K_MAX_STACK], 64 );
1417 #elif defined(TARGET_GENERIC)
1418 	double pU0[1*4*K_MAX_STACK];
1419 #else
1420 	ALIGNED( double pU0[1*4*K_MAX_STACK], 64 );
1421 #endif
1422 	int sdu0 = (k+3)/4*4;
1423 	sdu0 = sdu0<K_MAX_STACK ? sdu0 : K_MAX_STACK;
1424 
1425 	struct blasfeo_dmat sAt;
1426 	int sAt_size;
1427 	void *mem;
1428 	char *mem_align;
1429 
1430 	double *pU;
1431 	int sdu;
1432 
1433 	int ii, jj;
1434 
1435 	if(k>K_MAX_STACK)
1436 		{
1437 		sAt_size = blasfeo_memsize_dmat(12, k);
1438 		mem = malloc(sAt_size+64);
1439 		blasfeo_align_64_byte(mem, (void **) &mem_align);
1440 		blasfeo_create_dmat(12, k, &sAt, (void *) mem_align);
1441 		pU = sAt.pA;
1442 		sdu = sAt.cn;
1443 		}
1444 	else
1445 		{
1446 		pU = pU0;
1447 		sdu = sdu0;
1448 		}
1449 
1450 
1451 	// algorithm scheme
1452 	if(offsetC==0 & offsetD==0)
1453 		{
1454 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_X64_INTEL_CORE) | defined(TARGET_GENERIC)
1455 		if(m<=n)
1456 			{
1457 			goto loop_00_m0; // transpose A
1458 			}
1459 		else
1460 			{
1461 			goto loop_00_n0; // transpose B
1462 			}
1463 #else
1464 		goto loop_00_m0; // transpose A
1465 #endif
1466 		}
1467 	else
1468 		{
1469 		goto loop_CD_m0;
1470 		}
1471 	// should never get here
1472 	return;
1473 
1474 
1475 
1476 loop_00_m0:
1477 	ii = 0;
1478 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
1479 	for(; ii<m-11; ii+=12)
1480 		{
1481 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pU+0*sdu);
1482 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pU+4*sdu);
1483 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+8)*ps, sda, pU+8*sdu);
1484 		for(jj=0; jj<n-3; jj+=4)
1485 			{
1486 			kernel_dgemm_nn_12x4_lib4(k, &alpha, pU, sdu, offsetB, pB+jj*ps, sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd);
1487 			}
1488 		if(jj<n)
1489 			{
1490 			kernel_dgemm_nn_12x4_vs_lib4(k, &alpha, pU, sdu, offsetB, pB+jj*ps, sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
1491 			}
1492 		}
1493 	if(ii<m)
1494 		{
1495 		if(m-ii<=4)
1496 			{
1497 			goto left_4_m0;
1498 			}
1499 		if(m-ii<=8)
1500 			{
1501 			goto left_8_m0;
1502 			}
1503 		else
1504 			{
1505 			goto left_12_m0;
1506 			}
1507 		}
1508 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57)
1509 	for(; ii<m-7; ii+=8)
1510 		{
1511 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pU+0*sdu);
1512 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pU+4*sdu);
1513 		for(jj=0; jj<n-3; jj+=4)
1514 			{
1515 			kernel_dgemm_nn_8x4_lib4(k, &alpha, pU, sdu, offsetB, pB+jj*ps, sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd);
1516 			}
1517 		if(jj<n)
1518 			{
1519 			kernel_dgemm_nn_8x4_vs_lib4(k, &alpha, pU, sdu, offsetB, pB+jj*ps, sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
1520 			}
1521 		}
1522 	if(ii<m)
1523 		{
1524 		if(m-ii<=4)
1525 			{
1526 			goto left_4_m0;
1527 			}
1528 		else
1529 			{
1530 			goto left_8_m0;
1531 			}
1532 		}
1533 #else
1534 	for(; ii<m-3; ii+=4)
1535 		{
1536 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pU);
1537 		for(jj=0; jj<n-3; jj+=4)
1538 			{
1539 			kernel_dgemm_nn_4x4_lib4(k, &alpha, pU, offsetB, pB+jj*ps, sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps);
1540 			}
1541 		if(jj<n)
1542 			{
1543 			kernel_dgemm_nn_4x4_vs_lib4(k, &alpha, pU, offsetB, pB+jj*ps, sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
1544 			}
1545 		}
1546 	if(ii<m)
1547 		{
1548 		goto left_4_m0;
1549 		}
1550 #endif
1551 	goto tn_return;
1552 
1553 
1554 
1555 	// non-malloc algorith, C, D not aligned
1556 loop_CD_m0:
1557 	ii = 0;
1558 	// clean up loops definitions
1559 #if defined(TARGET_X64_INTEL_HASWELL)
1560 	for(; ii<m-8; ii+=12)
1561 		{
1562 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pU+0*sdu);
1563 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pU+4*sdu);
1564 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+8)*ps, sda, pU+8*sdu);
1565 		for(jj=0; jj<n; jj+=4)
1566 			{
1567 			kernel_dgemm_nn_12x4_gen_lib4(k, &alpha, pU, sdu, offsetB, pB+jj*ps, sdb, &beta, offsetC, pC+ii*sdc+jj*ps, sdc, offsetD, pD+ii*sdd+jj*ps, sdd, 0, m-ii, 0, n-jj);
1568 			}
1569 		}
1570 	if(ii<m)
1571 		{
1572 		if(m-ii<=4)
1573 			{
1574 			goto left_4_m0_g;
1575 			}
1576 		else
1577 			{
1578 			goto left_8_m0_g;
1579 			}
1580 		}
1581 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
1582 	for(; ii<m-4; ii+=8)
1583 		{
1584 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pU+0*sdu);
1585 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pU+4*sdu);
1586 		for(jj=0; jj<n; jj+=4)
1587 			{
1588 			kernel_dgemm_nn_8x4_gen_lib4(k, &alpha, pU, sdu, offsetB, pB+jj*ps, sdb, &beta, offsetC, pC+ii*sdc+jj*ps, sdc, offsetD, pD+ii*sdd+jj*ps, sdd, 0, m-ii, 0, n-jj);
1589 			}
1590 		}
1591 	if(ii<m)
1592 		{
1593 		goto left_4_m0_g;
1594 		}
1595 #else
1596 	for(; ii<m; ii+=4)
1597 		{
1598 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pU);
1599 		for(jj=0; jj<n; jj+=4)
1600 			{
1601 			kernel_dgemm_nn_4x4_gen_lib4(k, &alpha, pU, offsetB, pB+jj*ps, sdb, &beta, offsetC, pC+ii*sdc+jj*ps, sdc, offsetD, pD+ii*sdd+jj*ps, sdd, 0, m-ii, 0, n-jj);
1602 			}
1603 		}
1604 #endif
1605 	// common return if i==m
1606 	goto tn_return;
1607 
1608 
1609 
1610 loop_00_n0:
1611 	jj = 0;
1612 #if defined(TARGET_X64_INTEL_HASWELL)
1613 	for(; jj<n-11; jj+=12)
1614 		{
1615 		kernel_dpacp_tn_4_lib4(k, offsetB, pB+(jj+0)*ps, sdb, pU+0*sdu);
1616 		kernel_dpacp_tn_4_lib4(k, offsetB, pB+(jj+4)*ps, sdb, pU+4*sdu);
1617 		kernel_dpacp_tn_4_lib4(k, offsetB, pB+(jj+8)*ps, sdb, pU+8*sdu);
1618 		for(ii=0; ii<m-3; ii+=4)
1619 			{
1620 			kernel_dgemm_tt_4x12_lib4(k, &alpha, offsetA, pA+ii*ps, sda, pU, sdu, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps);
1621 			}
1622 		if(ii<m)
1623 			{
1624 			kernel_dgemm_tt_4x12_vs_lib4(k, &alpha, offsetA, pA+ii*ps, sda, pU, sdu, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
1625 			}
1626 		}
1627 	if(jj<n)
1628 		{
1629 		if(n-jj<=4)
1630 			{
1631 			goto left_4_n0;
1632 			}
1633 		if(n-jj<=8)
1634 			{
1635 			goto left_8_n0;
1636 			}
1637 		else
1638 			{
1639 			goto left_12_n0;
1640 			}
1641 		}
1642 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
1643 	for(; jj<n-7; jj+=8)
1644 		{
1645 		kernel_dpacp_tn_4_lib4(k, offsetB, pB+(jj+0)*ps, sdb, pU+0*sdu);
1646 		kernel_dpacp_tn_4_lib4(k, offsetB, pB+(jj+4)*ps, sdb, pU+4*sdu);
1647 		for(ii=0; ii<m-3; ii+=4)
1648 			{
1649 			kernel_dgemm_tt_4x8_lib4(k, &alpha, offsetA, pA+ii*ps, sda, pU, sdu, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps);
1650 			}
1651 		if(ii<m)
1652 			{
1653 			kernel_dgemm_tt_4x8_vs_lib4(k, &alpha, offsetA, pA+ii*ps, sda, pU, sdu, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
1654 			}
1655 		}
1656 	if(jj<n)
1657 		{
1658 		if(n-jj<=4)
1659 			{
1660 			goto left_4_n0;
1661 			}
1662 		else
1663 			{
1664 			goto left_8_n0;
1665 			}
1666 		}
1667 #elif defined(TARGET_GENERIC) | defined(TARGET_X64_INTEL_CORE)
1668 	for(; jj<n-3; jj+=4)
1669 		{
1670 		kernel_dpacp_tn_4_lib4(k, offsetB, pB+jj*ps, sdb, pU);
1671 		for(ii=0; ii<m-3; ii+=4)
1672 			{
1673 			kernel_dgemm_tt_4x4_lib4(k, &alpha, offsetA, pA+ii*ps, sda, pU, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps);
1674 			}
1675 		if(ii<m)
1676 			{
1677 			kernel_dgemm_tt_4x4_vs_lib4(k, &alpha, offsetA, pA+ii*ps, sda, pU, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
1678 			}
1679 		}
1680 	if(jj<n)
1681 		{
1682 		goto left_4_n0;
1683 		}
1684 #endif
1685 	// common return if n==n
1686 	goto tn_return;
1687 
1688 
1689 
1690 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
1691 left_12_m0:
1692 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pU+0*sdu);
1693 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pU+4*sdu);
1694 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+8)*ps, sda, pU+8*sdu);
1695 	for(jj=0; jj<n; jj+=4)
1696 		{
1697 		kernel_dgemm_nn_12x4_vs_lib4(k, &alpha, pU, sdu, offsetB, pB+jj*ps, sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
1698 		}
1699 	goto tn_return;
1700 #endif
1701 
1702 
1703 
1704 #if defined(TARGET_X64_INTEL_HASWELL)
1705 left_12_n0:
1706 	kernel_dpacp_tn_4_lib4(k, offsetB, pB+(jj+0)*ps, sdb, pU+0*sdu);
1707 	kernel_dpacp_tn_4_lib4(k, offsetB, pB+(jj+4)*ps, sdb, pU+4*sdu);
1708 	kernel_dpacp_tn_4_lib4(k, offsetB, pB+(jj+8)*ps, sdb, pU+8*sdu);
1709 	for(ii=0; ii<m; ii+=4)
1710 		{
1711 		kernel_dgemm_tt_4x12_vs_lib4(k, &alpha, offsetA, pA+ii*ps, sda, pU, sdu, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
1712 		}
1713 	goto tn_return;
1714 #endif
1715 
1716 
1717 
1718 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
1719 left_8_m0:
1720 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pU+0*sdu);
1721 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pU+4*sdu);
1722 	for(jj=0; jj<n; jj+=4)
1723 		{
1724 		kernel_dgemm_nn_8x4_vs_lib4(k, &alpha, pU, sdu, offsetB, pB+jj*ps, sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
1725 		}
1726 	goto tn_return;
1727 #endif
1728 
1729 
1730 
1731 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE)
1732 left_8_m0_g:
1733 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pU+0*sdu);
1734 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pU+4*sdu);
1735 	for(jj=0; jj<n; jj+=4)
1736 		{
1737 		kernel_dgemm_nn_8x4_gen_lib4(k, &alpha, pU, sdu, offsetB, pB+jj*ps, sdb, &beta, offsetC, pC+ii*sdc+jj*ps, sdc, offsetD, pD+ii*sdd+jj*ps, sdd, 0, m-ii, 0, n-jj);
1738 		}
1739 	goto tn_return;
1740 #endif
1741 
1742 
1743 
1744 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE)
1745 left_8_n0:
1746 	kernel_dpacp_tn_4_lib4(k, offsetB, pB+(jj+0)*ps, sdb, pU+0*sdu);
1747 	kernel_dpacp_tn_4_lib4(k, offsetB, pB+(jj+4)*ps, sdb, pU+4*sdu);
1748 	for(ii=0; ii<m; ii+=4)
1749 		{
1750 		kernel_dgemm_tt_4x8_vs_lib4(k, &alpha, offsetA, pA+ii*ps, sda, pU, sdu, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
1751 		}
1752 	goto tn_return;
1753 #endif
1754 
1755 
1756 
1757 #if defined(TARGET_X64_INTEL_HASWELL)
1758 left_4_m0:
1759 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pU);
1760 	for(jj=0; jj<n-8; jj+=12)
1761 		{
1762 		kernel_dgemm_nn_4x12_vs_lib4(k, &alpha, pU, offsetB, pB+jj*ps, sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
1763 		}
1764 	if(jj<n-4)
1765 		{
1766 		kernel_dgemm_nn_4x8_vs_lib4(k, &alpha, pU, offsetB, pB+jj*ps, sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
1767 		}
1768 	else if(jj<n)
1769 		{
1770 		kernel_dgemm_nn_4x4_vs_lib4(k, &alpha, pU, offsetB, pB+jj*ps, sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
1771 		}
1772 	goto tn_return;
1773 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
1774 left_4_m0:
1775 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pU);
1776 	for(jj=0; jj<n-4; jj+=8)
1777 		{
1778 		kernel_dgemm_nn_4x8_vs_lib4(k, &alpha, pU, offsetB, pB+jj*ps, sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
1779 		}
1780 	if(jj<n)
1781 		{
1782 		kernel_dgemm_nn_4x4_vs_lib4(k, &alpha, pU, offsetB, pB+jj*ps, sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
1783 		}
1784 	goto tn_return;
1785 #else // all others
1786 left_4_m0:
1787 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pU);
1788 	for(jj=0; jj<n; jj+=4)
1789 		{
1790 		kernel_dgemm_nn_4x4_vs_lib4(k, &alpha, pU, offsetB, pB+jj*ps, sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
1791 		}
1792 	goto tn_return;
1793 #endif
1794 
1795 
1796 
1797 left_4_m0_g:
1798 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pU);
1799 	for(jj=0; jj<n; jj+=4)
1800 		{
1801 		kernel_dgemm_nn_4x4_gen_lib4(k, &alpha, pU, offsetB, pB+jj*ps, sdb, &beta, offsetC, pC+ii*sdc+jj*ps, sdc, offsetD, pD+ii*sdd+jj*ps, sdd, 0, m-ii, 0, n-jj);
1802 		}
1803 	goto tn_return;
1804 
1805 
1806 
1807 #if defined(TARGET_X64_INTEL_HASWELL)
1808 left_4_n0:
1809 	kernel_dpacp_tn_4_lib4(k, offsetB, pB+jj*ps, sdb, pU);
1810 	for(ii=0; ii<m-8; ii+=12)
1811 		{
1812 		kernel_dgemm_tt_12x4_vs_lib4(k, &alpha, offsetA, pA+ii*ps, sda, pU, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
1813 		}
1814 	if(ii<m-4)
1815 		{
1816 		kernel_dgemm_tt_8x4_vs_lib4(k, &alpha, offsetA, pA+ii*ps, sda, pU, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
1817 		}
1818 	else if(ii<m)
1819 		{
1820 		kernel_dgemm_tt_4x4_vs_lib4(k, &alpha, offsetA, pA+ii*ps, sda, pU, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
1821 		}
1822 	goto tn_return;
1823 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
1824 left_4_n0:
1825 	kernel_dpacp_tn_4_lib4(k, offsetB, pB+jj*ps, sdb, pU);
1826 	for(ii=0; ii<m-4; ii+=8)
1827 		{
1828 		kernel_dgemm_tt_8x4_vs_lib4(k, &alpha, offsetA, pA+ii*ps, sda, pU, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
1829 		}
1830 	if(ii<m)
1831 		{
1832 		kernel_dgemm_tt_4x4_vs_lib4(k, &alpha, offsetA, pA+ii*ps, sda, pU, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
1833 		}
1834 	goto tn_return;
1835 #elif defined(TARGET_GENERIC) | defined(TARGET_X64_INTEL_CORE)
1836 left_4_n0:
1837 	kernel_dpacp_tn_4_lib4(k, offsetB, pB+jj*ps, sdb, pU);
1838 	for(ii=0; ii<m; ii+=4)
1839 		{
1840 		kernel_dgemm_tt_4x4_vs_lib4(k, &alpha, offsetA, pA+ii*ps, sda, pU, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
1841 		}
1842 	goto tn_return;
1843 #endif
1844 
1845 
1846 
1847 tn_return:
1848 	if(k>K_MAX_STACK)
1849 		{
1850 		free(mem);
1851 		}
1852 	return;
1853 
1854 	}
1855 
1856 
1857 
1858 // dgemm_tt
1859 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_X64_INTEL_CORE) | defined(TARGET_GENERIC)
blasfeo_dgemm_tt(int m,int n,int k,double alpha,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dmat * sB,int bi,int bj,double beta,struct blasfeo_dmat * sC,int ci,int cj,struct blasfeo_dmat * sD,int di,int dj)1860 void blasfeo_dgemm_tt(int m, int n, int k, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, double beta, struct blasfeo_dmat *sC, int ci, int cj, struct blasfeo_dmat *sD, int di, int dj)
1861 	{
1862 	if(m<=0 || n<=0)
1863 		return;
1864 
1865 	// invalidate stored inverse diagonal of result matrix
1866 	sD->use_dA = 0;
1867 
1868 	const int ps = 4;
1869 
1870 	int sda = sA->cn;
1871 	int sdb = sB->cn;
1872 	int sdc = sC->cn;
1873 	int sdd = sD->cn;
1874 
1875 	int air = ai & (ps-1);
1876 	int bir = bi & (ps-1);
1877 
1878 	// pA, pB point to panels edges
1879 	double *pA = sA->pA + aj*ps + (ai-air)*sda;
1880 	double *pB = sB->pA + bj*ps + (bi-bir)*sdb;
1881 	double *pC = sC->pA + (cj-bir)*ps;
1882 	double *pD = sD->pA + (dj-bir)*ps;
1883 
1884 	int offsetA = air;
1885 
1886 	int ci0 = ci; //-bir;
1887 	int di0 = di; //-bir;
1888 	int offsetC;
1889 	int offsetD;
1890 	if(ci0>=0)
1891 		{
1892 		pC += ci0/ps*ps*sdd;
1893 		offsetC = ci0%ps;
1894 		}
1895 	else
1896 		{
1897 		pC += -ps*sdc;
1898 		offsetC = ps+ci0;
1899 		}
1900 
1901 	if(di0>=0)
1902 		{
1903 		pD += di0/ps*ps*sdd;
1904 		offsetD = di0%ps;
1905 		}
1906 	else
1907 		{
1908 		pD += -ps*sdd;
1909 		offsetD = ps+di0;
1910 		}
1911 
1912 	int i, j, l;
1913 
1914 
1915 
1916 	// algorithm scheme
1917 	if(bir!=0)
1918 		{
1919 		goto clear_bir;
1920 		}
1921 select_loop:
1922 	if(offsetC==0 & offsetD==0)
1923 		{
1924 		goto loop_00;
1925 		}
1926 	else
1927 		{
1928 		goto loop_CD;
1929 		}
1930 	// should never get here
1931 	return;
1932 
1933 
1934 
1935 	// clean up at the beginning
1936 clear_bir:
1937 #if defined(TARGET_X64_INTEL_HASWELL)
1938 	if(bir+n>8) // (m>9)
1939 		{
1940 		i = 0;
1941 		for(; i<m; i+=4)
1942 			{
1943 			kernel_dgemm_tt_4x12_gen_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[0], sdb, &beta, offsetC, &pC[i*sdc], sdc, offsetD, &pD[i*sdd], sdd, 0, m-i, bir, bir+n);
1944 			}
1945 		n -= 3*ps-bir;
1946 		pB += 3*ps*sdb;
1947 		pC += 3*4*ps;
1948 		pD += 3*4*ps;
1949 		}
1950 	else // bir+n<=8
1951 #endif
1952 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE)
1953 	if(bir+n>4) // (m>5)
1954 		{
1955 		i = 0;
1956 		for(; i<m; i+=4)
1957 			{
1958 			kernel_dgemm_tt_4x8_gen_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[0], sdb, &beta, offsetC, &pC[i*sdc], sdc, offsetD, &pD[i*sdd], sdd, 0, m-i, bir, bir+n);
1959 			}
1960 		n -= 2*ps-bir;
1961 		pB += 2*ps*sdb;
1962 		pC += 2*4*ps;
1963 		pD += 2*4*ps;
1964 		}
1965 	else // air+m<=4 // m-i<=4
1966 		{
1967 #endif
1968 		i = 0;
1969 		for(; i<m; i+=4)
1970 			{
1971 			kernel_dgemm_tt_4x4_gen_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[0], &beta, offsetC, &pC[i*sdc], sdc, offsetD, &pD[i*sdd], sdd, 0, m-i, bir, bir+n);
1972 			}
1973 		n -= 1*ps-bir;
1974 		pB += 1*ps*sdb;
1975 		pC += 1*4*ps;
1976 		pD += 1*4*ps;
1977 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE)
1978 		// nothing more to do
1979 		}
1980 #endif
1981 	goto select_loop;
1982 
1983 
1984 
1985 	// main loop aligned
1986 loop_00:
1987 	j = 0;
1988 #if defined(TARGET_X64_INTEL_HASWELL) //| defined(TARGET_ARMV8A_ARM_CORTEX_A53)
1989 	for(; j<n-11; j+=12)
1990 		{
1991 		i = 0;
1992 		for(; i<m-3; i+=4)
1993 			{
1994 			kernel_dgemm_tt_4x12_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
1995 			}
1996 		if(i<m)
1997 			{
1998 			kernel_dgemm_tt_4x12_vs_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
1999 			}
2000 		}
2001 	if(n>j)
2002 		{
2003 		if(n-j<=4)
2004 			{
2005 			goto left_4;
2006 			}
2007 		else if(n-j<=8)
2008 			{
2009 			goto left_8;
2010 			}
2011 		else
2012 			{
2013 			goto left_12;
2014 			}
2015 		}
2016 #elif 0//defined(TARGET_X64_INTEL_SANDY_BRIDGE)
2017 	for(; i<m-12 | i==m-8; i+=8)
2018 		{
2019 		j = 0;
2020 		for(; j<n-3; j+=4)
2021 			{
2022 			kernel_dgemm_nn_8x4_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
2023 			}
2024 		if(j<n)
2025 			{
2026 			kernel_dgemm_nn_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
2027 			}
2028 		}
2029 	if(m>i)
2030 		{
2031 		if(m-i<=4)
2032 			{
2033 			goto left_4;
2034 			}
2035 		else if(m-i<=8)
2036 			{
2037 			goto left_8;
2038 			}
2039 		else
2040 			{
2041 			goto left_12;
2042 			}
2043 		}
2044 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) //| defined(TARGET_ARMV8A_ARM_CORTEX_A57)
2045 	for(; j<n-7; j+=8)
2046 		{
2047 		i = 0;
2048 		for(; i<m-3; i+=4)
2049 			{
2050 			kernel_dgemm_tt_4x8_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
2051 			}
2052 		if(i<m)
2053 			{
2054 			kernel_dgemm_tt_4x8_vs_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
2055 			}
2056 		}
2057 	if(n>j)
2058 		{
2059 		if(n-j<=4)
2060 			{
2061 			goto left_4;
2062 			}
2063 		else
2064 			{
2065 			goto left_8;
2066 			}
2067 		}
2068 #elif 0//defined(TARGET_X86_AMD_BARCELONA)
2069 	for(; i<m-3; i+=4)
2070 		{
2071 		j = 0;
2072 		for(; j<n-1; j+=2)
2073 			{
2074 			kernel_dgemm_nn_4x2_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
2075 			}
2076 		if(j<n)
2077 			{
2078 			kernel_dgemm_nn_4x2_vs_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
2079 			}
2080 		}
2081 	if(m>i)
2082 		{
2083 		goto left_4;
2084 		}
2085 #else // all others
2086 	for(; j<n-3; j+=4)
2087 		{
2088 		i = 0;
2089 		for(; i<m-3; i+=4)
2090 			{
2091 			kernel_dgemm_tt_4x4_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
2092 			}
2093 		if(i<m)
2094 			{
2095 			kernel_dgemm_tt_4x4_vs_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
2096 			}
2097 		}
2098 	if(n>j)
2099 		{
2100 		goto left_4;
2101 		}
2102 #endif
2103 	// common return if i==m
2104 	return;
2105 
2106 
2107 
2108 	// main loop C, D not aligned
2109 loop_CD:
2110 	j = 0;
2111 #if defined(TARGET_X64_INTEL_HASWELL)
2112 	for(; j<n-8; j+=12)
2113 		{
2114 		i = 0;
2115 		for(; i<m; i+=4)
2116 			{
2117 			kernel_dgemm_tt_4x12_gen_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sdb], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
2118 			}
2119 		}
2120 	if(n>j)
2121 		{
2122 		if(n-j<=4)
2123 			{
2124 			goto left_4_g;
2125 			}
2126 		else
2127 			{
2128 			goto left_8_g;
2129 			}
2130 		}
2131 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
2132 	for(; j<n-4; j+=8)
2133 		{
2134 		i = 0;
2135 		for(; i<m; i+=4)
2136 			{
2137 			kernel_dgemm_tt_4x8_gen_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sdb], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
2138 			}
2139 		}
2140 	if(n>j)
2141 		{
2142 		goto left_4_g;
2143 		}
2144 #else
2145 	for(; j<n; j+=4)
2146 		{
2147 		i = 0;
2148 		for(; i<m; i+=4)
2149 			{
2150 			kernel_dgemm_tt_4x4_gen_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
2151 			}
2152 		}
2153 #endif
2154 	// common return if i==m
2155 	return;
2156 
2157 
2158 
2159 	// clean up loops definitions
2160 
2161 #if defined(TARGET_X64_INTEL_HASWELL)
2162 	left_12_g:
2163 	i = 0;
2164 	for(; i<m; i+=4)
2165 		{
2166 		kernel_dgemm_tt_4x12_gen_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sdb], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
2167 		}
2168 	return;
2169 #endif
2170 
2171 
2172 
2173 #if defined(TARGET_X64_INTEL_HASWELL) //| defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
2174 	left_12:
2175 	i = 0;
2176 	for(; i<m; i+=4)
2177 		{
2178 		kernel_dgemm_tt_4x12_vs_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
2179 		}
2180 	return;
2181 #endif
2182 
2183 
2184 
2185 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE)
2186 	left_8_g:
2187 	i = 0;
2188 	for(; i<m; i+=4)
2189 		{
2190 		kernel_dgemm_tt_4x8_gen_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sdb], sdb, &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
2191 		}
2192 	return;
2193 #endif
2194 
2195 
2196 
2197 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
2198 	left_8:
2199 	i = 0;
2200 	for(; i<m; i+=4)
2201 		{
2202 		kernel_dgemm_tt_4x8_vs_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
2203 		}
2204 	return;
2205 #endif
2206 
2207 
2208 
2209 	left_4_g:
2210 	i = 0;
2211 	for(; i<m; i+=4)
2212 		{
2213 		kernel_dgemm_tt_4x4_gen_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
2214 		}
2215 	return;
2216 
2217 
2218 
2219 #if defined(TARGET_X64_INTEL_HASWELL)
2220 	left_4:
2221 	i = 0;
2222 	for(; i<m-8; i+=12)
2223 		{
2224 		kernel_dgemm_tt_12x4_vs_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sda], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
2225 		}
2226 	if(i<m-4)
2227 		{
2228 		kernel_dgemm_tt_8x4_vs_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
2229 		}
2230 	else if(i<m)
2231 		{
2232 		kernel_dgemm_tt_4x4_vs_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sda], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
2233 		}
2234 	return;
2235 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
2236 	left_4:
2237 	i = 0;
2238 	for(; i<m-4; i+=8)
2239 		{
2240 		kernel_dgemm_tt_8x4_vs_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sda], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
2241 		}
2242 	if(i<m)
2243 		{
2244 		kernel_dgemm_tt_4x4_vs_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sda], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
2245 		}
2246 	return;
2247 #elif 0//defined(TARGET_X86_AMD_BARCELONA)
2248 	left_4:
2249 	j = 0;
2250 	for(; j<n; j+=2)
2251 		{
2252 		kernel_dgemm_nn_4x2_vs_lib4(k, &alpha, &pA[i*sda], offsetB, &pB[j*ps], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
2253 		}
2254 	return;
2255 #else // all others
2256 	left_4:
2257 	i = 0;
2258 	for(; i<m; i+=4)
2259 		{
2260 		kernel_dgemm_tt_4x4_vs_lib4(k, &alpha, offsetA, &pA[i*ps], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
2261 		}
2262 	return;
2263 #endif
2264 
2265 	return;
2266 
2267 	}
2268 
2269 #else
blasfeo_dgemm_tt(int m,int n,int k,double alpha,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dmat * sB,int bi,int bj,double beta,struct blasfeo_dmat * sC,int ci,int cj,struct blasfeo_dmat * sD,int di,int dj)2270 void blasfeo_dgemm_tt(int m, int n, int k, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, double beta, struct blasfeo_dmat *sC, int ci, int cj, struct blasfeo_dmat *sD, int di, int dj)
2271 	{
2272 	if(m<=0 || n<=0)
2273 		return;
2274 
2275 	// invalidate stored inverse diagonal of result matrix
2276 	sD->use_dA = 0;
2277 
2278 	const int ps = 4;
2279 
2280 	int sda = sA->cn;
2281 	int sdb = sB->cn;
2282 	int sdc = sC->cn;
2283 	int sdd = sD->cn;
2284 
2285 	int air = ai & (ps-1);
2286 	int bir = bi & (ps-1);
2287 	int cir = ci & (ps-1);
2288 	int dir = di & (ps-1);
2289 
2290 	double *pA = sA->pA + aj*ps + (ai-air)*sda;
2291 	double *pB = sB->pA + bj*ps + (bi-bir)*sdb;
2292 	double *pC = sC->pA + cj*ps + (ci-cir)*sdc;
2293 	double *pD = sD->pA + dj*ps + (di-dir)*sdd;
2294 
2295 	int offsetA = air;
2296 	int offsetB = bir;
2297 	int offsetC = cir;
2298 	int offsetD = dir;
2299 
2300 // TODO visual studio alignment
2301 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
2302 	ALIGNED( double pU[3*4*K_MAX_STACK], 64 );
2303 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57)
2304 	ALIGNED( double pU[2*4*K_MAX_STACK], 64 );
2305 #elif defined(TARGET_GENERIC)
2306 	double pU[1*4*K_MAX_STACK];
2307 #else
2308 	ALIGNED( double pU[1*4*K_MAX_STACK], 64 );
2309 #endif
2310 	int sdu = (k+3)/4*4;
2311 	sdu = sdu<K_MAX_STACK ? sdu : K_MAX_STACK;
2312 
2313 	struct blasfeo_dmat sAt;
2314 	int sdat;
2315 	int sAt_size;
2316 	void *mem;
2317 	char *mem_align;
2318 	double *pAt;
2319 
2320 	int ii, jj;
2321 
2322 	int idxB;
2323 
2324 
2325 
2326 	// algorithm scheme
2327 	if(offsetC==0 & offsetD==0)
2328 		{
2329 		if(k>K_MAX_STACK)
2330 			{
2331 			goto loop_00_1;
2332 			}
2333 		else
2334 			{
2335 			goto loop_00_0;
2336 			}
2337 		}
2338 	else
2339 		{
2340 		if(k>K_MAX_STACK)
2341 			{
2342 			goto loop_CD_1;
2343 			}
2344 		else
2345 			{
2346 			goto loop_CD_0;
2347 			}
2348 		}
2349 	// should never get here
2350 	return;
2351 
2352 
2353 
2354 	// main loop aligned
2355 loop_00_0:
2356 	ii = 0;
2357 #if defined(TARGET_X64_INTEL_HASWELL)
2358 	for(; ii<m-11; ii+=12)
2359 		{
2360 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pU+0*sdu);
2361 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pU+4*sdu);
2362 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+8)*ps, sda, pU+8*sdu);
2363 		jj = 0;
2364 		idxB = 0;
2365 		// clean up at the beginning
2366 		if(bir!=0)
2367 			{
2368 			kernel_dgemm_nt_12x4_gen_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
2369 			jj += ps-bir;
2370 			idxB += 4;
2371 			}
2372 		// main loop
2373 		for(; jj<n-3; jj+=4, idxB+=4)
2374 			{
2375 			kernel_dgemm_nt_12x4_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd);
2376 			}
2377 		if(jj<n)
2378 			{
2379 			kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
2380 			}
2381 		}
2382 	if(ii<m)
2383 		{
2384 		if(m-ii<=4)
2385 			{
2386 			goto left_4_0;
2387 			}
2388 		if(m-ii<=8)
2389 			{
2390 			goto left_8_0;
2391 			}
2392 		else
2393 			{
2394 			goto left_12_0;
2395 			}
2396 		}
2397 #elif defined(TARGET_ARMV8A_ARM_CORTEX_A53)
2398 	for(; ii<m-11; ii+=12)
2399 		{
2400 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pU+0*sdu);
2401 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pU+4*sdu);
2402 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+8)*ps, sda, pU+8*sdu);
2403 		jj = 0;
2404 		idxB = 0;
2405 		// clean up at the beginning
2406 		if(bir!=0)
2407 			{
2408 			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU+0*sdu, pB+idxB*sdb, &beta, 0, pC+(ii+0)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+0)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+0), bir, bir+n-jj);
2409 			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU+4*sdu, pB+idxB*sdb, &beta, 0, pC+(ii+4)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+4)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+4), bir, bir+n-jj);
2410 			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU+8*sdu, pB+idxB*sdb, &beta, 0, pC+(ii+8)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+8)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+8), bir, bir+n-jj);
2411 			jj += ps-bir;
2412 			idxB += 4;
2413 			}
2414 		// main loop
2415 		for(; jj<n-3; jj+=4, idxB+=4)
2416 			{
2417 			kernel_dgemm_nt_12x4_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd);
2418 			}
2419 		if(jj<n)
2420 			{
2421 			kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
2422 			}
2423 		}
2424 	if(ii<m)
2425 		{
2426 		if(m-ii<=4)
2427 			{
2428 			goto left_4_0;
2429 			}
2430 		if(m-ii<=8)
2431 			{
2432 			goto left_8_0;
2433 			}
2434 		else
2435 			{
2436 			goto left_12_0;
2437 			}
2438 		}
2439 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57)
2440 	for(; ii<m-7; ii+=8)
2441 		{
2442 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pU+0*sdu);
2443 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pU+4*sdu);
2444 		jj = 0;
2445 		idxB = 0;
2446 		// clean up at the beginning
2447 		if(bir!=0)
2448 			{
2449 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
2450 			kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
2451 #else
2452 			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU+0*sdu, pB+idxB*sdb, &beta, 0, pC+(ii+0)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+0)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+0), bir, bir+n-jj);
2453 			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU+4*sdu, pB+idxB*sdb, &beta, 0, pC+(ii+4)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+4)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+4), bir, bir+n-jj);
2454 #endif
2455 			jj += ps-bir;
2456 			idxB += 4;
2457 			}
2458 		// main loop
2459 		for(; jj<n-3; jj+=4, idxB+=4)
2460 			{
2461 			kernel_dgemm_nt_8x4_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd);
2462 			}
2463 		if(jj<n)
2464 			{
2465 			kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
2466 			}
2467 		}
2468 	if(ii<m)
2469 		{
2470 		if(m-ii<=4)
2471 			{
2472 			goto left_4_0;
2473 			}
2474 		else
2475 			{
2476 			goto left_8_0;
2477 			}
2478 		}
2479 #elif defined(TARGET_X86_AMD_BARCELONA)
2480 	for(; ii<m-3; ii+=4)
2481 		{
2482 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pU);
2483 		jj = 0;
2484 		idxB = 0;
2485 		// clean up at the beginning
2486 		if(bir!=0)
2487 			{
2488 			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
2489 			jj += ps-bir;
2490 			idxB += 4;
2491 			}
2492 		// main loop
2493 		for(; jj<n-3; jj+=4, idxB+=4)
2494 			{
2495 			kernel_dgemm_nt_4x2_lib4(k, &alpha, pU, pB+idxB*sdb+0, &beta, pC+ii*sdc+(jj+0)*ps, pD+ii*sdd+(jj+0)*ps);
2496 			kernel_dgemm_nt_4x2_lib4(k, &alpha, pU, pB+idxB*sdb+2, &beta, pC+ii*sdc+(jj+2)*ps, pD+ii*sdd+(jj+2)*ps);
2497 			}
2498 		if(jj<n-2)
2499 			{
2500 			kernel_dgemm_nt_4x2_lib4(k, &alpha, pU, pB+idxB*sdb+0, &beta, pC+ii*sdc+(jj+0)*ps, pD+ii*sdd+(jj+0)*ps);
2501 			kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, pU, pB+idxB*sdb+2, &beta, pC+ii*sdc+(jj+2)*ps, pD+ii*sdd+(jj+2)*ps, m-ii, n-(jj+2));
2502 			}
2503 		else if(jj<n)
2504 			{
2505 			kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, pU, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
2506 			}
2507 		}
2508 	if(ii<m)
2509 		{
2510 		goto left_4_0;
2511 		}
2512 #else
2513 	for(; ii<m-3; ii+=4)
2514 		{
2515 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pU);
2516 		jj = 0;
2517 		idxB = 0;
2518 		// clean up at the beginning
2519 		if(bir!=0)
2520 			{
2521 			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
2522 			jj += ps-bir;
2523 			idxB += 4;
2524 			}
2525 		// main loop
2526 		for(; jj<n-3; jj+=4, idxB+=4)
2527 			{
2528 			kernel_dgemm_nt_4x4_lib4(k, &alpha, pU, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps);
2529 			}
2530 		if(jj<n)
2531 			{
2532 			kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, pU, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
2533 			}
2534 		}
2535 	if(ii<m)
2536 		{
2537 		goto left_4_0;
2538 		}
2539 #endif
2540 	goto tt_0_return;
2541 
2542 
2543 
2544 	// main loop C, D not aligned
2545 loop_CD_0:
2546 	ii = 0;
2547 #if defined(TARGET_X64_INTEL_HASWELL)
2548 	for(; ii<m-8; ii+=12)
2549 		{
2550 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pU+0*sdu);
2551 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pU+4*sdu);
2552 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+8)*ps, sda, pU+8*sdu);
2553 		jj = 0;
2554 		idxB = 0;
2555 		// clean up at the beginning
2556 		if(bir!=0)
2557 			{
2558 			kernel_dgemm_nt_12x4_gen_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps-bir*ps, sdc, offsetD, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
2559 			jj += ps-bir;
2560 			idxB += 4;
2561 			}
2562 		// main loop
2563 		for(; jj<n; jj+=4, idxB+=4)
2564 			{
2565 			kernel_dgemm_nt_12x4_gen_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps, sdc, offsetD, pD+ii*sdd+jj*ps, sdd, 0, m-ii, 0, n-jj);
2566 			}
2567 		}
2568 	if(m>ii)
2569 		{
2570 		if(m-ii<=4)
2571 			{
2572 			goto left_4_0_g;
2573 			}
2574 		else
2575 			{
2576 			goto left_8_0_g;
2577 			}
2578 		}
2579 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
2580 	for(; ii<m-4; ii+=8)
2581 		{
2582 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pU+0*sdu);
2583 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pU+4*sdu);
2584 		jj = 0;
2585 		idxB = 0;
2586 		// clean up at the beginning
2587 		if(bir!=0)
2588 			{
2589 			kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps-bir*ps, sdc, offsetD, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
2590 			jj += ps-bir;
2591 			idxB += 4;
2592 			}
2593 		// main loop
2594 		for(; jj<n; jj+=4, idxB+=4)
2595 			{
2596 			kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps, sdc, offsetD, pD+ii*sdd+jj*ps, sdd, 0, m-ii, 0, n-jj);
2597 			}
2598 		}
2599 	if(m>ii)
2600 		{
2601 		goto left_4_0_g;
2602 		}
2603 #else
2604 	for(; ii<m; ii+=4)
2605 		{
2606 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pU);
2607 		jj = 0;
2608 		idxB = 0;
2609 		// clean up at the beginning
2610 		if(bir!=0)
2611 			{
2612 			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps-bir*ps, sdc, offsetD, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
2613 			jj += ps-bir;
2614 			idxB += 4;
2615 			}
2616 		// main loop
2617 		for(; jj<n; jj+=4, idxB+=4)
2618 			{
2619 			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps, sdc, offsetD, pD+ii*sdd+jj*ps, sdd, 0, m-ii, 0, n-jj);
2620 			}
2621 		}
2622 #endif
2623 	// common return if i==m
2624 	goto tt_0_return;
2625 
2626 
2627 
2628 #if defined(TARGET_X64_INTEL_HASWELL)
2629 left_12_0:
2630 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pU+0*sdu);
2631 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pU+4*sdu);
2632 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+8)*ps, sda, pU+8*sdu);
2633 	jj = 0;
2634 	idxB = 0;
2635 	// clean up at the beginning
2636 	if(bir!=0)
2637 		{
2638 		kernel_dgemm_nt_12x4_gen_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
2639 		jj += ps-bir;
2640 		idxB += 4;
2641 		}
2642 	// main loop
2643 	for(; jj<n; jj+=4, idxB+=4)
2644 		{
2645 		kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
2646 		}
2647 	goto tt_0_return;
2648 #elif defined(TARGET_ARMV8A_ARM_CORTEX_A53)
2649 left_12_0:
2650 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pU+0*sdu);
2651 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pU+4*sdu);
2652 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+8)*ps, sda, pU+8*sdu);
2653 	jj = 0;
2654 	idxB = 0;
2655 	// clean up at the beginning
2656 	if(bir!=0)
2657 		{
2658 		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU+0*sdu, pB+idxB*sdb, &beta, 0, pC+(ii+0)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+0)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+0), bir, bir+n-jj);
2659 		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU+4*sdu, pB+idxB*sdb, &beta, 0, pC+(ii+4)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+4)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+4), bir, bir+n-jj);
2660 		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU+8*sdu, pB+idxB*sdb, &beta, 0, pC+(ii+8)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+8)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+8), bir, bir+n-jj);
2661 		jj += ps-bir;
2662 		idxB += 4;
2663 		}
2664 	// main loop
2665 	for(; jj<n; jj+=4, idxB+=4)
2666 		{
2667 		kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
2668 		}
2669 	goto tt_0_return;
2670 #endif
2671 
2672 
2673 
2674 #if defined(TARGET_X64_INTEL_HASWELL)
2675 left_8_0:
2676 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pU+0*sdu);
2677 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pU+4*sdu);
2678 	jj = 0;
2679 	idxB = 0;
2680 	// clean up at the beginning
2681 	if(bir!=0)
2682 		{
2683 		kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
2684 		jj += ps-bir;
2685 		idxB += 4;
2686 		}
2687 	// main loop
2688 	for(; jj<n-8; jj+=12, idxB+=12)
2689 		{
2690 		kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, pU, sdu, pB+(idxB+0)*sdb, sdb, &beta, pC+ii*sdc+(jj+0)*ps, sdc, pD+ii*sdd+(jj+0)*ps, sdd, m-ii, n-(jj+0));
2691 		kernel_dgemm_nt_8x8u_vs_lib4(k, &alpha, pU, sdu, pB+(idxB+4)*sdb, sdb, &beta, pC+ii*sdc+(jj+4)*ps, sdc, pD+ii*sdd+(jj+4)*ps, sdd, m-ii, n-(jj+4));
2692 		}
2693 	if(jj<n)
2694 		{
2695 		if(n-jj<=4)
2696 			{
2697 			kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
2698 			}
2699 		else
2700 			{
2701 			kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, pU, sdu, pB+(idxB+0)*sdb, sdb, &beta, pC+ii*sdc+(jj+0)*ps, sdc, pD+ii*sdd+(jj+0)*ps, sdd, m-ii, n-(jj+0));
2702 			kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, pU, pB+(idxB+4)*sdb, &beta, pC+ii*sdc+(jj+4)*ps, pD+ii*sdd+(jj+4)*ps, m-ii, n-(jj+4));
2703 			}
2704 		}
2705 	goto tt_0_return;
2706 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
2707 left_8_0:
2708 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pU+0*sdu);
2709 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pU+4*sdu);
2710 	jj = 0;
2711 	idxB = 0;
2712 	// clean up at the beginning
2713 	if(bir!=0)
2714 		{
2715 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
2716 		kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
2717 #else
2718 		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU+0*sdu, pB+idxB*sdb, &beta, 0, pC+(ii+0)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+0)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+0), bir, bir+n-jj);
2719 		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU+4*sdu, pB+idxB*sdb, &beta, 0, pC+(ii+4)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+4)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+4), bir, bir+n-jj);
2720 #endif
2721 		jj += ps-bir;
2722 		idxB += 4;
2723 		}
2724 	// main loop
2725 	for(; jj<n; jj+=4, idxB+=4)
2726 		{
2727 		kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
2728 		}
2729 	goto tt_0_return;
2730 #endif
2731 
2732 
2733 
2734 #if defined(TARGET_X64_INTEL_HASWELL)
2735 left_8_0_g:
2736 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pU+0*sdu);
2737 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pU+4*sdu);
2738 	jj = 0;
2739 	idxB = 0;
2740 	// clean up at the beginning
2741 	if(bir!=0)
2742 		{
2743 		kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps-bir*ps, sdc, offsetD, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
2744 		jj += ps-bir;
2745 		idxB += 4;
2746 		}
2747 	// main loop
2748 	for(; jj<n; jj+=4, idxB+=4)
2749 		{
2750 		kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pU, sdu, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps, sdc, offsetD, pD+ii*sdd+jj*ps, sdd, 0, m-ii, 0, n-jj);
2751 		}
2752 	goto tt_0_return;
2753 #endif
2754 
2755 
2756 
2757 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
2758 left_4_0:
2759 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pU);
2760 	jj = 0;
2761 	idxB = 0;
2762 	// clean up at the beginning
2763 	if(bir!=0)
2764 		{
2765 		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
2766 		jj += ps-bir;
2767 		idxB += 4;
2768 		}
2769 	// main loop
2770 	for(; jj<n-8; jj+=12, idxB+=12)
2771 		{
2772 		kernel_dgemm_nt_4x12_vs_lib4(k, &alpha, pU, pB+idxB*sdb, sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
2773 		}
2774 	if(jj<n)
2775 		{
2776 		if(n-jj<=4)
2777 			{
2778 			kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, pU, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
2779 			}
2780 		else
2781 			{
2782 			kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, pU, pB+idxB*sdb, sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
2783 			}
2784 		}
2785 	goto tt_0_return;
2786 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
2787 left_4_0:
2788 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pU);
2789 	jj = 0;
2790 	idxB = 0;
2791 	// clean up at the beginning
2792 	if(bir!=0)
2793 		{
2794 		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
2795 		jj += ps-bir;
2796 		idxB += 4;
2797 		}
2798 	// main loop
2799 	for(; jj<n-4; jj+=8, idxB+=8)
2800 		{
2801 		kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, pU, pB+idxB*sdb, sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
2802 		}
2803 	if(jj<n)
2804 		{
2805 		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, pU, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
2806 		}
2807 	goto tt_0_return;
2808 #elif defined(TARGET_X86_AMD_BARCELONA)
2809 left_4_0:
2810 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pU);
2811 	jj = 0;
2812 	idxB = 0;
2813 	// clean up at the beginning
2814 	if(bir!=0)
2815 		{
2816 		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
2817 		jj += ps-bir;
2818 		idxB += 4;
2819 		}
2820 	// main loop
2821 	for(; jj<n-2; jj+=4, idxB+=4)
2822 		{
2823 		kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, pU, pB+idxB*sdb+0, &beta, pC+ii*sdc+(jj+0)*ps, pD+ii*sdd+(jj+0)*ps, m-ii, n-(jj+0));
2824 		kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, pU, pB+idxB*sdb+2, &beta, pC+ii*sdc+(jj+2)*ps, pD+ii*sdd+(jj+2)*ps, m-ii, n-(jj+2));
2825 		}
2826 	if(jj<n)
2827 		{
2828 		kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, pU, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
2829 		}
2830 	goto tt_0_return;
2831 #else
2832 left_4_0:
2833 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pU);
2834 	jj = 0;
2835 	idxB = 0;
2836 	// clean up at the beginning
2837 	if(bir!=0)
2838 		{
2839 		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
2840 		jj += ps-bir;
2841 		idxB += 4;
2842 		}
2843 	// main loop
2844 	for(; jj<n; jj+=4, idxB+=4)
2845 		{
2846 		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, pU, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
2847 		}
2848 	goto tt_0_return;
2849 #endif
2850 
2851 
2852 
2853 left_4_0_g:
2854 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pU);
2855 	jj = 0;
2856 	idxB = 0;
2857 	// clean up at the beginning
2858 	if(bir!=0)
2859 		{
2860 		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps-bir*ps, sdc, offsetD, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
2861 		jj += ps-bir;
2862 		idxB += 4;
2863 		}
2864 	// main loop
2865 	for(; jj<n; jj+=4, idxB+=4)
2866 		{
2867 		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pU, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps, sdc, offsetD, pD+ii*sdd+jj*ps, sdd, 0, m-ii, 0, n-jj);
2868 		}
2869 	goto tt_0_return;
2870 
2871 
2872 
2873 tt_0_return:
2874 	return;
2875 
2876 
2877 
2878 loop_00_1:
2879 	sAt_size = blasfeo_memsize_dmat(12, k);
2880 	mem = malloc(sAt_size+64);
2881 	blasfeo_align_64_byte(mem, (void **) &mem_align);
2882 	blasfeo_create_dmat(12, k, &sAt, (void *) mem_align);
2883 	pAt = sAt.pA;
2884 	sdat = sAt.cn;
2885 
2886 	ii = 0;
2887 #if defined(TARGET_X64_INTEL_HASWELL)
2888 	for(; ii<m-11; ii+=12)
2889 		{
2890 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pAt+0*sdat);
2891 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pAt+4*sdat);
2892 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+8)*ps, sda, pAt+8*sdat);
2893 		jj = 0;
2894 		idxB = 0;
2895 		// clean up at the beginning
2896 		if(bir!=0)
2897 			{
2898 			kernel_dgemm_nt_12x4_gen_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
2899 			jj += ps-bir;
2900 			idxB += 4;
2901 			}
2902 		// main loop
2903 		for(; jj<n-3; jj+=4, idxB+=4)
2904 			{
2905 			kernel_dgemm_nt_12x4_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd);
2906 			}
2907 		if(jj<n)
2908 			{
2909 			kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdat, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
2910 			}
2911 		}
2912 	if(ii<m)
2913 		{
2914 		if(m-ii<=4)
2915 			{
2916 			goto left_4_1;
2917 			}
2918 		if(m-ii<=8)
2919 			{
2920 			goto left_8_1;
2921 			}
2922 		else
2923 			{
2924 			goto left_12_1;
2925 			}
2926 		}
2927 #elif defined(TARGET_ARMV8A_ARM_CORTEX_A53)
2928 	for(; ii<m-11; ii+=12)
2929 		{
2930 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pAt+0*sdat);
2931 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pAt+4*sdat);
2932 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+8)*ps, sda, pAt+8*sdat);
2933 		jj = 0;
2934 		idxB = 0;
2935 		// clean up at the beginning
2936 		if(bir!=0)
2937 			{
2938 			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt+0*sdat, pB+idxB*sdb, &beta, 0, pC+(ii+0)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+0)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+0), bir, bir+n-jj);
2939 			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt+4*sdat, pB+idxB*sdb, &beta, 0, pC+(ii+4)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+4)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+4), bir, bir+n-jj);
2940 			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt+8*sdat, pB+idxB*sdb, &beta, 0, pC+(ii+8)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+8)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+8), bir, bir+n-jj);
2941 			jj += ps-bir;
2942 			idxB += 4;
2943 			}
2944 		// main loop
2945 		for(; jj<n-3; jj+=4, idxB+=4)
2946 			{
2947 			kernel_dgemm_nt_12x4_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd);
2948 			}
2949 		if(jj<n)
2950 			{
2951 			kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdat, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
2952 			}
2953 		}
2954 	if(ii<m)
2955 		{
2956 		if(m-ii<=4)
2957 			{
2958 			goto left_4_1;
2959 			}
2960 		if(m-ii<=8)
2961 			{
2962 			goto left_8_1;
2963 			}
2964 		else
2965 			{
2966 			goto left_12_1;
2967 			}
2968 		}
2969 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57)
2970 	for(; ii<m-7; ii+=8)
2971 		{
2972 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pAt+0*sdat);
2973 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pAt+4*sdat);
2974 		jj = 0;
2975 		idxB = 0;
2976 		// clean up at the beginning
2977 		if(bir!=0)
2978 			{
2979 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
2980 			kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
2981 #else
2982 			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt+0*sdat, pB+idxB*sdb, &beta, 0, pC+(ii+0)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+0)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+0), bir, bir+n-jj);
2983 			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt+4*sdat, pB+idxB*sdb, &beta, 0, pC+(ii+4)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+4)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+4), bir, bir+n-jj);
2984 #endif
2985 			jj += ps-bir;
2986 			idxB += 4;
2987 			}
2988 		// main loop
2989 		for(; jj<n-3; jj+=4, idxB+=4)
2990 			{
2991 			kernel_dgemm_nt_8x4_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd);
2992 			}
2993 		if(jj<n)
2994 			{
2995 			kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdat, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
2996 			}
2997 		}
2998 	if(ii<m)
2999 		{
3000 		if(m-ii<=4)
3001 			{
3002 			goto left_4_1;
3003 			}
3004 		else
3005 			{
3006 			goto left_8_1;
3007 			}
3008 		}
3009 #elif defined(TARGET_X86_AMD_BARCELONA)
3010 	for(; ii<m-3; ii+=4)
3011 		{
3012 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pAt);
3013 		jj = 0;
3014 		idxB = 0;
3015 		// clean up at the beginning
3016 		if(bir!=0)
3017 			{
3018 			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
3019 			jj += ps-bir;
3020 			idxB += 4;
3021 			}
3022 		// main loop
3023 		for(; jj<n-3; jj+=4, idxB+=4)
3024 			{
3025 			kernel_dgemm_nt_4x2_lib4(k, &alpha, pAt, pB+idxB*sdb+0, &beta, pC+ii*sdc+(jj+0)*ps, pD+ii*sdd+(jj+0)*ps);
3026 			kernel_dgemm_nt_4x2_lib4(k, &alpha, pAt, pB+idxB*sdb+2, &beta, pC+ii*sdc+(jj+2)*ps, pD+ii*sdd+(jj+2)*ps);
3027 			}
3028 		if(jj<n-2)
3029 			{
3030 			kernel_dgemm_nt_4x2_lib4(k, &alpha, pAt, pB+idxB*sdb+0, &beta, pC+ii*sdc+(jj+0)*ps, pD+ii*sdd+(jj+0)*ps);
3031 			kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, pAt, pB+idxB*sdb+2, &beta, pC+ii*sdc+(jj+2)*ps, pD+ii*sdd+(jj+2)*ps, m-ii, n-(jj+2));
3032 			}
3033 		else if(jj<n)
3034 			{
3035 			kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, pAt, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
3036 			}
3037 		}
3038 	if(ii<m)
3039 		{
3040 		goto left_4_1;
3041 		}
3042 #else
3043 	for(; ii<m-3; ii+=4)
3044 		{
3045 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pAt);
3046 		jj = 0;
3047 		idxB = 0;
3048 		// clean up at the beginning
3049 		if(bir!=0)
3050 			{
3051 			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
3052 			jj += ps-bir;
3053 			idxB += 4;
3054 			}
3055 		// main loop
3056 		for(; jj<n-3; jj+=4, idxB+=4)
3057 			{
3058 			kernel_dgemm_nt_4x4_lib4(k, &alpha, pAt, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps);
3059 			}
3060 		if(jj<n)
3061 			{
3062 			kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, pAt, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
3063 			}
3064 		}
3065 	if(ii<m)
3066 		{
3067 		goto left_4_1;
3068 		}
3069 #endif
3070 	goto tt_1_return;
3071 
3072 
3073 
3074 	// main loop C, D not aligned
3075 loop_CD_1:
3076 	sAt_size = blasfeo_memsize_dmat(12, k);
3077 	mem = malloc(sAt_size+64);
3078 	blasfeo_align_64_byte(mem, (void **) &mem_align);
3079 	blasfeo_create_dmat(12, k, &sAt, (void *) mem_align);
3080 	pAt = sAt.pA;
3081 	sdat = sAt.cn;
3082 
3083 	ii = 0;
3084 #if defined(TARGET_X64_INTEL_HASWELL)
3085 	for(; ii<m-8; ii+=12)
3086 		{
3087 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pAt+0*sdat);
3088 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pAt+4*sdat);
3089 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+8)*ps, sda, pAt+8*sdat);
3090 		jj = 0;
3091 		idxB = 0;
3092 		// clean up at the beginning
3093 		if(bir!=0)
3094 			{
3095 			kernel_dgemm_nt_12x4_gen_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps-bir*ps, sdc, offsetD, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
3096 			jj += ps-bir;
3097 			idxB += 4;
3098 			}
3099 		// main loop
3100 		for(; jj<n; jj+=4, idxB+=4)
3101 			{
3102 			kernel_dgemm_nt_12x4_gen_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps, sdc, offsetD, pD+ii*sdd+jj*ps, sdd, 0, m-ii, 0, n-jj);
3103 			}
3104 		}
3105 	if(m>ii)
3106 		{
3107 		if(m-ii<=4)
3108 			{
3109 			goto left_4_1_g;
3110 			}
3111 		else
3112 			{
3113 			goto left_8_1_g;
3114 			}
3115 		}
3116 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
3117 	for(; ii<m-4; ii+=8)
3118 		{
3119 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pAt+0*sdat);
3120 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pAt+4*sdat);
3121 		jj = 0;
3122 		idxB = 0;
3123 		// clean up at the beginning
3124 		if(bir!=0)
3125 			{
3126 			kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps-bir*ps, sdc, offsetD, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
3127 			jj += ps-bir;
3128 			idxB += 4;
3129 			}
3130 		// main loop
3131 		for(; jj<n; jj+=4, idxB+=4)
3132 			{
3133 			kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps, sdc, offsetD, pD+ii*sdd+jj*ps, sdd, 0, m-ii, 0, n-jj);
3134 			}
3135 		}
3136 	if(m>ii)
3137 		{
3138 		goto left_4_1_g;
3139 		}
3140 #else
3141 	for(; ii<m; ii+=4)
3142 		{
3143 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pAt);
3144 		jj = 0;
3145 		idxB = 0;
3146 		// clean up at the beginning
3147 		if(bir!=0)
3148 			{
3149 			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps-bir*ps, sdc, offsetD, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
3150 			jj += ps-bir;
3151 			idxB += 4;
3152 			}
3153 		// main loop
3154 		for(; jj<n; jj+=4, idxB+=4)
3155 			{
3156 			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps, sdc, offsetD, pD+ii*sdd+jj*ps, sdd, 0, m-ii, 0, n-jj);
3157 			}
3158 		}
3159 #endif
3160 	// common return if i==m
3161 	goto tt_1_return;
3162 
3163 
3164 
3165 #if defined(TARGET_X64_INTEL_HASWELL)
3166 left_12_1:
3167 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pAt+0*sdat);
3168 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pAt+4*sdat);
3169 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+8)*ps, sda, pAt+8*sdat);
3170 	jj = 0;
3171 	idxB = 0;
3172 	// clean up at the beginning
3173 	if(bir!=0)
3174 		{
3175 		kernel_dgemm_nt_12x4_gen_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
3176 		jj += ps-bir;
3177 		idxB += 4;
3178 		}
3179 	// main loop
3180 	for(; jj<n; jj+=4, idxB+=4)
3181 		{
3182 		kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
3183 		}
3184 	goto tt_1_return;
3185 #elif defined(TARGET_ARMV8A_ARM_CORTEX_A53)
3186 left_12_1:
3187 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pAt+0*sdat);
3188 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pAt+4*sdat);
3189 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+8)*ps, sda, pAt+8*sdat);
3190 	jj = 0;
3191 	idxB = 0;
3192 	// clean up at the beginning
3193 	if(bir!=0)
3194 		{
3195 		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt+0*sdat, pB+idxB*sdb, &beta, 0, pC+(ii+0)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+0)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+0), bir, bir+n-jj);
3196 		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt+4*sdat, pB+idxB*sdb, &beta, 0, pC+(ii+4)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+4)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+4), bir, bir+n-jj);
3197 		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt+8*sdat, pB+idxB*sdb, &beta, 0, pC+(ii+8)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+8)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+8), bir, bir+n-jj);
3198 		jj += ps-bir;
3199 		idxB += 4;
3200 		}
3201 	// main loop
3202 	for(; jj<n; jj+=4, idxB+=4)
3203 		{
3204 		kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
3205 		}
3206 	goto tt_1_return;
3207 #endif
3208 
3209 
3210 
3211 #if defined(TARGET_X64_INTEL_HASWELL)
3212 left_8_1:
3213 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pAt+0*sdat);
3214 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pAt+4*sdat);
3215 	jj = 0;
3216 	idxB = 0;
3217 	// clean up at the beginning
3218 	if(bir!=0)
3219 		{
3220 		kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
3221 		jj += ps-bir;
3222 		idxB += 4;
3223 		}
3224 	// main loop
3225 	for(; jj<n-8; jj+=12, idxB+=12)
3226 		{
3227 		kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, pAt, sdat, pB+(idxB+0)*sdb, sdb, &beta, pC+ii*sdc+(jj+0)*ps, sdc, pD+ii*sdd+(jj+0)*ps, sdd, m-ii, n-(jj+0));
3228 		kernel_dgemm_nt_8x8u_vs_lib4(k, &alpha, pAt, sdat, pB+(idxB+4)*sdb, sdb, &beta, pC+ii*sdc+(jj+4)*ps, sdc, pD+ii*sdd+(jj+4)*ps, sdd, m-ii, n-(jj+4));
3229 		}
3230 	if(jj<n)
3231 		{
3232 		if(n-jj<=4)
3233 			{
3234 			kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
3235 			}
3236 		else
3237 			{
3238 			kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, pAt, sdat, pB+(idxB+0)*sdb, sdb, &beta, pC+ii*sdc+(jj+0)*ps, sdc, pD+ii*sdd+(jj+0)*ps, sdd, m-ii, n-(jj+0));
3239 			kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, pAt, pB+(idxB+4)*sdb, &beta, pC+ii*sdc+(jj+4)*ps, pD+ii*sdd+(jj+4)*ps, m-ii, n-(jj+4));
3240 			}
3241 		}
3242 	goto tt_1_return;
3243 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
3244 left_8_1:
3245 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pAt+0*sdat);
3246 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pAt+4*sdat);
3247 	jj = 0;
3248 	idxB = 0;
3249 	// clean up at the beginning
3250 	if(bir!=0)
3251 		{
3252 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
3253 		kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
3254 #else
3255 		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt+0*sdat, pB+idxB*sdb, &beta, 0, pC+(ii+0)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+0)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+0), bir, bir+n-jj);
3256 		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt+4*sdat, pB+idxB*sdb, &beta, 0, pC+(ii+4)*sdc+jj*ps-bir*ps, sdc, 0, pD+(ii+4)*sdd+jj*ps-bir*ps, sdd, 0, m-(ii+4), bir, bir+n-jj);
3257 #endif
3258 		jj += ps-bir;
3259 		idxB += 4;
3260 		}
3261 	// main loop
3262 	for(; jj<n; jj+=4, idxB+=4)
3263 		{
3264 		kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, sdc, pD+ii*sdd+jj*ps, sdd, m-ii, n-jj);
3265 		}
3266 	goto tt_1_return;
3267 #endif
3268 
3269 
3270 
3271 #if defined(TARGET_X64_INTEL_HASWELL)
3272 left_8_1_g:
3273 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+0)*ps, sda, pAt+0*sdat);
3274 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(ii+4)*ps, sda, pAt+4*sdat);
3275 	jj = 0;
3276 	idxB = 0;
3277 	// clean up at the beginning
3278 	if(bir!=0)
3279 		{
3280 		kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps-bir*ps, sdc, offsetD, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
3281 		jj += ps-bir;
3282 		idxB += 4;
3283 		}
3284 	// main loop
3285 	for(; jj<n; jj+=4, idxB+=4)
3286 		{
3287 		kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pAt, sdat, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps, sdc, offsetD, pD+ii*sdd+jj*ps, sdd, 0, m-ii, 0, n-jj);
3288 		}
3289 	goto tt_1_return;
3290 #endif
3291 
3292 
3293 
3294 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
3295 left_4_1:
3296 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pAt);
3297 	jj = 0;
3298 	idxB = 0;
3299 	// clean up at the beginning
3300 	if(bir!=0)
3301 		{
3302 		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
3303 		jj += ps-bir;
3304 		idxB += 4;
3305 		}
3306 	// main loop
3307 	for(; jj<n-8; jj+=12, idxB+=12)
3308 		{
3309 		kernel_dgemm_nt_4x12_vs_lib4(k, &alpha, pAt, pB+idxB*sdb, sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
3310 		}
3311 	if(jj<n)
3312 		{
3313 		if(n-jj<=4)
3314 			{
3315 			kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, pAt, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
3316 			}
3317 		else
3318 			{
3319 			kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, pAt, pB+idxB*sdb, sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
3320 			}
3321 		}
3322 	goto tt_1_return;
3323 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
3324 left_4_1:
3325 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pAt);
3326 	jj = 0;
3327 	idxB = 0;
3328 	// clean up at the beginning
3329 	if(bir!=0)
3330 		{
3331 		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
3332 		jj += ps-bir;
3333 		idxB += 4;
3334 		}
3335 	// main loop
3336 	for(; jj<n-4; jj+=8, idxB+=8)
3337 		{
3338 		kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, pAt, pB+idxB*sdb, sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
3339 		}
3340 	if(jj<n)
3341 		{
3342 		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, pAt, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
3343 		}
3344 	goto tt_1_return;
3345 #elif defined(TARGET_X86_AMD_BARCELONA)
3346 left_4_1:
3347 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pAt);
3348 	jj = 0;
3349 	idxB = 0;
3350 	// clean up at the beginning
3351 	if(bir!=0)
3352 		{
3353 		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
3354 		jj += ps-bir;
3355 		idxB += 4;
3356 		}
3357 	// main loop
3358 	for(; jj<n-2; jj+=4, idxB+=4)
3359 		{
3360 		kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, pAt, pB+idxB*sdb+0, &beta, pC+ii*sdc+(jj+0)*ps, pD+ii*sdd+(jj+0)*ps, m-ii, n-(jj+0));
3361 		kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, pAt, pB+idxB*sdb+2, &beta, pC+ii*sdc+(jj+2)*ps, pD+ii*sdd+(jj+2)*ps, m-ii, n-(jj+2));
3362 		}
3363 	if(jj<n)
3364 		{
3365 		kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, pAt, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
3366 		}
3367 	goto tt_1_return;
3368 #else
3369 left_4_1:
3370 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pAt);
3371 	jj = 0;
3372 	idxB = 0;
3373 	// clean up at the beginning
3374 	if(bir!=0)
3375 		{
3376 		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt, pB+idxB*sdb, &beta, 0, pC+ii*sdc+jj*ps-bir*ps, sdc, 0, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
3377 		jj += ps-bir;
3378 		idxB += 4;
3379 		}
3380 	// main loop
3381 	for(; jj<n; jj+=4, idxB+=4)
3382 		{
3383 		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, pAt, pB+idxB*sdb, &beta, pC+ii*sdc+jj*ps, pD+ii*sdd+jj*ps, m-ii, n-jj);
3384 		}
3385 	goto tt_1_return;
3386 #endif
3387 
3388 
3389 
3390 left_4_1_g:
3391 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+ii*ps, sda, pAt);
3392 	jj = 0;
3393 	idxB = 0;
3394 	// clean up at the beginning
3395 	if(bir!=0)
3396 		{
3397 		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps-bir*ps, sdc, offsetD, pD+ii*sdd+jj*ps-bir*ps, sdd, 0, m-ii, bir, bir+n-jj);
3398 		jj += ps-bir;
3399 		idxB += 4;
3400 		}
3401 	// main loop
3402 	for(; jj<n; jj+=4, idxB+=4)
3403 		{
3404 		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pAt, pB+idxB*sdb, &beta, offsetC, pC+ii*sdc+jj*ps, sdc, offsetD, pD+ii*sdd+jj*ps, sdd, 0, m-ii, 0, n-jj);
3405 		}
3406 	goto tt_1_return;
3407 
3408 
3409 
3410 tt_1_return:
3411 	free(mem);
3412 	return;
3413 
3414 	}
3415 #endif
3416 
3417 
3418 
3419 // dtrsm_llnn
blasfeo_dtrsm_llnn(int m,int n,double alpha,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dmat * sB,int bi,int bj,struct blasfeo_dmat * sD,int di,int dj)3420 void blasfeo_dtrsm_llnn(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, struct blasfeo_dmat *sD, int di, int dj)
3421 	{
3422 	// invalidate stored inverse diagonal of result matrix
3423 	sD->use_dA = 0;
3424 
3425 	if(ai!=0 | bi!=0 | di!=0)
3426 		{
3427 		printf("\nblasfeo_dtrsm_llnn: feature not implemented yet: ai=%d, bi=%d, di=%d\n", ai, bi, di);
3428 		exit(1);
3429 		}
3430 
3431 	const int ps = 4;
3432 
3433 	// TODO alpha
3434 	int sda = sA->cn;
3435 	int sdb = sB->cn;
3436 	int sdd = sD->cn;
3437 	double *pA = sA->pA + aj*ps;
3438 	double *pB = sB->pA + bj*ps;
3439 	double *pD = sD->pA + dj*ps;
3440 	double *dA = sA->dA;
3441 
3442 	if(m<=0 || n<=0)
3443 		return;
3444 
3445 	int i, j;
3446 
3447 	if(ai==0 & aj==0)
3448 		{
3449 		// recompute diagonal if size of operation grows
3450 		if(sA->use_dA<m)
3451 			{
3452 			ddiaex_lib(m, 1.0, ai, pA, sda, dA);
3453 			for(i=0; i<m; i++)
3454 				dA[i] = 1.0 / dA[i];
3455 			sA->use_dA = m;
3456 			}
3457 		}
3458 	// if submatrix recompute diagonal
3459 	else
3460 		{
3461 		ddiaex_lib(m, 1.0, ai, pA, sda, dA);
3462 		for(i=0; i<m; i++)
3463 			dA[i] = 1.0 / dA[i];
3464 		sA->use_dA = 0;
3465 		}
3466 
3467 	i = 0;
3468 #if defined(TARGET_X64_INTEL_HASWELL)
3469 	for( ; i<m-11; i+=12)
3470 		{
3471 		j = 0;
3472 		for( ; j<n-3; j+=4)
3473 			{
3474 			kernel_dtrsm_nn_ll_inv_12x4_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, &alpha, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda, dA+i);
3475 			}
3476 		if(j<n)
3477 			{
3478 			kernel_dtrsm_nn_ll_inv_12x4_vs_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, &alpha, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda, dA+i, m-i, n-j);
3479 			}
3480 		}
3481 	if(i<m)
3482 		{
3483 		if(m-i<=4)
3484 			{
3485 			goto left_4;
3486 			}
3487 		if(m-i<=8)
3488 			{
3489 			goto left_8;
3490 			}
3491 		else
3492 			{
3493 			goto left_12;
3494 			}
3495 		}
3496 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
3497 	for( ; i<m-7; i+=8)
3498 		{
3499 		j = 0;
3500 		for( ; j<n-3; j+=4)
3501 			{
3502 			kernel_dtrsm_nn_ll_inv_8x4_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, &alpha, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda, dA+i);
3503 			}
3504 		if(j<n)
3505 			{
3506 			kernel_dtrsm_nn_ll_inv_8x4_vs_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, &alpha, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda, dA+i, m-i, n-j);
3507 			}
3508 		}
3509 	if(i<m)
3510 		{
3511 		if(m-i<=4)
3512 			{
3513 			goto left_4;
3514 			}
3515 		else
3516 			{
3517 			goto left_8;
3518 			}
3519 		}
3520 #else
3521 	for( ; i<m-3; i+=4)
3522 		{
3523 		j = 0;
3524 		for( ; j<n-3; j+=4)
3525 			{
3526 			kernel_dtrsm_nn_ll_inv_4x4_lib4(i, pA+i*sda, pD+j*ps, sdd, &alpha, pB+i*sdb+j*ps, pD+i*sdd+j*ps, pA+i*sda+i*ps, dA+i);
3527 			}
3528 		if(j<n)
3529 			{
3530 			kernel_dtrsm_nn_ll_inv_4x4_vs_lib4(i, pA+i*sda, pD+j*ps, sdd, &alpha, pB+i*sdb+j*ps, pD+i*sdd+j*ps, pA+i*sda+i*ps, dA+i, m-i, n-j);
3531 			}
3532 		}
3533 	if(i<m)
3534 		{
3535 		goto left_4;
3536 		}
3537 #endif
3538 	// common return
3539 	return;
3540 
3541 #if defined(TARGET_X64_INTEL_HASWELL)
3542 	left_12:
3543 	j = 0;
3544 	for( ; j<n; j+=4)
3545 		{
3546 		kernel_dtrsm_nn_ll_inv_12x4_vs_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, &alpha, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda, dA+i, m-i, n-j);
3547 		}
3548 	return;
3549 #endif
3550 
3551 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
3552 	left_8:
3553 	j = 0;
3554 	for( ; j<n; j+=4)
3555 		{
3556 		kernel_dtrsm_nn_ll_inv_8x4_vs_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, &alpha, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda, dA+i, m-i, n-j);
3557 		}
3558 	return;
3559 #endif
3560 
3561 	left_4:
3562 	j = 0;
3563 	for( ; j<n; j+=4)
3564 		{
3565 		kernel_dtrsm_nn_ll_inv_4x4_vs_lib4(i, pA+i*sda, pD+j*ps, sdd, &alpha, pB+i*sdb+j*ps, pD+i*sdd+j*ps, pA+i*sda+i*ps, dA+i, m-i, n-j);
3566 		}
3567 	return;
3568 
3569 	}
3570 
3571 
3572 
3573 // dtrsm_llnu
blasfeo_dtrsm_llnu(int m,int n,double alpha,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dmat * sB,int bi,int bj,struct blasfeo_dmat * sD,int di,int dj)3574 void blasfeo_dtrsm_llnu(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, struct blasfeo_dmat *sD, int di, int dj)
3575 	{
3576 	// invalidate stored inverse diagonal of result matrix
3577 	sD->use_dA = 0;
3578 
3579 	if(ai!=0 | bi!=0 | di!=0)
3580 		{
3581 		printf("\nblasfeo_dtrsm_llnu: feature not implemented yet: ai=%d, bi=%d, di=%d\n", ai, bi, di);
3582 		exit(1);
3583 		}
3584 
3585 	const int ps = 4;
3586 
3587 	// TODO alpha
3588 	int sda = sA->cn;
3589 	int sdb = sB->cn;
3590 	int sdd = sD->cn;
3591 	double *pA = sA->pA + aj*ps;
3592 	double *pB = sB->pA + bj*ps;
3593 	double *pD = sD->pA + dj*ps;
3594 
3595 	if(m<=0 || n<=0)
3596 		return;
3597 
3598 	int i, j;
3599 
3600 	i = 0;
3601 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
3602 	for( ; i<m-11; i+=12)
3603 		{
3604 		j = 0;
3605 		for( ; j<n-3; j+=4)
3606 			{
3607 			kernel_dtrsm_nn_ll_one_12x4_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, &alpha, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda);
3608 			}
3609 		if(j<n)
3610 			{
3611 			kernel_dtrsm_nn_ll_one_12x4_vs_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, &alpha, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda, m-i, n-j);
3612 			}
3613 		}
3614 	if(i<m)
3615 		{
3616 		if(m-i<=4)
3617 			{
3618 			goto left_4;
3619 			}
3620 		if(m-i<=8)
3621 			{
3622 			goto left_8;
3623 			}
3624 		else
3625 			{
3626 			goto left_12;
3627 			}
3628 		}
3629 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57)
3630 	for( ; i<m-7; i+=8)
3631 		{
3632 		j = 0;
3633 		for( ; j<n-3; j+=4)
3634 			{
3635 			kernel_dtrsm_nn_ll_one_8x4_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, &alpha, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda);
3636 			}
3637 		if(j<n)
3638 			{
3639 			kernel_dtrsm_nn_ll_one_8x4_vs_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, &alpha, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda, m-i, n-j);
3640 			}
3641 		}
3642 	if(i<m)
3643 		{
3644 		if(m-i<=4)
3645 			{
3646 			goto left_4;
3647 			}
3648 		else
3649 			{
3650 			goto left_8;
3651 			}
3652 		}
3653 #else
3654 	for( ; i<m-3; i+=4)
3655 		{
3656 		j = 0;
3657 		for( ; j<n-3; j+=4)
3658 			{
3659 			kernel_dtrsm_nn_ll_one_4x4_lib4(i, pA+i*sda, pD+j*ps, sdd, &alpha, pB+i*sdb+j*ps, pD+i*sdd+j*ps, pA+i*sda+i*ps);
3660 			}
3661 		if(j<n)
3662 			{
3663 			kernel_dtrsm_nn_ll_one_4x4_vs_lib4(i, pA+i*sda, pD+j*ps, sdd, &alpha, pB+i*sdb+j*ps, pD+i*sdd+j*ps, pA+i*sda+i*ps, m-i, n-j);
3664 			}
3665 		}
3666 	if(i<m)
3667 		{
3668 		goto left_4;
3669 		}
3670 #endif
3671 	// common return
3672 	return;
3673 
3674 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
3675 	left_12:
3676 	j = 0;
3677 	for( ; j<n; j+=4)
3678 		{
3679 		kernel_dtrsm_nn_ll_one_12x4_vs_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, &alpha, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda, m-i, n-j);
3680 		}
3681 	return;
3682 #endif
3683 
3684 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A57) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
3685 	left_8:
3686 	j = 0;
3687 	for( ; j<n; j+=4)
3688 		{
3689 		kernel_dtrsm_nn_ll_one_8x4_vs_lib4(i, pA+i*sda, sda, pD+j*ps, sdd, &alpha, pB+i*sdb+j*ps, sdb, pD+i*sdd+j*ps, sdd, pA+i*sda+i*ps, sda, m-i, n-j);
3690 		}
3691 	return;
3692 #endif
3693 
3694 	left_4:
3695 	j = 0;
3696 	for( ; j<n; j+=4)
3697 		{
3698 		kernel_dtrsm_nn_ll_one_4x4_vs_lib4(i, pA+i*sda, pD+j*ps, sdd, &alpha, pB+i*sdb+j*ps, pD+i*sdd+j*ps, pA+i*sda+i*ps, m-i, n-j);
3699 		}
3700 	return;
3701 
3702 	}
3703 
3704 
3705 
3706 // dtrsm_lltn
blasfeo_dtrsm_lltn(int m,int n,double alpha,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dmat * sB,int bi,int bj,struct blasfeo_dmat * sD,int di,int dj)3707 void blasfeo_dtrsm_lltn(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, struct blasfeo_dmat *sD, int di, int dj)
3708 	{
3709 #ifndef BENCHMARKS_MODE
3710 	printf("\nblasfeo_dtrsm_lltn: feature not implemented yet\n");
3711 	exit(1);
3712 #endif
3713 	return;
3714 	}
3715 
3716 
3717 
3718 // dtrsm_lltu
blasfeo_dtrsm_lltu(int m,int n,double alpha,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dmat * sB,int bi,int bj,struct blasfeo_dmat * sD,int di,int dj)3719 void blasfeo_dtrsm_lltu(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, struct blasfeo_dmat *sD, int di, int dj)
3720 	{
3721 #ifndef BENCHMARKS_MODE
3722 	printf("\nblasfeo_dtrsm_lltu: feature not implemented yet\n");
3723 	exit(1);
3724 #endif
3725 	return;
3726 	}
3727 
3728 
3729 
3730 // dtrsm_lunn
blasfeo_dtrsm_lunn(int m,int n,double alpha,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dmat * sB,int bi,int bj,struct blasfeo_dmat * sD,int di,int dj)3731 void blasfeo_dtrsm_lunn(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, struct blasfeo_dmat *sD, int di, int dj)
3732 	{
3733 
3734 	if(m<=0 || n<=0)
3735 		return;
3736 
3737 	if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
3738 		{
3739 		printf("\nblasfeo_dtrsm_lunn: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
3740 		exit(1);
3741 		}
3742 
3743 	// invalidate stored inverse diagonal of result matrix
3744 	sD->use_dA = 0;
3745 
3746 	const int ps = 4;
3747 	// TODO alpha
3748 	int sda = sA->cn;
3749 	int sdb = sB->cn;
3750 	int sdd = sD->cn;
3751 	double *pA = sA->pA + aj*ps;
3752 	double *pB = sB->pA + bj*ps;
3753 	double *pD = sD->pA + dj*ps;
3754 	double *dA = sA->dA;
3755 	int ii;
3756 
3757 	int i, j, idx;
3758 //	double *dummy;
3759 
3760 	if(ai==0 & aj==0)
3761 		{
3762 		// recompute diagonal if size of operation grows
3763 		if(sA->use_dA<m)
3764 			{
3765 			ddiaex_lib(m, 1.0, ai, pA, sda, dA);
3766 			for(ii=0; ii<m; ii++)
3767 				dA[ii] = 1.0 / dA[ii];
3768 			sA->use_dA = m;
3769 			}
3770 		}
3771 	// if submatrix recompute diagonal
3772 	else
3773 		{
3774 		ddiaex_lib(m, 1.0, ai, pA, sda, dA);
3775 		for(ii=0; ii<m; ii++)
3776 			dA[ii] = 1.0 / dA[ii];
3777 		sA->use_dA = 0;
3778 		}
3779 
3780 	i = 0;
3781 	int rm = m%4;
3782 	if(rm>0)
3783 		{
3784 		// TODO code expliticly the final case
3785 		idx = m-rm; // position of the part to do
3786 		j = 0;
3787 		for( ; j<n; j+=4)
3788 			{
3789 //			kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(0, dummy, dummy, 0, pB+idx*sdb+j*ps, pD+idx*sdd+j*ps, pA+idx*sda+idx*ps, dA+idx, rm, n-j);
3790 			// XXX pA & pD are dummy and should not be used internally !!!
3791 			kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(0, pA, pD, sdd, pB+idx*sdb+j*ps, pD+idx*sdd+j*ps, pA+idx*sda+idx*ps, dA+idx, rm, n-j);
3792 			}
3793 		// TODO
3794 		i += rm;
3795 		}
3796 //	int em = m-rm;
3797 #if defined(TARGET_X64_INTEL_HASWELL)
3798 	for( ; i<m-8; i+=12)
3799 		{
3800 		idx = m-i; // position of already done part
3801 		j = 0;
3802 		for( ; j<n-3; j+=4)
3803 			{
3804 			kernel_dtrsm_nn_lu_inv_12x4_lib4(i, pA+(idx-12)*sda+idx*ps, sda, pD+idx*sdd+j*ps, sdd, pB+(idx-12)*sdb+j*ps, sdb, pD+(idx-12)*sdd+j*ps, sdd, pA+(idx-12)*sda+(idx-12)*ps, sda, dA+(idx-12));
3805 			}
3806 		if(j<n)
3807 			{
3808 			kernel_dtrsm_nn_lu_inv_12x4_vs_lib4(i, pA+(idx-12)*sda+idx*ps, sda, pD+idx*sdd+j*ps, sdd, pB+(idx-12)*sdb+j*ps, sdb, pD+(idx-12)*sdd+j*ps, sdd, pA+(idx-12)*sda+(idx-12)*ps, sda, dA+(idx-12), 12, n-j);
3809 //			kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(i, pA+(idx-4)*sda+idx*ps, pD+idx*sdd+j*ps, sdd, pB+(idx-4)*sdb+j*ps, pD+(idx-4)*sdd+j*ps, pA+(idx-4)*sda+(idx-4)*ps, dA+(idx-4), 4, n-j);
3810 //			kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(i+4, pA+(idx-8)*sda+(idx-4)*ps, pD+(idx-4)*sdd+j*ps, sdd, pB+(idx-8)*sdb+j*ps, pD+(idx-8)*sdd+j*ps, pA+(idx-8)*sda+(idx-8)*ps, dA+(idx-8), 4, n-j);
3811 //			kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(i+8, pA+(idx-12)*sda+(idx-8)*ps, pD+(idx-8)*sdd+j*ps, sdd, pB+(idx-12)*sdb+j*ps, pD+(idx-12)*sdd+j*ps, pA+(idx-12)*sda+(idx-12)*ps, dA+(idx-12), 4, n-j);
3812 			}
3813 		}
3814 #endif
3815 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
3816 	for( ; i<m-4; i+=8)
3817 		{
3818 		idx = m-i; // position of already done part
3819 		j = 0;
3820 		for( ; j<n-3; j+=4)
3821 			{
3822 			kernel_dtrsm_nn_lu_inv_8x4_lib4(i, pA+(idx-8)*sda+idx*ps, sda, pD+idx*sdd+j*ps, sdd, pB+(idx-8)*sdb+j*ps, sdb, pD+(idx-8)*sdd+j*ps, sdd, pA+(idx-8)*sda+(idx-8)*ps, sda, dA+(idx-8));
3823 			}
3824 		if(j<n)
3825 			{
3826 			kernel_dtrsm_nn_lu_inv_8x4_vs_lib4(i, pA+(idx-8)*sda+idx*ps, sda, pD+idx*sdd+j*ps, sdd, pB+(idx-8)*sdb+j*ps, sdb, pD+(idx-8)*sdd+j*ps, sdd, pA+(idx-8)*sda+(idx-8)*ps, sda, dA+(idx-8), 8, n-j);
3827 //			kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(i, pA+(idx-4)*sda+idx*ps, pD+idx*sdd+j*ps, sdd, pB+(idx-4)*sdb+j*ps, pD+(idx-4)*sdd+j*ps, pA+(idx-4)*sda+(idx-4)*ps, dA+(idx-4), 4, n-j);
3828 //			kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(i+4, pA+(idx-8)*sda+(idx-4)*ps, pD+(idx-4)*sdd+j*ps, sdd, pB+(idx-8)*sdb+j*ps, pD+(idx-8)*sdd+j*ps, pA+(idx-8)*sda+(idx-8)*ps, dA+(idx-8), 4, n-j);
3829 			}
3830 		}
3831 #endif
3832 	for( ; i<m; i+=4)
3833 		{
3834 		idx = m-i; // position of already done part
3835 		j = 0;
3836 		for( ; j<n-3; j+=4)
3837 			{
3838 			kernel_dtrsm_nn_lu_inv_4x4_lib4(i, pA+(idx-4)*sda+idx*ps, pD+idx*sdd+j*ps, sdd, pB+(idx-4)*sdb+j*ps, pD+(idx-4)*sdd+j*ps, pA+(idx-4)*sda+(idx-4)*ps, dA+(idx-4));
3839 			}
3840 		if(j<n)
3841 			{
3842 			kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(i, pA+(idx-4)*sda+idx*ps, pD+idx*sdd+j*ps, sdd, pB+(idx-4)*sdb+j*ps, pD+(idx-4)*sdd+j*ps, pA+(idx-4)*sda+(idx-4)*ps, dA+(idx-4), 4, n-j);
3843 			}
3844 		}
3845 
3846 	// common return
3847 	return;
3848 
3849 	}
3850 
3851 
3852 
3853 // dtrsm_lunu
blasfeo_dtrsm_lunu(int m,int n,double alpha,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dmat * sB,int bi,int bj,struct blasfeo_dmat * sD,int di,int dj)3854 void blasfeo_dtrsm_lunu(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, struct blasfeo_dmat *sD, int di, int dj)
3855 	{
3856 
3857 	if(m<=0 || n<=0)
3858 		return;
3859 
3860 	if(ai!=0 | bi!=0 | di!=0 | alpha!=1.0)
3861 		{
3862 		printf("\nblasfeo_dtrsm_lunu: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
3863 		exit(1);
3864 		}
3865 
3866 	// invalidate stored inverse diagonal of result matrix
3867 	sD->use_dA = 0;
3868 
3869 	const int ps = 4;
3870 	// TODO alpha
3871 	int sda = sA->cn;
3872 	int sdb = sB->cn;
3873 	int sdd = sD->cn;
3874 	double *pA = sA->pA + aj*ps;
3875 	double *pB = sB->pA + bj*ps;
3876 	double *pD = sD->pA + dj*ps;
3877 	int ii;
3878 
3879 	int i, j, idx;
3880 //	double *dummy;
3881 
3882 	i = 0;
3883 	int rm = m%4;
3884 	if(rm>0)
3885 		{
3886 		// TODO code expliticly the final case
3887 		idx = m-rm; // position of the part to do
3888 		j = 0;
3889 		for( ; j<n; j+=4)
3890 			{
3891 //			kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(0, dummy, dummy, 0, pB+idx*sdb+j*ps, pD+idx*sdd+j*ps, pA+idx*sda+idx*ps, dA+idx, rm, n-j);
3892 			// XXX pA & pD are dummy and should not be used internally !!!
3893 			kernel_dtrsm_nn_lu_one_4x4_vs_lib4(0, pA, pD, sdd, pB+idx*sdb+j*ps, pD+idx*sdd+j*ps, pA+idx*sda+idx*ps, rm, n-j);
3894 			}
3895 		// TODO
3896 		i += rm;
3897 		}
3898 //	int em = m-rm;
3899 #if 0//defined(TARGET_X64_INTEL_HASWELL)
3900 	for( ; i<m-8; i+=12)
3901 		{
3902 		idx = m-i; // position of already done part
3903 		j = 0;
3904 		for( ; j<n-3; j+=4)
3905 			{
3906 			kernel_dtrsm_nn_lu_one_12x4_lib4(i, pA+(idx-12)*sda+idx*ps, sda, pD+idx*sdd+j*ps, sdd, pB+(idx-12)*sdb+j*ps, sdb, pD+(idx-12)*sdd+j*ps, sdd, pA+(idx-12)*sda+(idx-12)*ps, sda);
3907 			}
3908 		if(j<n)
3909 			{
3910 			kernel_dtrsm_nn_lu_one_12x4_vs_lib4(i, pA+(idx-12)*sda+idx*ps, sda, pD+idx*sdd+j*ps, sdd, pB+(idx-12)*sdb+j*ps, sdb, pD+(idx-12)*sdd+j*ps, sdd, pA+(idx-12)*sda+(idx-12)*ps, sda, 12, n-j);
3911 //			kernel_dtrsm_nn_lu_one_4x4_vs_lib4(i, pA+(idx-4)*sda+idx*ps, pD+idx*sdd+j*ps, sdd, pB+(idx-4)*sdb+j*ps, pD+(idx-4)*sdd+j*ps, pA+(idx-4)*sda+(idx-4)*ps, 4, n-j);
3912 //			kernel_dtrsm_nn_lu_one_4x4_vs_lib4(i+4, pA+(idx-8)*sda+(idx-4)*ps, pD+(idx-4)*sdd+j*ps, sdd, pB+(idx-8)*sdb+j*ps, pD+(idx-8)*sdd+j*ps, pA+(idx-8)*sda+(idx-8)*ps, 4, n-j);
3913 //			kernel_dtrsm_nn_lu_one_4x4_vs_lib4(i+8, pA+(idx-12)*sda+(idx-8)*ps, pD+(idx-8)*sdd+j*ps, sdd, pB+(idx-12)*sdb+j*ps, pD+(idx-12)*sdd+j*ps, pA+(idx-12)*sda+(idx-12)*ps, 4, n-j);
3914 			}
3915 		}
3916 #endif
3917 #if 0//defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
3918 	for( ; i<m-4; i+=8)
3919 		{
3920 		idx = m-i; // position of already done part
3921 		j = 0;
3922 		for( ; j<n-3; j+=4)
3923 			{
3924 			kernel_dtrsm_nn_lu_one_8x4_lib4(i, pA+(idx-8)*sda+idx*ps, sda, pD+idx*sdd+j*ps, sdd, pB+(idx-8)*sdb+j*ps, sdb, pD+(idx-8)*sdd+j*ps, sdd, pA+(idx-8)*sda+(idx-8)*ps, sda));
3925 			}
3926 		if(j<n)
3927 			{
3928 			kernel_dtrsm_nn_lu_one_8x4_vs_lib4(i, pA+(idx-8)*sda+idx*ps, sda, pD+idx*sdd+j*ps, sdd, pB+(idx-8)*sdb+j*ps, sdb, pD+(idx-8)*sdd+j*ps, sdd, pA+(idx-8)*sda+(idx-8)*ps, sda, 8, n-j);
3929 //			kernel_dtrsm_nn_lu_one_4x4_vs_lib4(i, pA+(idx-4)*sda+idx*ps, pD+idx*sdd+j*ps, sdd, pB+(idx-4)*sdb+j*ps, pD+(idx-4)*sdd+j*ps, pA+(idx-4)*sda+(idx-4)*ps, 4, n-j);
3930 //			kernel_dtrsm_nn_lu_one_4x4_vs_lib4(i+4, pA+(idx-8)*sda+(idx-4)*ps, pD+(idx-4)*sdd+j*ps, sdd, pB+(idx-8)*sdb+j*ps, pD+(idx-8)*sdd+j*ps, pA+(idx-8)*sda+(idx-8)*ps, 4, n-j);
3931 			}
3932 		}
3933 #endif
3934 	for( ; i<m; i+=4)
3935 		{
3936 		idx = m-i; // position of already done part
3937 		j = 0;
3938 		for( ; j<n-3; j+=4)
3939 			{
3940 			kernel_dtrsm_nn_lu_one_4x4_lib4(i, pA+(idx-4)*sda+idx*ps, pD+idx*sdd+j*ps, sdd, pB+(idx-4)*sdb+j*ps, pD+(idx-4)*sdd+j*ps, pA+(idx-4)*sda+(idx-4)*ps);
3941 			}
3942 		if(j<n)
3943 			{
3944 			kernel_dtrsm_nn_lu_one_4x4_vs_lib4(i, pA+(idx-4)*sda+idx*ps, pD+idx*sdd+j*ps, sdd, pB+(idx-4)*sdb+j*ps, pD+(idx-4)*sdd+j*ps, pA+(idx-4)*sda+(idx-4)*ps, 4, n-j);
3945 			}
3946 		}
3947 
3948 	// common return
3949 	return;
3950 
3951 	}
3952 
3953 
3954 
3955 // dtrsm_lutn
blasfeo_dtrsm_lutn(int m,int n,double alpha,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dmat * sB,int bi,int bj,struct blasfeo_dmat * sD,int di,int dj)3956 void blasfeo_dtrsm_lutn(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, struct blasfeo_dmat *sD, int di, int dj)
3957 	{
3958 #ifndef BENCHMARKS_MODE
3959 	printf("\nblasfeo_dtrsm_lutn: feature not implemented yet\n");
3960 	exit(1);
3961 #endif
3962 	return;
3963 	}
3964 
3965 
3966 
3967 // dtrsm_lutu
blasfeo_dtrsm_lutu(int m,int n,double alpha,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dmat * sB,int bi,int bj,struct blasfeo_dmat * sD,int di,int dj)3968 void blasfeo_dtrsm_lutu(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, struct blasfeo_dmat *sD, int di, int dj)
3969 	{
3970 #ifndef BENCHMARKS_MODE
3971 	printf("\nblasfeo_dtrsm_lutu: feature not implemented yet\n");
3972 	exit(1);
3973 #endif
3974 	return;
3975 	}
3976 
3977 
3978 
3979 // dtrsm_rlnn
blasfeo_dtrsm_rlnn(int m,int n,double alpha,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dmat * sB,int bi,int bj,struct blasfeo_dmat * sD,int di,int dj)3980 void blasfeo_dtrsm_rlnn(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, struct blasfeo_dmat *sD, int di, int dj)
3981 	{
3982 #ifndef BENCHMARKS_MODE
3983 	printf("\nblasfeo_dtrsm_rlnn: feature not implemented yet\n");
3984 	exit(1);
3985 #endif
3986 	return;
3987 	}
3988 
3989 
3990 
3991 // dtrsm_rlnu
blasfeo_dtrsm_rlnu(int m,int n,double alpha,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dmat * sB,int bi,int bj,struct blasfeo_dmat * sD,int di,int dj)3992 void blasfeo_dtrsm_rlnu(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, struct blasfeo_dmat *sD, int di, int dj)
3993 	{
3994 #ifndef BENCHMARKS_MODE
3995 	printf("\nblasfeo_dtrsm_rlnu: feature not implemented yet\n");
3996 	exit(1);
3997 #endif
3998 	return;
3999 	}
4000 
4001 
4002 
4003 // dtrsm_right_lower_transposed_notunit
blasfeo_dtrsm_rltn(int m,int n,double alpha,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dmat * sB,int bi,int bj,struct blasfeo_dmat * sD,int di,int dj)4004 void blasfeo_dtrsm_rltn(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, struct blasfeo_dmat *sD, int di, int dj)
4005 	{
4006 
4007 	if(m<=0 || n<=0)
4008 		return;
4009 
4010 	const int ps = 4;
4011 
4012 	// invalidate stored inverse diagonal of result matrix
4013 	sD->use_dA = 0;
4014 
4015 	// TODO alpha !!!!!
4016 
4017 	int sda = sA->cn;
4018 	int sdb = sB->cn;
4019 	int sdd = sD->cn;
4020 	int bir = bi & (ps-1);
4021 	int dir = di & (ps-1);
4022 	double *pA = sA->pA + aj*ps;
4023 	double *pB = sB->pA + bj*ps + (bi-bir)*sdb;
4024 	double *pD = sD->pA + dj*ps + (di-dir)*sdd;
4025 	double *dA = sA->dA;
4026 
4027 	if(ai!=0 | bir!=0 | dir!=0 | alpha!=1.0)
4028 		{
4029 		printf("\nblasfeo_dtrsm_rltn: feature not implemented yet: ai=%d, bi=%d, di=%d, alpha=%f\n", ai, bi, di, alpha);
4030 		exit(1);
4031 		}
4032 
4033 	int i, j;
4034 
4035 	// TODO to avoid touching A, better temporarely use sD.dA ?????
4036 	if(ai==0 & aj==0)
4037 		{
4038 		if(sA->use_dA<n)
4039 			{
4040 			ddiaex_lib(n, 1.0, ai, pA, sda, dA);
4041 			for(i=0; i<n; i++)
4042 				dA[i] = 1.0 / dA[i];
4043 			sA->use_dA = n;
4044 			}
4045 		}
4046 	else
4047 		{
4048 		ddiaex_lib(n, 1.0, ai, pA, sda, dA);
4049 		for(i=0; i<n; i++)
4050 			dA[i] = 1.0 / dA[i];
4051 		sA->use_dA = 0;
4052 		}
4053 
4054 	i = 0;
4055 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
4056 	for(; i<m-11; i+=12)
4057 		{
4058 		j = 0;
4059 		for(; j<n-3; j+=4)
4060 			{
4061 			kernel_dtrsm_nt_rl_inv_12x4_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &alpha, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j]);
4062 			}
4063 		if(j<n)
4064 			{
4065 			kernel_dtrsm_nt_rl_inv_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &alpha, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j], m-i, n-j);
4066 			}
4067 		}
4068 	if(m>i)
4069 		{
4070 		if(m-i<=4)
4071 			{
4072 			goto left_4;
4073 			}
4074 		else if(m-i<=8)
4075 			{
4076 			goto left_8;
4077 			}
4078 		else
4079 			{
4080 			goto left_12;
4081 			}
4082 		}
4083 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57)
4084 	for(; i<m-7; i+=8)
4085 		{
4086 		j = 0;
4087 		for(; j<n-3; j+=4)
4088 			{
4089 			kernel_dtrsm_nt_rl_inv_8x4_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &alpha, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j]);
4090 			}
4091 		if(j<n)
4092 			{
4093 			kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &alpha, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j], m-i, n-j);
4094 			}
4095 		}
4096 	if(m>i)
4097 		{
4098 		if(m-i<=4)
4099 			{
4100 			goto left_4;
4101 			}
4102 		else
4103 			{
4104 			goto left_8;
4105 			}
4106 		}
4107 #elif defined(TARGET_X86_AMD_BARCELONA)
4108 	for(; i<m-3; i+=4)
4109 		{
4110 		j = 0;
4111 		for(; j<n-3; j+=4)
4112 			{
4113 			kernel_dtrsm_nt_rl_inv_4x2_lib4(j, &pD[i*sdd], &pA[j*sda], &alpha, &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], &dA[j]);
4114 			kernel_dtrsm_nt_rl_inv_4x2_lib4(j+2, &pD[i*sdd], &pA[j*sda+2], &alpha, &pB[(j+2)*ps+i*sdb], &pD[(j+2)*ps+i*sdd], &pA[(j+2)*ps+j*sda+2], &dA[j+2]);
4115 			}
4116 		if(j<n)
4117 			{
4118 			kernel_dtrsm_nt_rl_inv_4x2_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &alpha, &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], &dA[j], m-i, n-j);
4119 			if(j<n-2)
4120 				kernel_dtrsm_nt_rl_inv_4x2_vs_lib4(j+2, &pD[i*sdd], &pA[j*sda+2], &alpha, &pB[(j+2)*ps+i*sdb], &pD[(j+2)*ps+i*sdd], &pA[(j+2)*ps+j*sda+2], &dA[j+2], m-i, n-(j+2));
4121 			}
4122 		}
4123 	if(m>i)
4124 		{
4125 		goto left_4;
4126 		}
4127 #else
4128 	for(; i<m-3; i+=4)
4129 		{
4130 		j = 0;
4131 		for(; j<n-3; j+=4)
4132 			{
4133 			kernel_dtrsm_nt_rl_inv_4x4_lib4(j, &pD[i*sdd], &pA[j*sda], &alpha, &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], &dA[j]);
4134 			}
4135 		if(j<n)
4136 			{
4137 			kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &alpha, &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], &dA[j], m-i, n-j);
4138 			}
4139 		}
4140 	if(m>i)
4141 		{
4142 		goto left_4;
4143 		}
4144 #endif
4145 
4146 	// common return if i==m
4147 	return;
4148 
4149 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
4150 	left_12:
4151 	j = 0;
4152 	for(; j<n; j+=4)
4153 		{
4154 		kernel_dtrsm_nt_rl_inv_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &alpha, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j], m-i, n-j);
4155 		}
4156 	return;
4157 #endif
4158 
4159 #if defined(TARGET_X64_INTEL_HASWELL)
4160 	left_8:
4161 	j = 0;
4162 	for(; j<n-8; j+=12)
4163 		{
4164 		kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], sda, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], sda, &dA[j], m-i, n-j);
4165 		kernel_dtrsm_nt_rl_inv_8x8u_vs_lib4((j+4), &pD[i*sdd], sdd, &pA[(j+4)*sda], sda, &pB[(j+4)*ps+i*sdb], sdb, &pD[(j+4)*ps+i*sdd], sdd, &pA[(j+4)*ps+(j+4)*sda], sda, &dA[(j+4)], m-i, n-(j+4));
4166 		}
4167 	if(j<n-4)
4168 		{
4169 		kernel_dtrsm_nt_rl_inv_8x8l_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], sda, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], sda, &dA[j], m-i, n-j);
4170 		kernel_dtrsm_nt_rl_inv_4x4_vs_lib4((j+4), &pD[i*sdd], &pA[(j+4)*sda], &alpha, &pB[(j+4)*ps+i*sdb], &pD[(j+4)*ps+i*sdd], &pA[(j+4)*ps+(j+4)*sda], &dA[(j+4)], m-i, n-(j+4));
4171 		j += 8;
4172 		}
4173 	else if(j<n)
4174 		{
4175 		kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &alpha, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j], m-i, n-j);
4176 		j += 4;
4177 		}
4178 	return;
4179 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
4180 	left_8:
4181 	j = 0;
4182 	for(; j<n; j+=4)
4183 		{
4184 		kernel_dtrsm_nt_rl_inv_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &alpha, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], &dA[j], m-i, n-j);
4185 		}
4186 	return;
4187 #endif
4188 
4189 #if defined(TARGET_X64_INTEL_HASWELL)
4190 	left_4:
4191 	j = 0;
4192 	for(; j<n-8; j+=12)
4193 		{
4194 		kernel_dtrsm_nt_rl_inv_4x12_vs_lib4(j, &pD[i*sdd], &pA[j*sda], sda, &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], sda, &dA[j], m-i, n-j);
4195 		}
4196 	if(j<n-4)
4197 		{
4198 		kernel_dtrsm_nt_rl_inv_4x8_vs_lib4(j, &pD[i*sdd], &pA[j*sda], sda, &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], sda, &dA[j], m-i, n-j);
4199 		j += 8;
4200 		}
4201 	else if(j<n)
4202 		{
4203 		kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &alpha, &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], &dA[j], m-i, n-j);
4204 		j += 4;
4205 		}
4206 	return;
4207 #elif defined(TARGET_X86_AMD_BARCELONA)
4208 	left_4:
4209 	j = 0;
4210 	for(; j<n; j+=4)
4211 		{
4212 		kernel_dtrsm_nt_rl_inv_4x2_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &alpha, &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], &dA[j], m-i, n-j);
4213 		if(j<n-2)
4214 		{
4215 			kernel_dtrsm_nt_rl_inv_4x2_vs_lib4(j+2, &pD[i*sdd], &pA[j*sda+2], &alpha, &pB[(j+2)*ps+i*sdb], &pD[(j+2)*ps+i*sdd], &pA[(j+2)*ps+j*sda+2], &dA[j+2], m-i, n-(j+2));
4216 			}
4217 		}
4218 	return;
4219 #else
4220 	left_4:
4221 	j = 0;
4222 	for(; j<n; j+=4)
4223 		{
4224 		kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &alpha, &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], &dA[j], m-i, n-j);
4225 		}
4226 	return;
4227 #endif
4228 	}
4229 
4230 
4231 
4232 // dtrsm_right_lower_transposed_unit
blasfeo_dtrsm_rltu(int m,int n,double alpha,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dmat * sB,int bi,int bj,struct blasfeo_dmat * sD,int di,int dj)4233 void blasfeo_dtrsm_rltu(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, struct blasfeo_dmat *sD, int di, int dj)
4234 	{
4235 
4236 	if(m<=0 || n<=0)
4237 		return;
4238 
4239 	if(ai!=0 | bi!=0 | di!=0)
4240 		{
4241 		printf("\nblasfeo_dtrsm_rltu: feature not implemented yet: ai=%d, bi=%d, di=%d\n", ai, bi, di);
4242 		exit(1);
4243 		}
4244 
4245 	// invalidate stored inverse diagonal of result matrix
4246 	sD->use_dA = 0;
4247 
4248 	const int ps = 4;
4249 
4250 	int sda = sA->cn;
4251 	int sdb = sB->cn;
4252 	int sdd = sD->cn;
4253 	double *pA = sA->pA + aj*ps;
4254 	double *pB = sB->pA + bj*ps;
4255 	double *pD = sD->pA + dj*ps;
4256 
4257 	int i, j;
4258 
4259 	i = 0;
4260 
4261 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
4262 	for(; i<m-11; i+=12)
4263 		{
4264 		j = 0;
4265 		for(; j<n-3; j+=4)
4266 			{
4267 			kernel_dtrsm_nt_rl_one_12x4_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &alpha, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda]);
4268 			}
4269 		if(j<n)
4270 			{
4271 			kernel_dtrsm_nt_rl_one_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &alpha, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], m-i, n-j);
4272 			}
4273 		}
4274 	if(m>i)
4275 		{
4276 		if(m-i<=4)
4277 			{
4278 			goto left_4;
4279 			}
4280 		else if(m-i<=8)
4281 			{
4282 			goto left_8;
4283 			}
4284 		else
4285 			{
4286 			goto left_12;
4287 			}
4288 		}
4289 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57)
4290 	for(; i<m-7; i+=8)
4291 		{
4292 		j = 0;
4293 		for(; j<n-3; j+=4)
4294 			{
4295 			kernel_dtrsm_nt_rl_one_8x4_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &alpha, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda]);
4296 			}
4297 		if(j<n)
4298 			{
4299 			kernel_dtrsm_nt_rl_one_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &alpha, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], m-i, n-j);
4300 			}
4301 		}
4302 	if(m>i)
4303 		{
4304 		if(m-i<=4)
4305 			{
4306 			goto left_4;
4307 			}
4308 		else
4309 			{
4310 			goto left_8;
4311 			}
4312 		}
4313 #else
4314 	for(; i<m-3; i+=4)
4315 		{
4316 		j = 0;
4317 		for(; j<n-3; j+=4)
4318 			{
4319 			kernel_dtrsm_nt_rl_one_4x4_lib4(j, &pD[i*sdd], &pA[j*sda], &alpha, &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda]);
4320 			}
4321 		if(j<n)
4322 			{
4323 			kernel_dtrsm_nt_rl_one_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &alpha, &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], m-i, n-j);
4324 			}
4325 		}
4326 	if(m>i)
4327 		{
4328 		goto left_4;
4329 		}
4330 #endif
4331 
4332 	// common return if i==m
4333 	return;
4334 
4335 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
4336 	left_12:
4337 	j = 0;
4338 	for(; j<n; j+=4)
4339 		{
4340 		kernel_dtrsm_nt_rl_one_12x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &alpha, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], m-i, n-j);
4341 		}
4342 	return;
4343 #endif
4344 
4345 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A57) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
4346 	left_8:
4347 	j = 0;
4348 	for(; j<n; j+=4)
4349 		{
4350 		kernel_dtrsm_nt_rl_one_8x4_vs_lib4(j, &pD[i*sdd], sdd, &pA[j*sda], &alpha, &pB[j*ps+i*sdb], sdb, &pD[j*ps+i*sdd], sdd, &pA[j*ps+j*sda], m-i, n-j);
4351 		}
4352 	return;
4353 #endif
4354 
4355 	left_4:
4356 	j = 0;
4357 	for(; j<n; j+=4)
4358 		{
4359 		kernel_dtrsm_nt_rl_one_4x4_vs_lib4(j, &pD[i*sdd], &pA[j*sda], &alpha, &pB[j*ps+i*sdb], &pD[j*ps+i*sdd], &pA[j*ps+j*sda], m-i, n-j);
4360 		}
4361 
4362 	return;
4363 
4364 	}
4365 
4366 
4367 
4368 // dtrsm_runn
blasfeo_dtrsm_runn(int m,int n,double alpha,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dmat * sB,int bi,int bj,struct blasfeo_dmat * sD,int di,int dj)4369 void blasfeo_dtrsm_runn(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, struct blasfeo_dmat *sD, int di, int dj)
4370 	{
4371 #ifndef BENCHMARKS_MODE
4372 	printf("\nblasfeo_dtrsm_runn: feature not implemented yet\n");
4373 	exit(1);
4374 #endif
4375 	return;
4376 	}
4377 
4378 
4379 
4380 // dtrsm_runu
blasfeo_dtrsm_runu(int m,int n,double alpha,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dmat * sB,int bi,int bj,struct blasfeo_dmat * sD,int di,int dj)4381 void blasfeo_dtrsm_runu(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, struct blasfeo_dmat *sD, int di, int dj)
4382 	{
4383 #ifndef BENCHMARKS_MODE
4384 	printf("\nblasfeo_dtrsm_runu: feature not implemented yet\n");
4385 	exit(1);
4386 #endif
4387 	return;
4388 	}
4389 
4390 
4391 
4392 // dtrsm_right_upper_transposed_notunit
blasfeo_dtrsm_rutn(int m,int n,double alpha,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dmat * sB,int bi,int bj,struct blasfeo_dmat * sD,int di,int dj)4393 void blasfeo_dtrsm_rutn(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, struct blasfeo_dmat *sD, int di, int dj)
4394 	{
4395 	if(ai!=0 | bi!=0 | di!=0)
4396 		{
4397 		printf("\nblasfeo_dtrsm_rutn: feature not implemented yet: ai=%d, bi=%d, di=%d\n", ai, bi, di);
4398 		exit(1);
4399 		}
4400 
4401 	// invalidate stored inverse diagonal of result matrix
4402 	sD->use_dA = 0;
4403 
4404 	const int ps = 4;
4405 
4406 	int sda = sA->cn;
4407 	int sdb = sB->cn;
4408 	int sdd = sD->cn;
4409 	double *pA = sA->pA + aj*ps;
4410 	double *pB = sB->pA + bj*ps;
4411 	double *pD = sD->pA + dj*ps;
4412 	double *dA = sA->dA;
4413 
4414 	int ii;
4415 
4416 	if(ai==0 & aj==0)
4417 		{
4418 		if(sA->use_dA<n)
4419 			{
4420 			ddiaex_lib(n, 1.0, ai, pA, sda, dA);
4421 			for(ii=0; ii<n; ii++)
4422 				dA[ii] = 1.0 / dA[ii];
4423 			sA->use_dA = n;
4424 			}
4425 		}
4426 	else
4427 		{
4428 		ddiaex_lib(n, 1.0, ai, pA, sda, dA);
4429 		for(ii=0; ii<n; ii++)
4430 			dA[ii] = 1.0 / dA[ii];
4431 		sA->use_dA = 0;
4432 		}
4433 //	dtrsm_nt_ru_inv_lib(m, n, pA, sda, dA, pB, sdb, pD, sdd);
4434 
4435 	if(m<=0 || n<=0)
4436 		return;
4437 
4438 	int i, j, idx;
4439 
4440 	int rn = n%4;
4441 
4442 	double *dummy = NULL;
4443 
4444 	i = 0;
4445 
4446 #if defined(TARGET_X64_INTEL_HASWELL)
4447 	for(; i<m-11; i+=12)
4448 		{
4449 		j = 0;
4450 		// clean at the end
4451 		if(rn>0)
4452 			{
4453 			idx = n-rn;
4454 			kernel_dtrsm_nt_ru_inv_12x4_vs_lib4(0, dummy, 0, dummy, &alpha, &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &dA[idx], m-i, rn);
4455 			j += rn;
4456 			}
4457 		for(; j<n; j+=4)
4458 			{
4459 			idx = n-j-4;
4460 			kernel_dtrsm_nt_ru_inv_12x4_lib4(j, &pD[i*sdd+(idx+4)*ps], sdd, &pA[idx*sda+(idx+4)*ps], &alpha, &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &dA[idx]);
4461 			}
4462 		}
4463 	if(m>i)
4464 		{
4465 		if(m-i<=4)
4466 			{
4467 			goto left_4;
4468 			}
4469 		else if(m-i<=8)
4470 			{
4471 			goto left_8;
4472 			}
4473 		else
4474 			{
4475 			goto left_12;
4476 			}
4477 		}
4478 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
4479 	for(; i<m-7; i+=8)
4480 		{
4481 		j = 0;
4482 		// clean at the end
4483 		if(rn>0)
4484 			{
4485 			idx = n-rn;
4486 			kernel_dtrsm_nt_ru_inv_8x4_vs_lib4(0, dummy, 0, dummy, &alpha, &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &dA[idx], m-i, rn);
4487 			j += rn;
4488 			}
4489 		for(; j<n; j+=4)
4490 			{
4491 			idx = n-j-4;
4492 			kernel_dtrsm_nt_ru_inv_8x4_lib4(j, &pD[i*sdd+(idx+4)*ps], sdd, &pA[idx*sda+(idx+4)*ps], &alpha, &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &dA[idx]);
4493 			}
4494 		}
4495 	if(m>i)
4496 		{
4497 		if(m-i<=4)
4498 			{
4499 			goto left_4;
4500 			}
4501 		else
4502 			{
4503 			goto left_8;
4504 			}
4505 		}
4506 #else
4507 	for(; i<m-3; i+=4)
4508 		{
4509 		j = 0;
4510 		// clean at the end
4511 		if(rn>0)
4512 			{
4513 			idx = n-rn;
4514 			kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(0, dummy, dummy, &alpha, &pB[i*sdb+idx*ps], &pD[i*sdd+idx*ps], &pA[idx*sda+idx*ps], &dA[idx], m-i, rn);
4515 			j += rn;
4516 			}
4517 		for(; j<n; j+=4)
4518 			{
4519 			idx = n-j-4;
4520 			kernel_dtrsm_nt_ru_inv_4x4_lib4(j, &pD[i*sdd+(idx+4)*ps], &pA[idx*sda+(idx+4)*ps], &alpha, &pB[i*sdb+idx*ps], &pD[i*sdd+idx*ps], &pA[idx*sda+idx*ps], &dA[idx]);
4521 			}
4522 		}
4523 	if(m>i)
4524 		{
4525 		goto left_4;
4526 		}
4527 #endif
4528 	// common return if i==m
4529 	return;
4530 
4531 #if defined(TARGET_X64_INTEL_HASWELL)
4532 	left_12:
4533 	j = 0;
4534 	// TODO
4535 	// clean at the end
4536 	if(rn>0)
4537 		{
4538 		idx = n-rn;
4539 		kernel_dtrsm_nt_ru_inv_12x4_vs_lib4(0, dummy, 0, dummy, &alpha, &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &dA[idx], m-i, rn);
4540 		j += rn;
4541 		}
4542 	for(; j<n; j+=4)
4543 		{
4544 		idx = n-j-4;
4545 		kernel_dtrsm_nt_ru_inv_12x4_vs_lib4(j, &pD[i*sdd+(idx+4)*ps], sdd, &pA[idx*sda+(idx+4)*ps], &alpha, &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &dA[idx], m-i, 4);
4546 		}
4547 	return;
4548 #endif
4549 
4550 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
4551 	left_8:
4552 	j = 0;
4553 	// TODO
4554 	// clean at the end
4555 	if(rn>0)
4556 		{
4557 		idx = n-rn;
4558 		kernel_dtrsm_nt_ru_inv_8x4_vs_lib4(0, dummy, 0, dummy, &alpha, &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &dA[idx], m-i, rn);
4559 		j += rn;
4560 		}
4561 	for(; j<n; j+=4)
4562 		{
4563 		idx = n-j-4;
4564 		kernel_dtrsm_nt_ru_inv_8x4_vs_lib4(j, &pD[i*sdd+(idx+4)*ps], sdd, &pA[idx*sda+(idx+4)*ps], &alpha, &pB[i*sdb+idx*ps], sdb, &pD[i*sdd+idx*ps], sdd, &pA[idx*sda+idx*ps], &dA[idx], m-i, 4);
4565 		}
4566 	return;
4567 #endif
4568 
4569 	left_4:
4570 	j = 0;
4571 	// TODO
4572 	// clean at the end
4573 	if(rn>0)
4574 		{
4575 		idx = n-rn;
4576 		kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(0, dummy, dummy, &alpha, &pB[i*sdb+idx*ps], &pD[i*sdd+idx*ps], &pA[idx*sda+idx*ps], &dA[idx], m-i, rn);
4577 		j += rn;
4578 		}
4579 	for(; j<n; j+=4)
4580 		{
4581 		idx = n-j-4;
4582 		kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(j, &pD[i*sdd+(idx+4)*ps], &pA[idx*sda+(idx+4)*ps], &alpha, &pB[i*sdb+idx*ps], &pD[i*sdd+idx*ps], &pA[idx*sda+idx*ps], &dA[idx], m-i, 4);
4583 		}
4584 	return;
4585 	}
4586 
4587 
4588 
4589 // dtrsm_rutu
blasfeo_dtrsm_rutu(int m,int n,double alpha,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dmat * sB,int bi,int bj,struct blasfeo_dmat * sD,int di,int dj)4590 void blasfeo_dtrsm_rutu(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, struct blasfeo_dmat *sD, int di, int dj)
4591 	{
4592 #ifndef BENCHMARKS_MODE
4593 	printf("\nblasfeo_dtrsm_rutu: feature not implemented yet\n");
4594 	exit(1);
4595 #endif
4596 	return;
4597 	}
4598 
4599 
4600 
4601 // dtrmm_right_upper_transposed_notunit (B, i.e. the first matrix, is triangular !!!)
blasfeo_dtrmm_rutn(int m,int n,double alpha,struct blasfeo_dmat * sB,int bi,int bj,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dmat * sD,int di,int dj)4602 void blasfeo_dtrmm_rutn(int m, int n, double alpha, struct blasfeo_dmat *sB, int bi, int bj, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sD, int di, int dj)
4603 	{
4604 	// invalidate stored inverse diagonal of result matrix
4605 	sD->use_dA = 0;
4606 
4607 	if(ai!=0 | bi!=0 | di!=0)
4608 		{
4609 		printf("\nblasfeo_dtrmm_rutn: feature not implemented yet: ai=%d, bi=%d, di=%d\n", ai, bi, di);
4610 		exit(1);
4611 		}
4612 
4613 	if(m<=0 || n<=0)
4614 		return;
4615 
4616 	const int ps = 4;
4617 
4618 	int sda = sA->cn;
4619 	int sdb = sB->cn;
4620 	int sdd = sD->cn;
4621 	double *pA = sA->pA + aj*ps;
4622 	double *pB = sB->pA + bj*ps;
4623 	double *pD = sD->pA + dj*ps;
4624 
4625 	int i, j;
4626 
4627 	i = 0;
4628 #if defined(TARGET_X64_INTEL_HASWELL)
4629 // XXX there is a bug here !!!!!!
4630 	for(; i<m-11; i+=12)
4631 		{
4632 		j = 0;
4633 		for(; j<n-3; j+=4)
4634 			{
4635 			kernel_dtrmm_nt_ru_12x4_lib4(n-j, &alpha, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], &pD[j*ps+i*sdd], sdd);
4636 			}
4637 		if(j<n)
4638 			{
4639 			kernel_dtrmm_nt_ru_12x4_vs_lib4(n-j, &alpha, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], &pD[j*ps+i*sdd], sdd, m-i, n-j);
4640 			}
4641 		}
4642 	if(i<m)
4643 		{
4644 		if(m-i<5)
4645 			{
4646 			goto left_4;
4647 			}
4648 		if(m-i<9)
4649 			{
4650 			goto left_8;
4651 			}
4652 		else
4653 			{
4654 			goto left_12;
4655 			}
4656 		}
4657 
4658 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
4659 	for(; i<m-7; i+=8)
4660 		{
4661 		j = 0;
4662 		for(; j<n-3; j+=4)
4663 			{
4664 			kernel_dtrmm_nt_ru_8x4_lib4(n-j, &alpha, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], &pD[j*ps+i*sdd], sdd);
4665 			}
4666 		if(j<n)
4667 			{
4668 			kernel_dtrmm_nt_ru_8x4_vs_lib4(n-j, &alpha, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], &pD[j*ps+i*sdd], sdd, m-i, n-j);
4669 			}
4670 		}
4671 	if(i<m)
4672 		{
4673 		if(m-i<5)
4674 			{
4675 			goto left_4;
4676 			}
4677 		else
4678 			{
4679 			goto left_8;
4680 			}
4681 		}
4682 
4683 #else
4684 	for(; i<m-3; i+=4)
4685 		{
4686 		j = 0;
4687 		for(; j<n-3; j+=4)
4688 			{
4689 			kernel_dtrmm_nt_ru_4x4_lib4(n-j, &alpha, &pA[j*ps+i*sda], &pB[j*ps+j*sdb], &pD[j*ps+i*sdd]);
4690 			}
4691 		if(j<n)
4692 			{
4693 			kernel_dtrmm_nt_ru_4x4_vs_lib4(n-j, &alpha, &pA[j*ps+i*sda], &pB[j*ps+j*sdb], &pD[j*ps+i*sdd], m-i, n-j);
4694 			}
4695 		}
4696 	if(i<m)
4697 		{
4698 		goto left_4;
4699 		}
4700 #endif
4701 
4702 	// common return
4703 	return;
4704 
4705 #if defined(TARGET_X64_INTEL_HASWELL)
4706 	// clean up
4707 	left_12:
4708 	j = 0;
4709 	for(; j<n; j+=4)
4710 		{
4711 		kernel_dtrmm_nt_ru_12x4_vs_lib4(n-j, &alpha, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], &pD[j*ps+i*sdd], sdd, m-i, n-j);
4712 		}
4713 	return;
4714 #endif
4715 
4716 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_HASWELL)
4717 	// clean up
4718 	left_8:
4719 	j = 0;
4720 	for(; j<n; j+=4)
4721 		{
4722 		kernel_dtrmm_nt_ru_8x4_vs_lib4(n-j, &alpha, &pA[j*ps+i*sda], sda, &pB[j*ps+j*sdb], &pD[j*ps+i*sdd], sdd, m-i, n-j);
4723 		}
4724 	return;
4725 #endif
4726 
4727 	left_4:
4728 	j = 0;
4729 	for(; j<n; j+=4)
4730 		{
4731 		kernel_dtrmm_nt_ru_4x4_vs_lib4(n-j, &alpha, &pA[j*ps+i*sda], &pB[j*ps+j*sdb], &pD[j*ps+i*sdd], m-i, n-j);
4732 		}
4733 	return;
4734 
4735 	}
4736 
4737 
4738 
4739 // dtrmm_right_lower_nottransposed_notunit (B, i.e. the first matrix, is triangular !!!)
blasfeo_dtrmm_rlnn(int m,int n,double alpha,struct blasfeo_dmat * sB,int bi,int bj,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dmat * sD,int di,int dj)4740 void blasfeo_dtrmm_rlnn(int m, int n, double alpha, struct blasfeo_dmat *sB, int bi, int bj, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sD, int di, int dj)
4741 	{
4742 
4743 	const int ps = 4;
4744 
4745 	int sda = sA->cn;
4746 	int sdb = sB->cn;
4747 	int sdd = sD->cn;
4748 	int air = ai & (ps-1);
4749 	int bir = bi & (ps-1);
4750 	double *pA = sA->pA + aj*ps + (ai-air)*sda;
4751 	double *pB = sB->pA + bj*ps + (bi-bir)*sdb;
4752 	double *pD = sD->pA + dj*ps;
4753 
4754 	int offsetB = bir;
4755 
4756 	int di0 = di-air;
4757 	int offsetD;
4758 
4759 	// invalidate stored inverse diagonal of result matrix
4760 	sD->use_dA = 0;
4761 
4762 	if(di0>=0)
4763 		{
4764 		pD += di0/ps*ps*sdd;
4765 		offsetD = di0%ps;
4766 		}
4767 	else
4768 		{
4769 		pD += -4*sdd;
4770 		offsetD = ps+di0;
4771 		}
4772 
4773 	int ii, jj;
4774 
4775 	if(air!=0)
4776 		{
4777 		jj = 0;
4778 		for(; jj<n; jj+=4)
4779 			{
4780 			kernel_dtrmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, offsetD, &pD[jj*ps], sdd, air, air+m, 0, n-jj);
4781 			}
4782 		m -= ps-air;
4783 		pA += ps*sda;
4784 		pD += ps*sdd;
4785 		}
4786 	ii = 0;
4787 	if(offsetD==0)
4788 		{
4789 #if defined(TARGET_X64_INTEL_HASWELL)
4790 		for(; ii<m-11; ii+=12)
4791 			{
4792 			jj = 0;
4793 			for(; jj<n-5; jj+=4)
4794 				{
4795 				kernel_dtrmm_nn_rl_12x4_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], sdd); // n-j>=6 !!!!!
4796 				}
4797 			for(; jj<n; jj+=4)
4798 				{
4799 				kernel_dtrmm_nn_rl_12x4_vs_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], sdd, 12, n-jj);
4800 				}
4801 			}
4802 		if(ii<m)
4803 			{
4804 			if(ii<m-8)
4805 				goto left_12;
4806 			else if(ii<m-4)
4807 				goto left_8;
4808 			else
4809 				goto left_4;
4810 			}
4811 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
4812 		for(; ii<m-7; ii+=8)
4813 			{
4814 			jj = 0;
4815 			for(; jj<n-5; jj+=4)
4816 				{
4817 				kernel_dtrmm_nn_rl_8x4_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], sdd);
4818 				}
4819 			for(; jj<n; jj+=4)
4820 				{
4821 				kernel_dtrmm_nn_rl_8x4_vs_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], sdd, 8, n-jj);
4822 				}
4823 			}
4824 		if(ii<m)
4825 			{
4826 			if(ii<m-4)
4827 				goto left_8;
4828 			else
4829 				goto left_4;
4830 			}
4831 #elif defined(TARGET_X86_AMD_BARCELONA)
4832 		for(; ii<m-3; ii+=4)
4833 			{
4834 			jj = 0;
4835 			for(; jj<n-3; jj+=4)
4836 				{
4837 				kernel_dtrmm_nn_rl_4x2_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps]);
4838 				if(offsetB+2<4)
4839 					kernel_dtrmm_nn_rl_4x2_lib4(n-(jj+2), &alpha, &pA[ii*sda+(jj+2)*ps], offsetB+2, &pB[jj*sdb+(jj+2)*ps], sdb, &pD[ii*sdd+(jj+2)*ps]);
4840 				else
4841 					kernel_dtrmm_nn_rl_4x2_lib4(n-(jj+2), &alpha, &pA[ii*sda+(jj+2)*ps], offsetB+2-ps, &pB[(jj+ps)*sdb+(jj+2)*ps], sdb, &pD[ii*sdd+(jj+2)*ps]);
4842 				}
4843 			for(; jj<n; jj+=4)
4844 				{
4845 				kernel_dtrmm_nn_rl_4x2_vs_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], 4, n-jj);
4846 				if(jj<n-2)
4847 					{
4848 					if(offsetB+2<4)
4849 						kernel_dtrmm_nn_rl_4x2_vs_lib4(n-(jj+2), &alpha, &pA[ii*sda+(jj+2)*ps], offsetB+2, &pB[jj*sdb+(jj+2)*ps], sdb, &pD[ii*sdd+(jj+2)*ps], 4, n-(jj+2));
4850 					else
4851 						kernel_dtrmm_nn_rl_4x2_vs_lib4(n-(jj+2), &alpha, &pA[ii*sda+(jj+2)*ps], offsetB+2-ps, &pB[(jj+ps)*sdb+(jj+2)*ps], sdb, &pD[ii*sdd+(jj+2)*ps], 4, n-(jj+2));
4852 					}
4853 				}
4854 			}
4855 		if(ii<m)
4856 			{
4857 			goto left_4;
4858 			}
4859 #else
4860 		for(; ii<m-3; ii+=4)
4861 			{
4862 			jj = 0;
4863 			for(; jj<n-5; jj+=4)
4864 				{
4865 				kernel_dtrmm_nn_rl_4x4_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps]);
4866 				}
4867 			for(; jj<n; jj+=4)
4868 				{
4869 				kernel_dtrmm_nn_rl_4x4_vs_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], 4, n-jj);
4870 				}
4871 			}
4872 		if(ii<m)
4873 			{
4874 			goto left_4;
4875 			}
4876 #endif
4877 		}
4878 	else
4879 		{
4880 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
4881 		for(; ii<m-4; ii+=8)
4882 			{
4883 			jj = 0;
4884 			for(; jj<n; jj+=4)
4885 				{
4886 				kernel_dtrmm_nn_rl_8x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, offsetD, &pD[ii*sdd+jj*ps], sdd, 0, m-ii, 0, n-jj);
4887 				}
4888 			}
4889 		if(ii<m)
4890 			{
4891 			goto left_4_gen;
4892 			}
4893 #else
4894 		for(; ii<m; ii+=4)
4895 			{
4896 			jj = 0;
4897 			for(; jj<n; jj+=4)
4898 				{
4899 				kernel_dtrmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, offsetD, &pD[ii*sdd+jj*ps], sdd, 0, m-ii, 0, n-jj);
4900 				}
4901 			}
4902 #endif
4903 		}
4904 
4905 	// common return if i==m
4906 	return;
4907 
4908 	// clean up loops definitions
4909 
4910 #if defined(TARGET_X64_INTEL_HASWELL)
4911 	left_12:
4912 	jj = 0;
4913 	for(; jj<n; jj+=4)
4914 		{
4915 		kernel_dtrmm_nn_rl_12x4_vs_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], sdd, m-ii, n-jj);
4916 		}
4917 	return;
4918 #endif
4919 
4920 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
4921 	left_8:
4922 	jj = 0;
4923 	for(; jj<n; jj+=4)
4924 		{
4925 		kernel_dtrmm_nn_rl_8x4_vs_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], sdd, m-ii, n-jj);
4926 		}
4927 	return;
4928 #endif
4929 
4930 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
4931 	left_8_gen:
4932 	jj = 0;
4933 	for(; jj<n; jj+=4)
4934 		{
4935 		kernel_dtrmm_nn_rl_8x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], sda, offsetB, &pB[jj*sdb+jj*ps], sdb, offsetD, &pD[ii*sdd+jj*ps], sdd, 0, m-ii, 0, n-jj);
4936 		}
4937 	return;
4938 #endif
4939 
4940 #if defined(TARGET_X86_AMD_BARCELONA)
4941 	left_4:
4942 	jj = 0;
4943 	for(; jj<n; jj+=4)
4944 		{
4945 		kernel_dtrmm_nn_rl_4x2_vs_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], m-ii, n-jj);
4946 		if(jj<n-2)
4947 			{
4948 			if(offsetB+2<4)
4949 				kernel_dtrmm_nn_rl_4x2_vs_lib4(n-(jj+2), &alpha, &pA[ii*sda+(jj+2)*ps], offsetB+2, &pB[jj*sdb+(jj+2)*ps], sdb, &pD[ii*sdd+(jj+2)*ps], m-ii, n-(jj+2));
4950 			else
4951 				kernel_dtrmm_nn_rl_4x2_vs_lib4(n-(jj+2), &alpha, &pA[ii*sda+(jj+2)*ps], offsetB+2-ps, &pB[(jj+ps)*sdb+(jj+2)*ps], sdb, &pD[ii*sdd+(jj+2)*ps], m-ii, n-(jj+2));
4952 			}
4953 		}
4954 	return;
4955 #else
4956 	left_4:
4957 	jj = 0;
4958 	for(; jj<n; jj+=4)
4959 		{
4960 		kernel_dtrmm_nn_rl_4x4_vs_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, &pD[ii*sdd+jj*ps], m-ii, n-jj);
4961 		}
4962 	return;
4963 #endif
4964 
4965 	left_4_gen:
4966 	jj = 0;
4967 	for(; jj<n; jj+=4)
4968 		{
4969 		kernel_dtrmm_nn_rl_4x4_gen_lib4(n-jj, &alpha, &pA[ii*sda+jj*ps], offsetB, &pB[jj*sdb+jj*ps], sdb, offsetD, &pD[ii*sdd+jj*ps], sdd, 0, m-ii, 0, n-jj);
4970 		}
4971 	return;
4972 	}
4973 
4974 
4975 
blasfeo_dsyrk_ln(int m,int k,double alpha,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dmat * sB,int bi,int bj,double beta,struct blasfeo_dmat * sC,int ci,int cj,struct blasfeo_dmat * sD,int di,int dj)4976 void blasfeo_dsyrk_ln(int m, int k, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, double beta, struct blasfeo_dmat *sC, int ci, int cj, struct blasfeo_dmat *sD, int di, int dj)
4977 	{
4978 
4979 	// fast return
4980 	if(m<=0)
4981 		return;
4982 
4983 	// invalidate stored inverse diagonal of result matrix
4984 	sD->use_dA = 0;
4985 
4986 	const int ps = 4;
4987 
4988 	int sda = sA->cn;
4989 	int sdb = sB->cn;
4990 	int sdc = sC->cn;
4991 	int sdd = sD->cn;
4992 	int air = ai & (ps-1);
4993 	int bir = bi & (ps-1);
4994 	double *pA = sA->pA + aj*ps + (ai-air)*sda;
4995 	double *pB = sB->pA + bj*ps + (bi-bir)*sdb;
4996 	double *pC = sC->pA + cj*ps;
4997 	double *pD = sD->pA + dj*ps;
4998 
4999 	int ci0 = ci;//-air;
5000 	int di0 = di;//-air;
5001 	int offsetC;
5002 	int offsetD;
5003 	if(ci0>=0)
5004 		{
5005 		pC += ci0/ps*ps*sdd;
5006 		offsetC = ci0%ps;
5007 		}
5008 	else
5009 		{
5010 		pC += -4*sdc;
5011 		offsetC = ps+ci0;
5012 		}
5013 	if(di0>=0)
5014 		{
5015 		pD += di0/ps*ps*sdd;
5016 		offsetD = di0%ps;
5017 		}
5018 	else
5019 		{
5020 		pD += -4*sdd;
5021 		offsetD = ps+di0;
5022 		}
5023 
5024 	void *mem;
5025 	double *pU, *pA2;
5026 	int sdu, sda2;
5027 
5028 // TODO visual studio alignment
5029 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
5030 	ALIGNED( double pU0[3*4*K_MAX_STACK], 64 );
5031 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57)
5032 	ALIGNED( double pU0[2*4*K_MAX_STACK], 64 );
5033 #elif defined(TARGET_GENERIC)
5034 	double pU0[1*4*K_MAX_STACK];
5035 #else
5036 	ALIGNED( double pU0[1*4*K_MAX_STACK], 64 );
5037 #endif
5038 	int sdu0 = (k+3)/4*4;
5039 	sdu0 = sdu0<K_MAX_STACK ? sdu0 : K_MAX_STACK;
5040 
5041 	// allocate memory
5042 	if(k>K_MAX_STACK)
5043 		{
5044 		sdu = (k+ps-1)/ps*ps;
5045 		mem = malloc(12*sdu*sizeof(double)+63);
5046 		blasfeo_align_64_byte(mem, (void **) &pU);
5047 		}
5048 	else
5049 		{
5050 		pU = pU0;
5051 		sdu = sdu0;
5052 		}
5053 
5054 
5055 	int i, j, n1;
5056 
5057 	int idxB;
5058 
5059 
5060 
5061 	// algorithm scheme
5062 	if(offsetC==0 & offsetD==0)
5063 		{
5064 		if(bir==0)
5065 			{
5066 //	printf("\n000\n");
5067 			goto loop_000;
5068 			}
5069 		else
5070 			{
5071 //	printf("\nB00\n");
5072 			goto loop_B00;
5073 			}
5074 		}
5075 	else
5076 		{
5077 		if(bir==0)
5078 			{
5079 //	printf("\n0CD\n");
5080 			goto loop_0CD;
5081 			}
5082 		else
5083 			{
5084 //	printf("\nBCD\n");
5085 			goto loop_BCD;
5086 			}
5087 		}
5088 	// should never get here
5089 	goto end;
5090 
5091 
5092 
5093 	// main loop aligned
5094 loop_000:
5095 	i = 0;
5096 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
5097 	for(; i<m-11; i+=12)
5098 		{
5099 		if(air==0)
5100 			{
5101 			pA2 = pA+i*sda;
5102 			sda2 = sda;
5103 			}
5104 		else
5105 			{
5106 #if defined(TARGET_X64_INTEL_HASWELL)
5107 			kernel_dpacp_nn_12_lib4(k, air, pA+i*sda, sda, pU, sdu);
5108 #else
5109 			kernel_dpacp_nn_4_lib4(k, air, pA+(i+0)*sda, sda, pU+0*sdu);
5110 			kernel_dpacp_nn_4_lib4(k, air, pA+(i+4)*sda, sda, pU+4*sdu);
5111 			kernel_dpacp_nn_4_lib4(k, air, pA+(i+8)*sda, sda, pU+8*sdu);
5112 #endif
5113 			pA2 = pU;
5114 			sda2 = sdu;
5115 			}
5116 		j = 0;
5117 		// main loop
5118 		for(; j<i; j+=4)
5119 			{
5120 			kernel_dgemm_nt_12x4_lib4(k, &alpha, pA2, sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
5121 			}
5122 		kernel_dsyrk_nt_l_12x4_lib4(k, &alpha, pA2, sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
5123 #if defined(TARGET_X64_INTEL_HASWELL)
5124 		kernel_dsyrk_nt_l_8x8_lib4(k, &alpha, pA2+4*sda2, sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd);
5125 #else
5126 		kernel_dsyrk_nt_l_8x4_lib4(k, &alpha, pA2+4*sda2, sda, &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd);
5127 		kernel_dsyrk_nt_l_4x4_lib4(k, &alpha, pA2+8*sda2, &pB[(j+8)*sdb], &beta, &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd]);
5128 #endif
5129 		}
5130 	if(m>i)
5131 		{
5132 		if(m-i<=4)
5133 			{
5134 			goto left_4;
5135 			}
5136 		else if(m-i<=8)
5137 			{
5138 			goto left_8;
5139 			}
5140 		else
5141 			{
5142 			goto left_12;
5143 			}
5144 		}
5145 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57)
5146 	for(; i<m-7; i+=8)
5147 		{
5148 		if(air==0)
5149 			{
5150 			pA2 = pA+i*sda;
5151 			sda2 = sda;
5152 			}
5153 		else
5154 			{
5155 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE)
5156 			kernel_dpacp_nn_8_lib4(k, air, pA+i*sda, sda, pU, sdu);
5157 #else
5158 			kernel_dpacp_nn_4_lib4(k, air, pA+(i+0)*sda, sda, pU+0*sdu);
5159 			kernel_dpacp_nn_4_lib4(k, air, pA+(i+4)*sda, sda, pU+4*sdu);
5160 #endif
5161 			pA2 = pU;
5162 			sda2 = sdu;
5163 			}
5164 		j = 0;
5165 		// main loop
5166 		for(; j<i; j+=4)
5167 			{
5168 			kernel_dgemm_nt_8x4_lib4(k, &alpha, pA2, sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
5169 			}
5170 		kernel_dsyrk_nt_l_8x4_lib4(k, &alpha, pA2, sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
5171 		kernel_dsyrk_nt_l_4x4_lib4(k, &alpha, pA2+4*sda2, &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd]);
5172 		}
5173 	if(m>i)
5174 		{
5175 		if(m-i<=4)
5176 			{
5177 			goto left_4;
5178 			}
5179 		else
5180 			{
5181 			goto left_8;
5182 			}
5183 		}
5184 #else
5185 	for(; i<m-3; i+=4)
5186 		{
5187 		if(air==0)
5188 			{
5189 			pA2 = pA+i*sda;
5190 			sda2 = sda;
5191 			}
5192 		else
5193 			{
5194 			kernel_dpacp_nn_4_lib4(k, air, pA+i*sda, sda, pU);
5195 			pA2 = pU;
5196 			sda2 = sdu;
5197 			}
5198 		j = 0;
5199 		// main loop
5200 		for(; j<i; j+=4)
5201 			{
5202 			kernel_dgemm_nt_4x4_lib4(k, &alpha, pA2, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
5203 			}
5204 		kernel_dsyrk_nt_l_4x4_lib4(k, &alpha, pA2, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
5205 		}
5206 	if(m>i)
5207 		{
5208 		goto left_4;
5209 		}
5210 #endif
5211 	// common return if i==m
5212 	goto end;
5213 
5214 
5215 
5216 	// main loop aligned
5217 loop_B00:
5218 	i = 0;
5219 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_X64_INTEL_HASWELL)
5220 	for(; i<m-7; i+=8)
5221 		{
5222 		if(air==0)
5223 			{
5224 			pA2 = pA+i*sda;
5225 			sda2 = sda;
5226 			}
5227 		else
5228 			{
5229 			kernel_dpacp_nn_8_vs_lib4(k, air, pA+i*sda, sda, pU, sdu, m-i);
5230 			pA2 = pU;
5231 			sda2 = sdu;
5232 			}
5233 		j = 0;
5234 		idxB = 0;
5235 		if(j<i)
5236 			{
5237 			kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pA2, sda2, &pB[idxB*sdb], &beta, 0, &pC[j*ps+i*sdc]-bir*ps, sdc, 0, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, m-j);
5238 			j += ps-bir;
5239 			idxB += 4;
5240 			// main loop
5241 			for(; j<i+(ps-bir)-ps; j+=4, idxB+=4)
5242 				{
5243 				kernel_dgemm_nt_8x4_lib4(k, &alpha, pA2, sda2, &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
5244 				}
5245 			kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, pA2, sda2, &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, bir);
5246 			j += bir;
5247 			}
5248 		kernel_dsyrk_nt_l_8x4_gen_lib4(k, &alpha, pA2, sda2, &pB[j*sdb], &beta, 0, &pC[j*ps+i*sdc]-bir*ps, sdc, 0, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+m-j);
5249 		kernel_dsyrk_nt_l_8x4_gen_lib4(k, &alpha, pA2, sda2, &pB[(j+4)*sdb], &beta, 0, &pC[j*ps+i*sdc]+(ps-bir)*ps, sdc, 0, &pD[j*ps+i*sdd]+(ps-bir)*ps, sdd, ps-bir, m-i, 0, m-j);
5250 		kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, pA2+4*sda2, &pB[(j+8)*sdb], &beta, 0, &pC[(j+4)*ps+(i+4)*sdc]+(ps-bir)*ps, sdc, 0, &pD[(j+4)*ps+(i+4)*sdd]+(ps-bir)*ps, sdd, ps-bir, m-(i+4), 0, m-(j+4));
5251 		}
5252 	if(m>i)
5253 		{
5254 		if(m-i<=4)
5255 			{
5256 			goto left_4_g;
5257 			}
5258 		else
5259 			{
5260 			goto left_8_g;
5261 			}
5262 		}
5263 #else
5264 	for(; i<m-3; i+=4)
5265 		{
5266 		if(air==0)
5267 			{
5268 			pA2 = pA+i*sda;
5269 			sda2 = sda;
5270 			}
5271 		else
5272 			{
5273 			kernel_dpacp_nn_4_lib4(k, air, pA+i*sda, sda, pU);
5274 			pA2 = pU;
5275 			sda2 = sdu;
5276 			}
5277 		j = 0;
5278 		idxB = 0;
5279 		if(j<i)
5280 			{
5281 			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pA2, &pB[idxB*sdb], &beta, 0, &pC[j*ps+i*sdc]-bir*ps, sdc, 0, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, 4);
5282 			j += ps-bir;
5283 			idxB += 4;
5284 			// main loop
5285 			for(; j<i+(ps-bir)-ps; j+=4, idxB+=4)
5286 				{
5287 				kernel_dgemm_nt_4x4_lib4(k, &alpha, pA2, &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
5288 				}
5289 			kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, pA2, &pB[idxB*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, bir);
5290 			j += bir;
5291 			}
5292 		kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, pA2, &pB[j*sdb], &beta, 0, &pC[j*ps+i*sdc]-bir*ps, sdc, 0, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, 4);
5293 		kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, pA2, &pB[(j+4)*sdb], &beta, 0, &pC[j*ps+i*sdc]+(ps-bir)*ps, sdc, 0, &pD[j*ps+i*sdd]+(ps-bir)*ps, sdd, ps-bir, m-i, 0, 4);
5294 		}
5295 	if(m>i)
5296 		{
5297 		goto left_4_g;
5298 		}
5299 #endif
5300 	// common return if i==m
5301 	goto end;
5302 
5303 
5304 
5305 	// main loop C, D not aligned
5306 loop_0CD:
5307 	i = 0;
5308 #if 0//defined(TARGET_X64_INTEL_HASWELL)
5309 	for(; i<m-8; i+=12)
5310 		{
5311 		if(air==0)
5312 			{
5313 			pA2 = pA+i*sda;
5314 			sda2 = sda;
5315 			}
5316 		else
5317 			{
5318 			kernel_dpacp_nn_12_lib4(k, air, pA+i*sda, sda, pU, sdu);
5319 			pA2 = pU;
5320 			sda2 = sdu;
5321 			}
5322 		j = 0;
5323 		// main loop
5324 		for(; j<i; j+=4)
5325 			{
5326 			kernel_dgemm_nt_12x4_gen_lib4(k, &alpha, pA2, sda, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
5327 			}
5328 		kernel_dsyrk_nt_l_12x4_gen_lib4(k, &alpha, pA2, sda, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
5329 		kernel_dsyrk_nt_l_8x8_gen_lib4(k, &alpha, pA2+4*sda2, sda, &pB[(j+4)*sdb], sdb, &beta, offsetC, &pC[(j+4)*ps+(i+4)*sdc], sdc, offsetD, &pD[(j+4)*ps+(i+4)*sdd], sdd, 0, m-i-4, 0, m-j-4);
5330 		}
5331 	if(m>i)
5332 		{
5333 		if(m-i<=4)
5334 			{
5335 			goto left_4_g;
5336 			}
5337 		else
5338 			{
5339 			goto left_8_g;
5340 			}
5341 		}
5342 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_X64_INTEL_HASWELL)
5343 	for(; i<m-4; i+=8)
5344 		{
5345 		if(air==0)
5346 			{
5347 			pA2 = pA+i*sda;
5348 			sda2 = sda;
5349 			}
5350 		else
5351 			{
5352 			kernel_dpacp_nn_8_lib4(k, air, pA+i*sda, sda, pU, sdu);
5353 			pA2 = pU;
5354 			sda2 = sdu;
5355 			}
5356 		j = 0;
5357 		// main loop
5358 		for(; j<i; j+=4)
5359 			{
5360 			kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pA2, sda2, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
5361 			}
5362 		kernel_dsyrk_nt_l_8x4_gen_lib4(k, &alpha, pA2, sda2, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
5363 		kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, pA2+4*sda2, &pB[(j+4)*sdb], &beta, offsetC, &pC[(j+4)*ps+(i+4)*sdc], sdc, offsetD, &pD[(j+4)*ps+(i+4)*sdd], sdd, 0, m-i-4, 0, m-j-4);
5364 		}
5365 	if(m>i)
5366 		{
5367 		goto left_4_g;
5368 		}
5369 #else
5370 	for(; i<m; i+=4)
5371 		{
5372 		if(air==0)
5373 			{
5374 			pA2 = pA+i*sda;
5375 			sda2 = sda;
5376 			}
5377 		else
5378 			{
5379 			kernel_dpacp_nn_4_vs_lib4(k, air, pA+i*sda, sda, pU, m-i);
5380 			pA2 = pU;
5381 			sda2 = sdu;
5382 			}
5383 		j = 0;
5384 		// main loop
5385 		for(; j<i; j+=4)
5386 			{
5387 			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pA2, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
5388 			}
5389 		kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, pA2, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
5390 		}
5391 #endif
5392 	// common return if i==m
5393 	goto end;
5394 
5395 
5396 
5397 	// main loop aligned
5398 loop_BCD:
5399 	i = 0;
5400 #if defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_X64_INTEL_HASWELL)
5401 	for(; i<m-4; i+=8)
5402 		{
5403 		if(air==0)
5404 			{
5405 			pA2 = pA+i*sda;
5406 			sda2 = sda;
5407 			}
5408 		else
5409 			{
5410 			kernel_dpacp_nn_8_vs_lib4(k, air, pA+i*sda, sda, pU, sdu, m-i);
5411 			pA2 = pU;
5412 			sda2 = sdu;
5413 			}
5414 		j = 0;
5415 		idxB = 0;
5416 		if(j<i)
5417 			{
5418 			kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pA2, sda2, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, m-j);
5419 			j += ps-bir;
5420 			idxB += 4;
5421 			// main loop
5422 			for(; j<i+(ps-bir)-ps; j+=4, idxB+=4)
5423 				{
5424 				kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pA2, sda2, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
5425 				}
5426 			kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pA2, sda2, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, bir); // XXX n1
5427 			j += bir;
5428 			}
5429 		kernel_dsyrk_nt_l_8x4_gen_lib4(k, &alpha, pA2, sda2, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+m-j);
5430 		kernel_dsyrk_nt_l_8x4_gen_lib4(k, &alpha, pA2, sda2, &pB[(j+4)*sdb], &beta, offsetC, &pC[j*ps+i*sdc]+(ps-bir)*ps, sdc, offsetD, &pD[j*ps+i*sdd]+(ps-bir)*ps, sdd, ps-bir, m-i, 0, m-j);
5431 		kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, pA2+4*sda2, &pB[(j+8)*sdb], &beta, offsetC, &pC[(j+4)*ps+(i+4)*sdc]+(ps-bir)*ps, sdc, offsetD, &pD[(j+4)*ps+(i+4)*sdd]+(ps-bir)*ps, sdd, ps-bir, m-(i+4), 0, m-(j+4));
5432 		}
5433 	if(m>i)
5434 		{
5435 		goto left_4_g;
5436 		}
5437 #else
5438 	for(; i<m; i+=4)
5439 		{
5440 		if(air==0)
5441 			{
5442 			pA2 = pA+i*sda;
5443 			sda2 = sda;
5444 			}
5445 		else
5446 			{
5447 			kernel_dpacp_nn_4_vs_lib4(k, air, pA+i*sda, sda, pU, m-i);
5448 			pA2 = pU;
5449 			sda2 = sdu;
5450 			}
5451 		j = 0;
5452 		idxB = 0;
5453 		if(j<i)
5454 			{
5455 			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pA2, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, m-j);
5456 			j += ps-bir;
5457 			idxB += 4;
5458 			// main loop
5459 			for(; j<i+(ps-bir)-ps; j+=4, idxB+=4)
5460 				{
5461 				kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pA2, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
5462 				}
5463 			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pA2, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, bir); // XXX n1
5464 			j += bir;
5465 			}
5466 		kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, pA2, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+m-j);
5467 		kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, pA2, &pB[(j+4)*sdb], &beta, offsetC, &pC[j*ps+i*sdc]+(ps-bir)*ps, sdc, offsetD, &pD[j*ps+i*sdd]+(ps-bir)*ps, sdd, ps-bir, m-i, 0, m-j);
5468 		}
5469 #endif
5470 	// common return if i==m
5471 	goto end;
5472 
5473 
5474 
5475 	// clean up loops definitions
5476 
5477 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
5478 	left_12:
5479 	if(air==0)
5480 		{
5481 		pA2 = pA+i*sda;
5482 		sda2 = sda;
5483 		}
5484 	else
5485 		{
5486 #if defined(TARGET_X64_INTEL_HASWELL)
5487 		kernel_dpacp_nn_12_lib4(k, air, pA+i*sda, sda, pU, sdu);
5488 #else
5489 		kernel_dpacp_nn_4_lib4(k, air, pA+(i+0)*sda, sda, pU+0*sdu);
5490 		kernel_dpacp_nn_4_lib4(k, air, pA+(i+4)*sda, sda, pU+4*sdu);
5491 		kernel_dpacp_nn_4_lib4(k, air, pA+(i+8)*sda, sda, pU+8*sdu);
5492 #endif
5493 		pA2 = pU;
5494 		sda2 = sdu;
5495 		}
5496 	j = 0;
5497 	// main loop
5498 	for(; j<i; j+=4)
5499 		{
5500 		kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, pA2, sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
5501 		}
5502 	kernel_dsyrk_nt_l_12x4_vs_lib4(k, &alpha, pA2, sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
5503 #if defined(TARGET_X64_INTEL_HASWELL)
5504 	kernel_dsyrk_nt_l_8x8_vs_lib4(k, &alpha, pA2+4*sda2, sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, m-i-4, m-j-4);
5505 #else
5506 	kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, pA2+4*sda2, sda, &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, m-i-4, m-j-4);
5507 	kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, pA2+8*sda2, &pB[(j+8)*sdb], &beta, &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd], m-i-8, m-j-8);
5508 #endif
5509 	goto end;
5510 #endif
5511 
5512 
5513 
5514 #if defined(TARGET_X64_INTEL_HASWELL)
5515 	left_8:
5516 	if(air==0)
5517 		{
5518 		pA2 = pA+i*sda;
5519 		sda2 = sda;
5520 		}
5521 	else
5522 		{
5523 		kernel_dpacp_nn_8_lib4(k, air, pA+i*sda, sda, pU, sdu);
5524 		pA2 = pU;
5525 		sda2 = sdu;
5526 		}
5527 	j = 0;
5528 	// main loop
5529 	for(; j<i-8; j+=12)
5530 		{
5531 		kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, pA2, sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
5532 		kernel_dgemm_nt_8x8u_vs_lib4(k, &alpha, pA2, sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+i*sdc], sdc, &pD[(j+4)*ps+i*sdd], sdd, m-i, m-(j+4));
5533 		}
5534 	if(j<i-4)
5535 		{
5536 		kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, pA2, sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
5537 		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, pA2, &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+i*sdc], &pD[(j+4)*ps+i*sdd], m-i, m-(j+4));
5538 		j += 8;
5539 		}
5540 	else if(j<i)
5541 		{
5542 		kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, pA2, sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
5543 		j += 4;
5544 		}
5545 	kernel_dsyrk_nt_l_8x8_vs_lib4(k, &alpha, pA2, sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
5546 	goto end;
5547 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE)
5548 	left_8:
5549 	if(air==0)
5550 		{
5551 		pA2 = pA+i*sda;
5552 		sda2 = sda;
5553 		}
5554 	else
5555 		{
5556 		kernel_dpacp_nn_8_lib4(k, air, pA+i*sda, sda, pU, sdu);
5557 		pA2 = pU;
5558 		sda2 = sdu;
5559 		}
5560 	j = 0;
5561 	// main loop
5562 	for(; j<i; j+=4)
5563 		{
5564 		kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, pA2, sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
5565 		}
5566 	kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, pA2, sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
5567 	kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, pA2+4*sda2, &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, m-j-4);
5568 	goto end;
5569 #elif defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
5570 	left_8:
5571 	if(air==0)
5572 		{
5573 		pA2 = pA+i*sda;
5574 		sda2 = sda;
5575 		}
5576 	else
5577 		{
5578 		kernel_dpacp_nn_4_lib4(k, air, pA+(i+0)*sda, sda, pU+0*sdu);
5579 		kernel_dpacp_nn_4_lib4(k, air, pA+(i+4)*sda, sda, pU+4*sdu);
5580 		pA2 = pU;
5581 		sda2 = sdu;
5582 		}
5583 	j = 0;
5584 	// main loop
5585 	for(; j<i; j+=4)
5586 		{
5587 		kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, pA2, sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
5588 		}
5589 	kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, pA2, sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
5590 	kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, pA2+4*sda2, &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, m-j-4);
5591 	goto end;
5592 #endif
5593 
5594 
5595 
5596 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE)
5597 	left_8_g:
5598 	if(air==0)
5599 		{
5600 		pA2 = pA+i*sda;
5601 		sda2 = sda;
5602 		}
5603 	else
5604 		{
5605 		kernel_dpacp_nn_8_vs_lib4(k, air, pA+i*sda, sda, pU, sdu, m-i);
5606 		pA2 = pU;
5607 		sda2 = sdu;
5608 		}
5609 	j = 0;
5610 	idxB = 0;
5611 	if(j<i)
5612 		{
5613 		kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pA2, sda2, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, m-j);
5614 		j += ps-bir;
5615 		idxB += 4;
5616 		// main loop
5617 		for(; j<i+(ps-bir)-ps; j+=4, idxB+=4)
5618 			{
5619 			kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pA2, sda2, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
5620 			}
5621 		kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, pA2, sda2, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, bir); // XXX n1
5622 		j += bir;
5623 		}
5624 	kernel_dsyrk_nt_l_8x4_gen_lib4(k, &alpha, pA2, sda2, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+m-j);
5625 	kernel_dsyrk_nt_l_8x4_gen_lib4(k, &alpha, pA2, sda2, &pB[(j+4)*sdb], &beta, offsetC, &pC[j*ps+i*sdc]+(ps-bir)*ps, sdc, offsetD, &pD[j*ps+i*sdd]+(ps-bir)*ps, sdd, ps-bir, m-i, 0, m-j);
5626 	kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, pA2+4*sda2, &pB[(j+8)*sdb], &beta, offsetC, &pC[(j+4)*ps+(i+4)*sdc]+(ps-bir)*ps, sdc, offsetD, &pD[(j+4)*ps+(i+4)*sdd]+(ps-bir)*ps, sdd, ps-bir, m-(i+4), 0, m-(j+4));
5627 	goto end;
5628 #endif
5629 
5630 
5631 
5632 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
5633 	left_4:
5634 	if(air==0)
5635 		{
5636 		pA2 = pA+i*sda;
5637 		sda2 = sda;
5638 		}
5639 	else
5640 		{
5641 		kernel_dpacp_nn_4_vs_lib4(k, air, pA+i*sda, sda, pU, m-i);
5642 		pA2 = pU;
5643 		sda2 = sdu;
5644 		}
5645 	j = 0;
5646 	// main loop
5647 	for(; j<i-8; j+=12)
5648 		{
5649 		kernel_dgemm_nt_4x12_vs_lib4(k, &alpha, pA2, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
5650 		}
5651 	if(j<i-4)
5652 		{
5653 		kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, pA2, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
5654 		j += 8;
5655 		}
5656 	else if(j<i)
5657 		{
5658 		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, pA2, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
5659 		j += 4;
5660 		}
5661 	kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, pA2, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
5662 	goto end;
5663 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
5664 	left_4:
5665 	if(air==0)
5666 		{
5667 		pA2 = pA+i*sda;
5668 		sda2 = sda;
5669 		}
5670 	else
5671 		{
5672 		kernel_dpacp_nn_4_vs_lib4(k, air, pA+i*sda, sda, pU, m-i);
5673 		pA2 = pU;
5674 		sda2 = sdu;
5675 		}
5676 	j = 0;
5677 	// main loop
5678 	for(; j<i-4; j+=8)
5679 		{
5680 		kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, pA2, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
5681 		}
5682 	if(j<i)
5683 		{
5684 		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, pA2, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
5685 		j+=4;
5686 		}
5687 	kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, pA2, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
5688 	goto end;
5689 #else
5690 	left_4:
5691 	if(air==0)
5692 		{
5693 		pA2 = pA+i*sda;
5694 		sda2 = sda;
5695 		}
5696 	else
5697 		{
5698 		kernel_dpacp_nn_4_vs_lib4(k, air, pA+i*sda, sda, pU, m-i);
5699 		pA2 = pU;
5700 		sda2 = sdu;
5701 		}
5702 	j = 0;
5703 	// main loop
5704 	for(; j<i; j+=4)
5705 		{
5706 		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, pA2, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
5707 		}
5708 	kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, pA2, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
5709 	goto end;
5710 #endif
5711 
5712 
5713 
5714 	left_4_g:
5715 	j = 0;
5716 	if(air==0)
5717 		{
5718 		pA2 = pA+i*sda;
5719 		sda2 = sda;
5720 		}
5721 	else
5722 		{
5723 		kernel_dpacp_nn_4_vs_lib4(k, air, pA+i*sda, sda, pU, m-i);
5724 		pA2 = pU;
5725 		sda2 = sdu;
5726 		}
5727 	if(bir!=0)
5728 		{
5729 		idxB = 0;
5730 		if(j<i)
5731 			{
5732 			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pA2, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, m-j);
5733 			j += ps-bir;
5734 			idxB += 4;
5735 			// main loop
5736 			for(; j<i+(ps-bir)-ps; j+=4, idxB+=4)
5737 				{
5738 				kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pA2, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
5739 				}
5740 			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pA2, &pB[idxB*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, bir); // XXX n1
5741 			j += bir;
5742 			}
5743 		kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, pA2, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc]-bir*ps, sdc, offsetD, &pD[j*ps+i*sdd]-bir*ps, sdd, 0, m-i, bir, bir+m-j);
5744 		kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, pA2, &pB[(j+4)*sdb], &beta, offsetC, &pC[j*ps+i*sdc]+(ps-bir)*ps, sdc, offsetD, &pD[j*ps+i*sdd]+(ps-bir)*ps, sdd, ps-bir, m-i, 0, m-j);
5745 		}
5746 	else
5747 		{
5748 		// main loop
5749 		for(; j<i; j+=4)
5750 			{
5751 			kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, pA2, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
5752 			}
5753 		kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, pA2, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
5754 		}
5755 	goto end;
5756 
5757 
5758 
5759 end:
5760 	if(k>K_MAX_STACK)
5761 		{
5762 		free(mem);
5763 		}
5764 	return;
5765 
5766 
5767 
5768 #if 0
5769 	// main loop
5770 	i = 0;
5771 	if(offsetC==0 & offsetD==0)
5772 		{
5773 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
5774 		for(; i<m-11; i+=12)
5775 			{
5776 			j = 0;
5777 			for(; j<i; j+=4)
5778 				{
5779 				kernel_dgemm_nt_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
5780 				}
5781 			kernel_dsyrk_nt_l_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
5782 #if defined(TARGET_X64_INTEL_HASWELL)
5783 			kernel_dsyrk_nt_l_8x8_lib4(k, &alpha, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd);
5784 #else
5785 			kernel_dsyrk_nt_l_8x4_lib4(k, &alpha, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd);
5786 			kernel_dsyrk_nt_l_4x4_lib4(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd]);
5787 #endif
5788 			}
5789 		if(m>i)
5790 			{
5791 			if(m-i<=4)
5792 				{
5793 				goto left_4;
5794 				}
5795 			else if(m-i<=8)
5796 				{
5797 				goto left_8;
5798 				}
5799 			else
5800 				{
5801 				goto left_12;
5802 				}
5803 			}
5804 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57)
5805 		for(; i<m-7; i+=8)
5806 			{
5807 			j = 0;
5808 			for(; j<i; j+=4)
5809 				{
5810 				kernel_dgemm_nt_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
5811 				}
5812 			kernel_dsyrk_nt_l_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
5813 			kernel_dsyrk_nt_l_4x4_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd]);
5814 			}
5815 		if(m>i)
5816 			{
5817 			if(m-i<=4)
5818 				{
5819 				goto left_4;
5820 				}
5821 			else
5822 				{
5823 				goto left_8;
5824 				}
5825 			}
5826 #elif defined(TARGET_X86_AMD_BARCELONA)
5827 		for(; i<m-3; i+=4)
5828 			{
5829 			j = 0;
5830 			for(; j<i; j+=4)
5831 				{
5832 				kernel_dgemm_nt_4x2_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
5833 				kernel_dgemm_nt_4x2_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb+2], &beta, &pC[(j+2)*ps+i*sdc], &pD[(j+2)*ps+i*sdd]);
5834 				}
5835 			kernel_dsyrk_nt_l_4x2_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
5836 			kernel_dsyrk_nt_l_2x2_lib4(k, &alpha, &pA[i*sda+2], &pB[j*sdb+2], &beta, &pC[(j+2)*ps+i*sdc+2], &pD[(j+2)*ps+i*sdd+2]);
5837 			}
5838 		if(m>i)
5839 			{
5840 			goto left_4;
5841 			}
5842 #else
5843 		for(; i<m-3; i+=4)
5844 			{
5845 			j = 0;
5846 			for(; j<i; j+=4)
5847 				{
5848 				kernel_dgemm_nt_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
5849 				}
5850 			kernel_dsyrk_nt_l_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
5851 			}
5852 		if(m>i)
5853 			{
5854 			goto left_4;
5855 			}
5856 #endif
5857 		}
5858 	else
5859 		{
5860 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE)
5861 		for(; i<m-4; i+=8)
5862 			{
5863 			j = 0;
5864 			for(; j<i; j+=4)
5865 				{
5866 				kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
5867 				}
5868 			kernel_dsyrk_nt_l_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
5869 			kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, offsetC, &pC[(j+4)*ps+(i+4)*sdc], sdc, offsetD, &pD[(j+4)*ps+(i+4)*sdd], sdd, 0, m-i-4, 0, m-j-4);
5870 			}
5871 		if(m>i)
5872 			{
5873 			goto left_4_gen;
5874 			}
5875 #else
5876 		for(; i<m; i+=4)
5877 			{
5878 			j = 0;
5879 			for(; j<i; j+=4)
5880 				{
5881 				kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
5882 				}
5883 			kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
5884 			}
5885 #endif
5886 		}
5887 
5888 	// common return if i==m
5889 	return;
5890 
5891 	// clean up loops definitions
5892 
5893 #if defined(TARGET_X64_INTEL_HASWELL)
5894 	left_12:
5895 	j = 0;
5896 	for(; j<i; j+=4)
5897 		{
5898 		kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
5899 		}
5900 	kernel_dsyrk_nt_l_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
5901 	kernel_dsyrk_nt_l_8x8_vs_lib4(k, &alpha, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, m-i-4, m-j-4);
5902 //	kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd], m-i-8, n-j-8);
5903 	return;
5904 #elif defined(TARGET_ARMV8A_ARM_CORTEX_A53)
5905 	left_12:
5906 	j = 0;
5907 	for(; j<i; j+=4)
5908 		{
5909 		kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
5910 		}
5911 	kernel_dsyrk_nt_l_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
5912 	kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, m-i-4, m-j-4);
5913 	kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd], m-i-8, m-j-8);
5914 	return;
5915 #endif
5916 
5917 #if defined(TARGET_X64_INTEL_HASWELL)
5918 	left_8:
5919 	j = 0;
5920 	for(; j<i-8; j+=12)
5921 		{
5922 		kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
5923 		kernel_dgemm_nt_8x8u_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+i*sdc], sdc, &pD[(j+4)*ps+i*sdd], sdd, m-i, m-(j+4));
5924 		}
5925 	if(j<i-4)
5926 		{
5927 		kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
5928 		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+i*sdc], &pD[(j+4)*ps+i*sdd], m-i, m-(j+4));
5929 		j += 8;
5930 		}
5931 	else if(j<i)
5932 		{
5933 		kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
5934 		j += 4;
5935 		}
5936 	kernel_dsyrk_nt_l_8x8_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
5937 //	kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, n-j-4);
5938 	return;
5939 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
5940 	left_8:
5941 	j = 0;
5942 	for(; j<i; j+=4)
5943 		{
5944 		kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
5945 		}
5946 	kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, m-j);
5947 	kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, m-j-4);
5948 	return;
5949 #endif
5950 
5951 #if defined(TARGET_X64_INTEL_HASWELL)
5952 	left_4:
5953 	j = 0;
5954 	for(; j<i-8; j+=12)
5955 		{
5956 		kernel_dgemm_nt_4x12_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
5957 		}
5958 	if(j<i-4)
5959 		{
5960 		kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
5961 		j += 8;
5962 		}
5963 	else if(j<i)
5964 		{
5965 		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
5966 		j += 4;
5967 		}
5968 	kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
5969 	return;
5970 #elif defined(TARGET_X86_AMD_BARCELONA)
5971 	left_4:
5972 	j = 0;
5973 	for(; j<i; j+=4)
5974 		{
5975 		kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
5976 		kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb+2], &beta, &pC[(j+2)*ps+i*sdc], &pD[(j+2)*ps+i*sdd], m-i, m-(j+2));
5977 		}
5978 	kernel_dsyrk_nt_l_4x2_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
5979 	if(j<m-2)
5980 		kernel_dsyrk_nt_l_2x2_vs_lib4(k, &alpha, &pA[i*sda+2], &pB[j*sdb+2], &beta, &pC[(j+2)*ps+i*sdc+2], &pD[(j+2)*ps+i*sdd+2], m-(i+2), m-(j+2));
5981 	return;
5982 #else
5983 	left_4:
5984 	j = 0;
5985 	for(; j<i; j+=4)
5986 		{
5987 		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
5988 		}
5989 	kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, m-j);
5990 	return;
5991 #endif
5992 
5993 	left_4_gen:
5994 	j = 0;
5995 	for(; j<i; j+=4)
5996 		{
5997 		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
5998 		}
5999 	kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, m-j);
6000 	return;
6001 #endif
6002 
6003 	}
6004 
6005 
6006 
blasfeo_dsyrk_ln_mn(int m,int n,int k,double alpha,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dmat * sB,int bi,int bj,double beta,struct blasfeo_dmat * sC,int ci,int cj,struct blasfeo_dmat * sD,int di,int dj)6007 void blasfeo_dsyrk_ln_mn(int m, int n, int k, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, double beta, struct blasfeo_dmat *sC, int ci, int cj, struct blasfeo_dmat *sD, int di, int dj)
6008 	{
6009 	if(m<=0 | n<=0)
6010 		return;
6011 
6012 	if(ai!=0 | bi!=0)
6013 		{
6014 		printf("\nblasfeo_dsyrk_ln: feature not implemented yet: ai=%d, bi=%d\n", ai, bi);
6015 		exit(1);
6016 		}
6017 
6018 	// invalidate stored inverse diagonal of result matrix
6019 	sD->use_dA = 0;
6020 
6021 	const int ps = 4;
6022 
6023 	int i, j;
6024 
6025 	int sda = sA->cn;
6026 	int sdb = sB->cn;
6027 	int sdc = sC->cn;
6028 	int sdd = sD->cn;
6029 	double *pA = sA->pA + aj*ps;
6030 	double *pB = sB->pA + bj*ps;
6031 	double *pC = sC->pA + cj*ps + (ci-(ci&(ps-1)))*sdc;
6032 	double *pD = sD->pA + dj*ps + (di-(di&(ps-1)))*sdd;
6033 
6034 	// TODO ai and bi
6035 	int offsetC;
6036 	int offsetD;
6037 	offsetC = ci&(ps-1);
6038 	offsetD = di&(ps-1);
6039 
6040 	// main loop
6041 	i = 0;
6042 	if(offsetC==0 & offsetD==0)
6043 		{
6044 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
6045 		for(; i<m-11; i+=12)
6046 			{
6047 			j = 0;
6048 			for(; j<i & j<n-3; j+=4)
6049 				{
6050 				kernel_dgemm_nt_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
6051 				}
6052 			if(j<n)
6053 				{
6054 				if(j<i) // dgemm
6055 					{
6056 					kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
6057 					}
6058 				else // dsyrk
6059 					{
6060 					if(j<n-11)
6061 						{
6062 						kernel_dsyrk_nt_l_12x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
6063 #if defined(TARGET_X64_INTEL_HASWELL)
6064 						kernel_dsyrk_nt_l_8x8_lib4(k, &alpha, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd);
6065 #else
6066 						kernel_dsyrk_nt_l_8x4_lib4(k, &alpha, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd);
6067 						kernel_dsyrk_nt_l_4x4_lib4(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd]);
6068 #endif
6069 						}
6070 					else
6071 						{
6072 						kernel_dsyrk_nt_l_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
6073 						if(j<n-4)
6074 							{
6075 							kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, m-i-4, n-j-4);
6076 							if(j<n-8)
6077 								{
6078 								kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd], m-i-8, n-j-8);
6079 								}
6080 							}
6081 						}
6082 					}
6083 				}
6084 			}
6085 		if(m>i)
6086 			{
6087 			if(m-i<=4)
6088 				{
6089 				goto left_4;
6090 				}
6091 			else if(m-i<=8)
6092 				{
6093 				goto left_8;
6094 				}
6095 			else
6096 				{
6097 				goto left_12;
6098 				}
6099 			}
6100 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57)
6101 		for(; i<m-7; i+=8)
6102 			{
6103 			j = 0;
6104 			for(; j<i & j<n-3; j+=4)
6105 				{
6106 				kernel_dgemm_nt_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
6107 				}
6108 			if(j<n)
6109 				{
6110 				if(j<i) // dgemm
6111 					{
6112 					kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
6113 					}
6114 				else // dsyrk
6115 					{
6116 					if(j<n-7)
6117 						{
6118 						kernel_dsyrk_nt_l_8x4_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd);
6119 						kernel_dsyrk_nt_l_4x4_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd]);
6120 						}
6121 					else
6122 						{
6123 						kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
6124 						if(j<n-4)
6125 							{
6126 							kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, n-j-4);
6127 							}
6128 						}
6129 					}
6130 				}
6131 			}
6132 		if(m>i)
6133 			{
6134 			if(m-i<=4)
6135 				{
6136 				goto left_4;
6137 				}
6138 			else
6139 				{
6140 				goto left_8;
6141 				}
6142 			}
6143 #elif defined(TARGET_X86_AMD_BARCELONA)
6144 		for(; i<m-3; i+=4)
6145 			{
6146 			j = 0;
6147 			for(; j<i & j<n-3; j+=4)
6148 				{
6149 				kernel_dgemm_nt_4x2_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
6150 				kernel_dgemm_nt_4x2_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb+2], &beta, &pC[(j+2)*ps+i*sdc], &pD[(j+2)*ps+i*sdd]);
6151 				}
6152 			if(j<n)
6153 				{
6154 				if(j<i) // dgemm
6155 					{
6156 					kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
6157 					if(j<n-2)
6158 						kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb+2], &beta, &pC[(j+2)*ps+i*sdc], &pD[(j+2)*ps+i*sdd], m-i, n-(j+2));
6159 					}
6160 				else // dsyrk
6161 					{
6162 					if(j<n-3)
6163 						{
6164 						kernel_dsyrk_nt_l_4x2_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
6165 						kernel_dsyrk_nt_l_2x2_lib4(k, &alpha, &pA[i*sda+2], &pB[j*sdb+2], &beta, &pC[(j+2)*ps+i*sdc+2], &pD[(j+2)*ps+i*sdd+2]);
6166 						}
6167 					else
6168 						{
6169 						kernel_dsyrk_nt_l_4x2_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
6170 						if(j<n-2)
6171 							kernel_dsyrk_nt_l_2x2_vs_lib4(k, &alpha, &pA[i*sda+2], &pB[j*sdb+2], &beta, &pC[(j+2)*ps+i*sdc+2], &pD[(j+2)*ps+i*sdd+2], m-(i+2), n-(j+2));
6172 						}
6173 					}
6174 				}
6175 			}
6176 		if(m>i)
6177 			{
6178 			goto left_4;
6179 			}
6180 #else
6181 		for(; i<m-3; i+=4)
6182 			{
6183 			j = 0;
6184 			for(; j<i & j<n-3; j+=4)
6185 				{
6186 				kernel_dgemm_nt_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
6187 				}
6188 			if(j<n)
6189 				{
6190 				if(j<i) // dgemm
6191 					{
6192 					kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
6193 					}
6194 				else // dsyrk
6195 					{
6196 					if(j<n-3)
6197 						{
6198 						kernel_dsyrk_nt_l_4x4_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd]);
6199 						}
6200 					else
6201 						{
6202 						kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
6203 						}
6204 					}
6205 				}
6206 			}
6207 		if(m>i)
6208 			{
6209 			goto left_4;
6210 			}
6211 #endif
6212 		}
6213 	else
6214 		{
6215 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE)
6216 		for(; i<m-4; i+=8)
6217 			{
6218 			j = 0;
6219 			for(; j<i & j<n; j+=4)
6220 				{
6221 				kernel_dgemm_nt_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
6222 				}
6223 			if(j<n)
6224 				{
6225 				kernel_dsyrk_nt_l_8x4_gen_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
6226 				if(j<n-4)
6227 					{
6228 					kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, offsetC, &pC[(j+4)*ps+(i+4)*sdc], sdc, offsetD, &pD[(j+4)*ps+(i+4)*sdd], sdd, 0, m-i-4, 0, n-j-4);
6229 					}
6230 				}
6231 			}
6232 		if(m>i)
6233 			{
6234 			goto left_4_gen;
6235 			}
6236 #else
6237 		for(; i<m; i+=4)
6238 			{
6239 			j = 0;
6240 			for(; j<i & j<n; j+=4)
6241 				{
6242 				kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
6243 				}
6244 			if(j<n)
6245 				{
6246 				kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
6247 				}
6248 			}
6249 #endif
6250 		}
6251 
6252 	// common return if i==m
6253 	return;
6254 
6255 	// clean up loops definitions
6256 
6257 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
6258 	left_12:
6259 	j = 0;
6260 	for(; j<i & j<n; j+=4)
6261 		{
6262 		kernel_dgemm_nt_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
6263 		}
6264 	if(j<n)
6265 		{
6266 		kernel_dsyrk_nt_l_12x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
6267 		if(j<n-4)
6268 			{
6269 			kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], sda, &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], sdc, &pD[(j+4)*ps+(i+4)*sdd], sdd, m-i-4, n-j-4);
6270 			if(j<n-8)
6271 				{
6272 				kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+8)*sda], &pB[(j+8)*sdb], &beta, &pC[(j+8)*ps+(i+8)*sdc], &pD[(j+8)*ps+(i+8)*sdd], m-i-8, n-j-8);
6273 				}
6274 			}
6275 		}
6276 	return;
6277 #endif
6278 
6279 #if defined(TARGET_X64_INTEL_HASWELL)
6280 	left_8:
6281 	j = 0;
6282 	for(; j<i-8 & j<n-8; j+=12)
6283 		{
6284 		kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
6285 		kernel_dgemm_nt_8x8u_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[(j+4)*sdb], sdb, &beta, &pC[(j+4)*ps+i*sdc], sdc, &pD[(j+4)*ps+i*sdd], sdd, m-i, n-(j+4));
6286 		}
6287 	if(j<i-4 & j<n-4)
6288 		{
6289 		kernel_dgemm_nt_8x8l_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
6290 		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+i*sdc], &pD[(j+4)*ps+i*sdd], m-i, n-(j+4));
6291 		j += 8;
6292 		}
6293 	if(j<i & j<n)
6294 		{
6295 		kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
6296 		j += 4;
6297 		}
6298 	if(j<n)
6299 		{
6300 		kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
6301 		if(j<n-4)
6302 			{
6303 			kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, n-j-4);
6304 			}
6305 		}
6306 	return;
6307 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
6308 	left_8:
6309 	j = 0;
6310 	for(; j<i & j<n; j+=4)
6311 		{
6312 		kernel_dgemm_nt_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
6313 		}
6314 	if(j<n)
6315 		{
6316 		kernel_dsyrk_nt_l_8x4_vs_lib4(k, &alpha, &pA[i*sda], sda, &pB[j*sdb], &beta, &pC[j*ps+i*sdc], sdc, &pD[j*ps+i*sdd], sdd, m-i, n-j);
6317 		if(j<n-4)
6318 			{
6319 			kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[(i+4)*sda], &pB[(j+4)*sdb], &beta, &pC[(j+4)*ps+(i+4)*sdc], &pD[(j+4)*ps+(i+4)*sdd], m-i-4, n-j-4);
6320 			}
6321 		}
6322 	return;
6323 #endif
6324 
6325 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
6326 	left_4:
6327 	j = 0;
6328 	for(; j<i-8 & j<n-8; j+=12)
6329 		{
6330 		kernel_dgemm_nt_4x12_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
6331 		}
6332 	if(j<i-4 & j<n-4)
6333 		{
6334 		kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
6335 		j += 8;
6336 		}
6337 	else if(j<i & j<n)
6338 		{
6339 		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
6340 		j += 4;
6341 		}
6342 	if(j<n)
6343 		{
6344 		kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
6345 		}
6346 	return;
6347 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_ARMV8A_ARM_CORTEX_A57)
6348 	left_4:
6349 	j = 0;
6350 	for(; j<i-4 & j<n-4; j+=8)
6351 		{
6352 		kernel_dgemm_nt_4x8_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], sdb, &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
6353 		}
6354 	if(j<i & j<n)
6355 		{
6356 		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
6357 		j += 4;
6358 		}
6359 	if(j<n)
6360 		{
6361 		kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
6362 		}
6363 	return;
6364 #elif defined(TARGET_X86_AMD_BARCELONA)
6365 	left_4:
6366 	j = 0;
6367 	for(; j<i & j<n; j+=4)
6368 		{
6369 		kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
6370 		if(j<n-2)
6371 			kernel_dgemm_nt_4x2_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb+2], &beta, &pC[(j+2)*ps+i*sdc], &pD[(j+2)*ps+i*sdd], m-i, n-(j+2));
6372 		}
6373 	if(j<n)
6374 		{
6375 		kernel_dsyrk_nt_l_4x2_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
6376 		if(j<n-2)
6377 			kernel_dsyrk_nt_l_2x2_vs_lib4(k, &alpha, &pA[i*sda+2], &pB[j*sdb+2], &beta, &pC[(j+2)*ps+i*sdc+2], &pD[(j+2)*ps+i*sdd+2], m-(i+2), n-(j+2));
6378 		}
6379 	return;
6380 #else
6381 	left_4:
6382 	j = 0;
6383 	for(; j<i & j<n; j+=4)
6384 		{
6385 		kernel_dgemm_nt_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
6386 		}
6387 	if(j<n)
6388 		{
6389 		kernel_dsyrk_nt_l_4x4_vs_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, &pC[j*ps+i*sdc], &pD[j*ps+i*sdd], m-i, n-j);
6390 		}
6391 	return;
6392 #endif
6393 
6394 	left_4_gen:
6395 	j = 0;
6396 	for(; j<i & j<n; j+=4)
6397 		{
6398 		kernel_dgemm_nt_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
6399 		}
6400 	if(j<n)
6401 		{
6402 		kernel_dsyrk_nt_l_4x4_gen_lib4(k, &alpha, &pA[i*sda], &pB[j*sdb], &beta, offsetC, &pC[j*ps+i*sdc], sdc, offsetD, &pD[j*ps+i*sdd], sdd, 0, m-i, 0, n-j);
6403 		}
6404 	return;
6405 
6406 	}
6407 
6408 
6409 
blasfeo_dsyrk_lt(int m,int k,double alpha,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dmat * sB,int bi,int bj,double beta,struct blasfeo_dmat * sC,int ci,int cj,struct blasfeo_dmat * sD,int di,int dj)6410 void blasfeo_dsyrk_lt(int m, int k, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, double beta, struct blasfeo_dmat *sC, int ci, int cj, struct blasfeo_dmat *sD, int di, int dj)
6411 	{
6412 #ifndef BENCHMARKS_MODE
6413 	printf("\nblasfeo_dsyrk_lt: feature not implemented yet\n");
6414 	exit(1);
6415 #endif
6416 	return;
6417 	}
6418 
6419 
6420 
blasfeo_dsyrk_un(int m,int k,double alpha,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dmat * sB,int bi,int bj,double beta,struct blasfeo_dmat * sC,int ci,int cj,struct blasfeo_dmat * sD,int di,int dj)6421 void blasfeo_dsyrk_un(int m, int k, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, double beta, struct blasfeo_dmat *sC, int ci, int cj, struct blasfeo_dmat *sD, int di, int dj)
6422 	{
6423 #ifndef BENCHMARKS_MODE
6424 	printf("\nblasfeo_dsyrk_un: feature not implemented yet\n");
6425 	exit(1);
6426 #endif
6427 	return;
6428 	}
6429 
6430 
6431 
blasfeo_dsyrk_ut(int m,int k,double alpha,struct blasfeo_dmat * sA,int ai,int aj,struct blasfeo_dmat * sB,int bi,int bj,double beta,struct blasfeo_dmat * sC,int ci,int cj,struct blasfeo_dmat * sD,int di,int dj)6432 void blasfeo_dsyrk_ut(int m, int k, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dmat *sB, int bi, int bj, double beta, struct blasfeo_dmat *sC, int ci, int cj, struct blasfeo_dmat *sD, int di, int dj)
6433 	{
6434 
6435 	// fast return
6436 	if(m<=0)
6437 		return;
6438 
6439 	// invalidate stored inverse diagonal of result matrix
6440 	sD->use_dA = 0;
6441 
6442 	const int ps = 4;
6443 
6444 	int sda = sA->cn;
6445 	int sdb = sB->cn;
6446 	int sdc = sC->cn;
6447 	int sdd = sD->cn;
6448 
6449 	int air = ai & (ps-1);
6450 	int bir = bi & (ps-1);
6451 	int cir = ci & (ps-1);
6452 	int dir = di & (ps-1);
6453 
6454 	double *pA = sA->pA + aj*ps + (ai-air)*sda;
6455 	double *pB = sB->pA + bj*ps + (bi-bir)*sdb;
6456 	double *pC = sC->pA + cj*ps + (ci-cir)*sdc;
6457 	double *pD = sD->pA + dj*ps + (di-dir)*sdd;
6458 
6459 	int offsetA = air;
6460 	int offsetB = bir;
6461 	int offsetC = cir;
6462 	int offsetD = dir;
6463 
6464 	void *mem;
6465 	double *pU, *pA2;
6466 	int sdu, sda2;
6467 
6468 // TODO visual studio alignment
6469 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
6470 	ALIGNED( double pU0[3*4*K_MAX_STACK], 64 );
6471 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57)
6472 	ALIGNED( double pU0[2*4*K_MAX_STACK], 64 );
6473 #elif defined(TARGET_GENERIC)
6474 	double pU0[1*4*K_MAX_STACK];
6475 #else
6476 	ALIGNED( double pU0[1*4*K_MAX_STACK], 64 );
6477 #endif
6478 	int sdu0 = (k+3)/4*4;
6479 	sdu0 = sdu0<K_MAX_STACK ? sdu0 : K_MAX_STACK;
6480 
6481 	// allocate memory
6482 	if(k>K_MAX_STACK)
6483 		{
6484 		sdu = (k+ps-1)/ps*ps;
6485 		mem = malloc(12*sdu*sizeof(double)+63);
6486 		blasfeo_align_64_byte(mem, (void **) &pU);
6487 		}
6488 	else
6489 		{
6490 		pU = pU0;
6491 		sdu = sdu0;
6492 		}
6493 
6494 
6495 	int i, j, n1;
6496 
6497 	int idxB;
6498 
6499 
6500 	if(ci!=0 | di!=0)
6501 		{
6502 		printf("\nblasfeo_dsyrk_ut: feature not implemented yet: ci!=0 | di!=0\n");
6503 		exit(1);
6504 		}
6505 
6506 	// algorithm scheme
6507 	goto loop_00;
6508 #if 0
6509 	if(offsetC==0 & offsetD==0)
6510 		{
6511 //	printf("\n00\n");
6512 		goto loop_00;
6513 		}
6514 	else
6515 		{
6516 //	printf("\nCD\n");
6517 		goto loop_CD;
6518 		}
6519 #endif
6520 	// should never get here
6521 	goto end;
6522 
6523 
6524 
6525 	// main loop aligned
6526 loop_00:
6527 	i = 0;
6528 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
6529 	for(; i<m-11; i+=12)
6530 		{
6531 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+i*ps, sda, pU);
6532 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(i+4)*ps, sda, pU+4*sdu);
6533 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(i+8)*ps, sda, pU+8*sdu);
6534 #if defined(TARGET_X64_INTEL_HASWELL)
6535 		kernel_dsyrk_nn_u_8x8_lib4(k, &alpha, pU, sdu, offsetB, pB+i*ps, sdb, &beta, pC+i*sdc+i*ps, sdc, pD+i*sdd+i*ps, sdd);
6536 #else
6537 		kernel_dsyrk_nn_u_4x4_lib4(k, &alpha, pU, offsetB, pB+i*ps, sdb, &beta, pC+i*sdc+i*ps, pD+i*sdd+i*ps);
6538 		kernel_dsyrk_nn_u_8x4_lib4(k, &alpha, pU, sdu, offsetB, pB+(i+4)*ps, sdb, &beta, pC+i*sdc+(i+4)*ps, sdc, pD+i*sdd+(i+4)*ps, sdd);
6539 #endif
6540 		kernel_dsyrk_nn_u_12x4_lib4(k, &alpha, pU, sdu, offsetB, pB+(i+8)*ps, sdb, &beta, pC+i*sdc+(i+8)*ps, sdc, pD+i*sdd+(i+8)*ps, sdd);
6541 		for(j=i+12; j<m-3; j+=4)
6542 			{
6543 			kernel_dgemm_nn_12x4_lib4(k, &alpha, pU, sdu, offsetB, pB+j*ps, sdb, &beta, pC+i*sdc+j*ps, sdc, pD+i*sdd+j*ps, sdc);
6544 			}
6545 		if(j<m)
6546 			{
6547 			kernel_dgemm_nn_12x4_vs_lib4(k, &alpha, pU, sdu, offsetB, pB+j*ps, sdb, &beta, pC+i*sdc+j*ps, sdc, pD+i*sdd+j*ps, sdc, m-i, m-j);
6548 			}
6549 		}
6550 	if(i<m)
6551 		{
6552 		if(m-i<=4)
6553 			{
6554 			goto left_4;
6555 			}
6556 		if(m-i<=8)
6557 			{
6558 			goto left_8;
6559 			}
6560 		else
6561 			{
6562 			goto left_12;
6563 			}
6564 		}
6565 #elif defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57)
6566 	for(; i<m-7; i+=8)
6567 		{
6568 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+i*ps, sda, pU);
6569 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+(i+4)*ps, sda, pU+4*sdu);
6570 		kernel_dsyrk_nn_u_4x4_lib4(k, &alpha, pU, offsetB, pB+i*ps, sdb, &beta, pC+i*sdc+i*ps, pD+i*sdd+i*ps);
6571 		kernel_dsyrk_nn_u_8x4_lib4(k, &alpha, pU, sdu, offsetB, pB+(i+4)*ps, sdb, &beta, pC+i*sdc+(i+4)*ps, sdc, pD+i*sdd+(i+4)*ps, sdd);
6572 		for(j=i+8; j<m-3; j+=4)
6573 			{
6574 			kernel_dgemm_nn_8x4_lib4(k, &alpha, pU, sdu, offsetB, pB+j*ps, sdb, &beta, pC+i*sdc+j*ps, sdc, pD+i*sdd+j*ps, sdc);
6575 			}
6576 		if(j<m)
6577 			{
6578 			kernel_dgemm_nn_8x4_vs_lib4(k, &alpha, pU, sdu, offsetB, pB+j*ps, sdb, &beta, pC+i*sdc+j*ps, sdc, pD+i*sdd+j*ps, sdc, m-i, m-j);
6579 			}
6580 		}
6581 	if(i<m)
6582 		{
6583 		if(m-i<=4)
6584 			{
6585 			goto left_4;
6586 			}
6587 		else
6588 			{
6589 			goto left_8;
6590 			}
6591 		}
6592 #else
6593 	for(; i<m-3; i+=4)
6594 		{
6595 		kernel_dpacp_tn_4_lib4(k, offsetA, pA+i*ps, sda, pU);
6596 		kernel_dsyrk_nn_u_4x4_lib4(k, &alpha, pU, offsetB, pB+i*ps, sdb, &beta, pC+i*sdc+i*ps, pD+i*sdd+i*ps);
6597 		for(j=i+4; j<m-3; j+=4)
6598 			{
6599 			kernel_dgemm_nn_4x4_lib4(k, &alpha, pU, offsetB, pB+j*ps, sdb, &beta, pC+i*sdc+j*ps, pD+i*sdd+j*ps);
6600 			}
6601 		if(j<m)
6602 			{
6603 			kernel_dgemm_nn_4x4_vs_lib4(k, &alpha, pU, offsetB, pB+j*ps, sdb, &beta, pC+i*sdc+j*ps, pD+i*sdd+j*ps, m-i, m-j);
6604 			}
6605 		}
6606 	if(i<m)
6607 		{
6608 		goto left_4;
6609 		}
6610 #endif
6611 	goto end;
6612 
6613 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
6614 left_12:
6615 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+i*ps, sda, pU);
6616 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(i+4)*ps, sda, pU+4*sdu);
6617 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(i+8)*ps, sda, pU+8*sdu);
6618 #if defined(TARGET_X64_INTEL_HASWELL)
6619 	kernel_dsyrk_nn_u_8x8_lib4(k, &alpha, pU, sdu, offsetB, pB+i*ps, sdb, &beta, pC+i*sdc+i*ps, sdc, pD+i*sdd+i*ps, sdd);
6620 #else
6621 	kernel_dsyrk_nn_u_4x4_lib4(k, &alpha, pU, offsetB, pB+i*ps, sdb, &beta, pC+i*sdc+i*ps, pD+i*sdd+i*ps);
6622 	kernel_dsyrk_nn_u_8x4_lib4(k, &alpha, pU, sdu, offsetB, pB+(i+4)*ps, sdb, &beta, pC+i*sdc+(i+4)*ps, sdc, pD+i*sdd+(i+4)*ps, sdd);
6623 #endif
6624 	kernel_dsyrk_nn_u_12x4_vs_lib4(k, &alpha, pU, sdu, offsetB, pB+(i+8)*ps, sdb, &beta, pC+i*sdc+(i+8)*ps, sdc, pD+i*sdd+(i+8)*ps, sdd, m-i, m-i-8);
6625 	goto end;
6626 #endif
6627 
6628 #if defined(TARGET_X64_INTEL_HASWELL) | defined(TARGET_X64_INTEL_SANDY_BRIDGE) | defined(TARGET_ARMV8A_ARM_CORTEX_A57) | defined(TARGET_ARMV8A_ARM_CORTEX_A53)
6629 left_8:
6630 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+i*ps, sda, pU);
6631 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+(i+4)*ps, sda, pU+4*sdu);
6632 	kernel_dsyrk_nn_u_4x4_lib4(k, &alpha, pU, offsetB, pB+i*ps, sdb, &beta, pC+i*sdc+i*ps, pD+i*sdd+i*ps);
6633 	kernel_dsyrk_nn_u_8x4_vs_lib4(k, &alpha, pU, sdu, offsetB, pB+(i+4)*ps, sdb, &beta, pC+i*sdc+(i+4)*ps, sdc, pD+i*sdd+(i+4)*ps, sdd, m-i, m-i-4);
6634 	goto end;
6635 #endif
6636 
6637 left_4:
6638 	kernel_dpacp_tn_4_lib4(k, offsetA, pA+i*ps, sda, pU);
6639 	kernel_dsyrk_nn_u_4x4_vs_lib4(k, &alpha, pU, offsetB, pB+i*ps, sdb, &beta, pC+i*sdc+i*ps, pD+i*sdd+i*ps, m-i, m-i);
6640 	goto end;
6641 
6642 end:
6643 	if(k>K_MAX_STACK)
6644 		{
6645 		free(mem);
6646 		}
6647 	return;
6648 
6649 	}
6650 
6651 
6652 
6653 #else
6654 
6655 #error : wrong LA choice
6656 
6657 #endif
6658 
6659 
6660 
6661