1 /**************************************************************************************************
2 *                                                                                                 *
3 * This file is part of BLASFEO.                                                                   *
4 *                                                                                                 *
5 * BLASFEO -- BLAS For Embedded Optimization.                                                      *
6 * Copyright (C) 2019 by Gianluca Frison.                                                          *
7 * Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
8 * All rights reserved.                                                                            *
9 *                                                                                                 *
10 * The 2-Clause BSD License                                                                        *
11 *                                                                                                 *
12 * Redistribution and use in source and binary forms, with or without                              *
13 * modification, are permitted provided that the following conditions are met:                     *
14 *                                                                                                 *
15 * 1. Redistributions of source code must retain the above copyright notice, this                  *
16 *    list of conditions and the following disclaimer.                                             *
17 * 2. Redistributions in binary form must reproduce the above copyright notice,                    *
18 *    this list of conditions and the following disclaimer in the documentation                    *
19 *    and/or other materials provided with the distribution.                                       *
20 *                                                                                                 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND                 *
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED                   *
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE                          *
24 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR                 *
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES                  *
26 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;                    *
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND                     *
28 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT                      *
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS                   *
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                                    *
31 *                                                                                                 *
32 * Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de                             *
33 *                                                                                                 *
34 **************************************************************************************************/
35 
36 
37 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
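// Reference (plain C) kernel: computes the 4x4 block
//     D = beta[0]*C + alpha[0] * A * B^T ,
// where A is 4 x kmax with leading dimension lda, B is 4 x kmax with leading
// dimension ldb (accessed transposed), and C, D are 4x4 blocks with leading
// dimensions ldc, ldd; all matrices are column-major. The k loop is unrolled
// by 4 with a scalar clean-up loop, and the partial products are accumulated
// in the local buffer CC before the alpha/beta update.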
38 void kernel_dgemm_nt_4x4_libcccc(int kmax, double *alpha, double *A, int lda, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd)
39 	{
40 
41 	const int bs = 4;
42 
43 	double
44 		a_0, a_1, a_2, a_3,
45 		b_0, b_1, b_2, b_3;
46 
47 #if defined(TARGET_GENERIC)
48 	double CC[16] = {0};
49 #else
50 	ALIGNED( double CC[16], 64 ) = {0};
51 #endif
52 
53 	int k;
54 
55 	for(k=0; k<kmax-3; k+=4)
56 		{
57 
58 		// k = 0
59 
60 		a_0 = A[0+lda*0];
61 		a_1 = A[1+lda*0];
62 		a_2 = A[2+lda*0];
63 		a_3 = A[3+lda*0];
64 
65 		b_0 = B[0+ldb*0];
66 		b_1 = B[1+ldb*0];
67 		b_2 = B[2+ldb*0];
68 		b_3 = B[3+ldb*0];
69 
70 		CC[0+bs*0] += a_0 * b_0;
71 		CC[1+bs*0] += a_1 * b_0;
72 		CC[2+bs*0] += a_2 * b_0;
73 		CC[3+bs*0] += a_3 * b_0;
74 
75 		CC[0+bs*1] += a_0 * b_1;
76 		CC[1+bs*1] += a_1 * b_1;
77 		CC[2+bs*1] += a_2 * b_1;
78 		CC[3+bs*1] += a_3 * b_1;
79 
80 		CC[0+bs*2] += a_0 * b_2;
81 		CC[1+bs*2] += a_1 * b_2;
82 		CC[2+bs*2] += a_2 * b_2;
83 		CC[3+bs*2] += a_3 * b_2;
84 
85 		CC[0+bs*3] += a_0 * b_3;
86 		CC[1+bs*3] += a_1 * b_3;
87 		CC[2+bs*3] += a_2 * b_3;
88 		CC[3+bs*3] += a_3 * b_3;
89 
90 
91 		// k = 1
92 
93 		a_0 = A[0+lda*1];
94 		a_1 = A[1+lda*1];
95 		a_2 = A[2+lda*1];
96 		a_3 = A[3+lda*1];
97 
98 		b_0 = B[0+ldb*1];
99 		b_1 = B[1+ldb*1];
100 		b_2 = B[2+ldb*1];
101 		b_3 = B[3+ldb*1];
102 
103 		CC[0+bs*0] += a_0 * b_0;
104 		CC[1+bs*0] += a_1 * b_0;
105 		CC[2+bs*0] += a_2 * b_0;
106 		CC[3+bs*0] += a_3 * b_0;
107 
108 		CC[0+bs*1] += a_0 * b_1;
109 		CC[1+bs*1] += a_1 * b_1;
110 		CC[2+bs*1] += a_2 * b_1;
111 		CC[3+bs*1] += a_3 * b_1;
112 
113 		CC[0+bs*2] += a_0 * b_2;
114 		CC[1+bs*2] += a_1 * b_2;
115 		CC[2+bs*2] += a_2 * b_2;
116 		CC[3+bs*2] += a_3 * b_2;
117 
118 		CC[0+bs*3] += a_0 * b_3;
119 		CC[1+bs*3] += a_1 * b_3;
120 		CC[2+bs*3] += a_2 * b_3;
121 		CC[3+bs*3] += a_3 * b_3;
122 
123 
124 		// k = 2
125 
126 		a_0 = A[0+lda*2];
127 		a_1 = A[1+lda*2];
128 		a_2 = A[2+lda*2];
129 		a_3 = A[3+lda*2];
130 
131 		b_0 = B[0+ldb*2];
132 		b_1 = B[1+ldb*2];
133 		b_2 = B[2+ldb*2];
134 		b_3 = B[3+ldb*2];
135 
136 		CC[0+bs*0] += a_0 * b_0;
137 		CC[1+bs*0] += a_1 * b_0;
138 		CC[2+bs*0] += a_2 * b_0;
139 		CC[3+bs*0] += a_3 * b_0;
140 
141 		CC[0+bs*1] += a_0 * b_1;
142 		CC[1+bs*1] += a_1 * b_1;
143 		CC[2+bs*1] += a_2 * b_1;
144 		CC[3+bs*1] += a_3 * b_1;
145 
146 		CC[0+bs*2] += a_0 * b_2;
147 		CC[1+bs*2] += a_1 * b_2;
148 		CC[2+bs*2] += a_2 * b_2;
149 		CC[3+bs*2] += a_3 * b_2;
150 
151 		CC[0+bs*3] += a_0 * b_3;
152 		CC[1+bs*3] += a_1 * b_3;
153 		CC[2+bs*3] += a_2 * b_3;
154 		CC[3+bs*3] += a_3 * b_3;
155 
156 
157 		// k = 3
158 
159 		a_0 = A[0+lda*3];
160 		a_1 = A[1+lda*3];
161 		a_2 = A[2+lda*3];
162 		a_3 = A[3+lda*3];
163 
164 		b_0 = B[0+ldb*3];
165 		b_1 = B[1+ldb*3];
166 		b_2 = B[2+ldb*3];
167 		b_3 = B[3+ldb*3];
168 
169 		CC[0+bs*0] += a_0 * b_0;
170 		CC[1+bs*0] += a_1 * b_0;
171 		CC[2+bs*0] += a_2 * b_0;
172 		CC[3+bs*0] += a_3 * b_0;
173 
174 		CC[0+bs*1] += a_0 * b_1;
175 		CC[1+bs*1] += a_1 * b_1;
176 		CC[2+bs*1] += a_2 * b_1;
177 		CC[3+bs*1] += a_3 * b_1;
178 
179 		CC[0+bs*2] += a_0 * b_2;
180 		CC[1+bs*2] += a_1 * b_2;
181 		CC[2+bs*2] += a_2 * b_2;
182 		CC[3+bs*2] += a_3 * b_2;
183 
184 		CC[0+bs*3] += a_0 * b_3;
185 		CC[1+bs*3] += a_1 * b_3;
186 		CC[2+bs*3] += a_2 * b_3;
187 		CC[3+bs*3] += a_3 * b_3;
188 
189 		A += 4*lda;
190 		B += 4*ldb;
191 
192 		}
193 
194 	for(; k<kmax; k++)
195 		{
196 
197 		// k = 0
198 
199 		a_0 = A[0+lda*0];
200 		a_1 = A[1+lda*0];
201 		a_2 = A[2+lda*0];
202 		a_3 = A[3+lda*0];
203 
204 		b_0 = B[0+ldb*0];
205 		b_1 = B[1+ldb*0];
206 		b_2 = B[2+ldb*0];
207 		b_3 = B[3+ldb*0];
208 
209 		CC[0+bs*0] += a_0 * b_0;
210 		CC[1+bs*0] += a_1 * b_0;
211 		CC[2+bs*0] += a_2 * b_0;
212 		CC[3+bs*0] += a_3 * b_0;
213 
214 		CC[0+bs*1] += a_0 * b_1;
215 		CC[1+bs*1] += a_1 * b_1;
216 		CC[2+bs*1] += a_2 * b_1;
217 		CC[3+bs*1] += a_3 * b_1;
218 
219 		CC[0+bs*2] += a_0 * b_2;
220 		CC[1+bs*2] += a_1 * b_2;
221 		CC[2+bs*2] += a_2 * b_2;
222 		CC[3+bs*2] += a_3 * b_2;
223 
224 		CC[0+bs*3] += a_0 * b_3;
225 		CC[1+bs*3] += a_1 * b_3;
226 		CC[2+bs*3] += a_2 * b_3;
227 		CC[3+bs*3] += a_3 * b_3;
228 
229 		A += 1*lda;
230 		B += 1*ldb;
231 
232 		}
233 
234 	D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
235 	D[1+ldd*0] = beta[0]*C[1+ldc*0] + alpha[0]*CC[1+bs*0];
236 	D[2+ldd*0] = beta[0]*C[2+ldc*0] + alpha[0]*CC[2+bs*0];
237 	D[3+ldd*0] = beta[0]*C[3+ldc*0] + alpha[0]*CC[3+bs*0];
238 
239 	D[0+ldd*1] = beta[0]*C[0+ldc*1] + alpha[0]*CC[0+bs*1];
240 	D[1+ldd*1] = beta[0]*C[1+ldc*1] + alpha[0]*CC[1+bs*1];
241 	D[2+ldd*1] = beta[0]*C[2+ldc*1] + alpha[0]*CC[2+bs*1];
242 	D[3+ldd*1] = beta[0]*C[3+ldc*1] + alpha[0]*CC[3+bs*1];
243 
244 	D[0+ldd*2] = beta[0]*C[0+ldc*2] + alpha[0]*CC[0+bs*2];
245 	D[1+ldd*2] = beta[0]*C[1+ldc*2] + alpha[0]*CC[1+bs*2];
246 	D[2+ldd*2] = beta[0]*C[2+ldc*2] + alpha[0]*CC[2+bs*2];
247 	D[3+ldd*2] = beta[0]*C[3+ldc*2] + alpha[0]*CC[3+bs*2];
248 
249 	D[0+ldd*3] = beta[0]*C[0+ldc*3] + alpha[0]*CC[0+bs*3];
250 	D[1+ldd*3] = beta[0]*C[1+ldc*3] + alpha[0]*CC[1+bs*3];
251 	D[2+ldd*3] = beta[0]*C[2+ldc*3] + alpha[0]*CC[2+bs*3];
252 	D[3+ldd*3] = beta[0]*C[3+ldc*3] + alpha[0]*CC[3+bs*3];
253 
254 	return;
255 
256 	}
257 #endif
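/*
 Example call (a minimal sketch, not part of the library; the buffer names
 and sizes below are illustrative only). With column-major A (4 x 8),
 B (4 x 8) and 4 x 4 blocks C and D:

	double alpha = 1.0, beta = 1.0;
	double A[4*8], B[4*8], C[4*4], D[4*4];
	// ... fill A, B and C ...
	kernel_dgemm_nt_4x4_libcccc(8, &alpha, A, 4, B, 4, &beta, C, 4, D, 4);
	// now D = beta*C + alpha * A * B^T
*/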
258 
259 
260 
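// The following static kernels (4x3, 4x2, 4x1) are narrower variants of the
// 4x4 kernel above: they read only the first 3, 2 or 1 rows of B (i.e. the
// first columns of B^T), so that kernel_dgemm_nt_4x4_vs_libcccc can handle
// blocks with n1 < 4 safely.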
261 static void kernel_dgemm_nt_4x3_libcccc(int kmax, double *alpha, double *A, int lda, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd)
262 	{
263 
264 	const int bs = 4;
265 
266 	double
267 		a_0, a_1, a_2, a_3,
268 		b_0, b_1, b_2;
269 
270 #if defined(TARGET_GENERIC)
271 	double CC[16] = {0};
272 #else
273 	ALIGNED( double CC[16], 64 ) = {0};
274 #endif
275 
276 	int k;
277 
278 	for(k=0; k<kmax-3; k+=4)
279 		{
280 
281 		// k = 0
282 
283 		a_0 = A[0+lda*0];
284 		a_1 = A[1+lda*0];
285 		a_2 = A[2+lda*0];
286 		a_3 = A[3+lda*0];
287 
288 		b_0 = B[0+ldb*0];
289 		b_1 = B[1+ldb*0];
290 		b_2 = B[2+ldb*0];
291 
292 		CC[0+bs*0] += a_0 * b_0;
293 		CC[1+bs*0] += a_1 * b_0;
294 		CC[2+bs*0] += a_2 * b_0;
295 		CC[3+bs*0] += a_3 * b_0;
296 
297 		CC[0+bs*1] += a_0 * b_1;
298 		CC[1+bs*1] += a_1 * b_1;
299 		CC[2+bs*1] += a_2 * b_1;
300 		CC[3+bs*1] += a_3 * b_1;
301 
302 		CC[0+bs*2] += a_0 * b_2;
303 		CC[1+bs*2] += a_1 * b_2;
304 		CC[2+bs*2] += a_2 * b_2;
305 		CC[3+bs*2] += a_3 * b_2;
306 
307 
308 		// k = 1
309 
310 		a_0 = A[0+lda*1];
311 		a_1 = A[1+lda*1];
312 		a_2 = A[2+lda*1];
313 		a_3 = A[3+lda*1];
314 
315 		b_0 = B[0+ldb*1];
316 		b_1 = B[1+ldb*1];
317 		b_2 = B[2+ldb*1];
318 
319 		CC[0+bs*0] += a_0 * b_0;
320 		CC[1+bs*0] += a_1 * b_0;
321 		CC[2+bs*0] += a_2 * b_0;
322 		CC[3+bs*0] += a_3 * b_0;
323 
324 		CC[0+bs*1] += a_0 * b_1;
325 		CC[1+bs*1] += a_1 * b_1;
326 		CC[2+bs*1] += a_2 * b_1;
327 		CC[3+bs*1] += a_3 * b_1;
328 
329 		CC[0+bs*2] += a_0 * b_2;
330 		CC[1+bs*2] += a_1 * b_2;
331 		CC[2+bs*2] += a_2 * b_2;
332 		CC[3+bs*2] += a_3 * b_2;
333 
334 
335 		// k = 2
336 
337 		a_0 = A[0+lda*2];
338 		a_1 = A[1+lda*2];
339 		a_2 = A[2+lda*2];
340 		a_3 = A[3+lda*2];
341 
342 		b_0 = B[0+ldb*2];
343 		b_1 = B[1+ldb*2];
344 		b_2 = B[2+ldb*2];
345 
346 		CC[0+bs*0] += a_0 * b_0;
347 		CC[1+bs*0] += a_1 * b_0;
348 		CC[2+bs*0] += a_2 * b_0;
349 		CC[3+bs*0] += a_3 * b_0;
350 
351 		CC[0+bs*1] += a_0 * b_1;
352 		CC[1+bs*1] += a_1 * b_1;
353 		CC[2+bs*1] += a_2 * b_1;
354 		CC[3+bs*1] += a_3 * b_1;
355 
356 		CC[0+bs*2] += a_0 * b_2;
357 		CC[1+bs*2] += a_1 * b_2;
358 		CC[2+bs*2] += a_2 * b_2;
359 		CC[3+bs*2] += a_3 * b_2;
360 
361 
362 		// k = 3
363 
364 		a_0 = A[0+lda*3];
365 		a_1 = A[1+lda*3];
366 		a_2 = A[2+lda*3];
367 		a_3 = A[3+lda*3];
368 
369 		b_0 = B[0+ldb*3];
370 		b_1 = B[1+ldb*3];
371 		b_2 = B[2+ldb*3];
372 
373 		CC[0+bs*0] += a_0 * b_0;
374 		CC[1+bs*0] += a_1 * b_0;
375 		CC[2+bs*0] += a_2 * b_0;
376 		CC[3+bs*0] += a_3 * b_0;
377 
378 		CC[0+bs*1] += a_0 * b_1;
379 		CC[1+bs*1] += a_1 * b_1;
380 		CC[2+bs*1] += a_2 * b_1;
381 		CC[3+bs*1] += a_3 * b_1;
382 
383 		CC[0+bs*2] += a_0 * b_2;
384 		CC[1+bs*2] += a_1 * b_2;
385 		CC[2+bs*2] += a_2 * b_2;
386 		CC[3+bs*2] += a_3 * b_2;
387 
388 		A += 4*lda;
389 		B += 4*ldb;
390 
391 		}
392 
393 	for(; k<kmax; k++)
394 		{
395 
396 		// k = 0
397 
398 		a_0 = A[0+lda*0];
399 		a_1 = A[1+lda*0];
400 		a_2 = A[2+lda*0];
401 		a_3 = A[3+lda*0];
402 
403 		b_0 = B[0+ldb*0];
404 		b_1 = B[1+ldb*0];
405 		b_2 = B[2+ldb*0];
406 
407 		CC[0+bs*0] += a_0 * b_0;
408 		CC[1+bs*0] += a_1 * b_0;
409 		CC[2+bs*0] += a_2 * b_0;
410 		CC[3+bs*0] += a_3 * b_0;
411 
412 		CC[0+bs*1] += a_0 * b_1;
413 		CC[1+bs*1] += a_1 * b_1;
414 		CC[2+bs*1] += a_2 * b_1;
415 		CC[3+bs*1] += a_3 * b_1;
416 
417 		CC[0+bs*2] += a_0 * b_2;
418 		CC[1+bs*2] += a_1 * b_2;
419 		CC[2+bs*2] += a_2 * b_2;
420 		CC[3+bs*2] += a_3 * b_2;
421 
422 		A += 1*lda;
423 		B += 1*ldb;
424 
425 		}
426 
427 	D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
428 	D[1+ldd*0] = beta[0]*C[1+ldc*0] + alpha[0]*CC[1+bs*0];
429 	D[2+ldd*0] = beta[0]*C[2+ldc*0] + alpha[0]*CC[2+bs*0];
430 	D[3+ldd*0] = beta[0]*C[3+ldc*0] + alpha[0]*CC[3+bs*0];
431 
432 	D[0+ldd*1] = beta[0]*C[0+ldc*1] + alpha[0]*CC[0+bs*1];
433 	D[1+ldd*1] = beta[0]*C[1+ldc*1] + alpha[0]*CC[1+bs*1];
434 	D[2+ldd*1] = beta[0]*C[2+ldc*1] + alpha[0]*CC[2+bs*1];
435 	D[3+ldd*1] = beta[0]*C[3+ldc*1] + alpha[0]*CC[3+bs*1];
436 
437 	D[0+ldd*2] = beta[0]*C[0+ldc*2] + alpha[0]*CC[0+bs*2];
438 	D[1+ldd*2] = beta[0]*C[1+ldc*2] + alpha[0]*CC[1+bs*2];
439 	D[2+ldd*2] = beta[0]*C[2+ldc*2] + alpha[0]*CC[2+bs*2];
440 	D[3+ldd*2] = beta[0]*C[3+ldc*2] + alpha[0]*CC[3+bs*2];
441 
442 	return;
443 
444 	}
445 
446 
447 
448 static void kernel_dgemm_nt_4x2_libcccc(int kmax, double *alpha, double *A, int lda, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd)
449 	{
450 
451 	const int bs = 4;
452 
453 	double
454 		a_0, a_1, a_2, a_3,
455 		b_0, b_1;
456 
457 #if defined(TARGET_GENERIC)
458 	double CC[16] = {0};
459 #else
460 	ALIGNED( double CC[16], 64 ) = {0};
461 #endif
462 
463 	int k;
464 
465 	for(k=0; k<kmax-3; k+=4)
466 		{
467 
468 		// k = 0
469 
470 		a_0 = A[0+lda*0];
471 		a_1 = A[1+lda*0];
472 		a_2 = A[2+lda*0];
473 		a_3 = A[3+lda*0];
474 
475 		b_0 = B[0+ldb*0];
476 		b_1 = B[1+ldb*0];
477 
478 		CC[0+bs*0] += a_0 * b_0;
479 		CC[1+bs*0] += a_1 * b_0;
480 		CC[2+bs*0] += a_2 * b_0;
481 		CC[3+bs*0] += a_3 * b_0;
482 
483 		CC[0+bs*1] += a_0 * b_1;
484 		CC[1+bs*1] += a_1 * b_1;
485 		CC[2+bs*1] += a_2 * b_1;
486 		CC[3+bs*1] += a_3 * b_1;
487 
488 
489 		// k = 1
490 
491 		a_0 = A[0+lda*1];
492 		a_1 = A[1+lda*1];
493 		a_2 = A[2+lda*1];
494 		a_3 = A[3+lda*1];
495 
496 		b_0 = B[0+ldb*1];
497 		b_1 = B[1+ldb*1];
498 
499 		CC[0+bs*0] += a_0 * b_0;
500 		CC[1+bs*0] += a_1 * b_0;
501 		CC[2+bs*0] += a_2 * b_0;
502 		CC[3+bs*0] += a_3 * b_0;
503 
504 		CC[0+bs*1] += a_0 * b_1;
505 		CC[1+bs*1] += a_1 * b_1;
506 		CC[2+bs*1] += a_2 * b_1;
507 		CC[3+bs*1] += a_3 * b_1;
508 
509 
510 		// k = 2
511 
512 		a_0 = A[0+lda*2];
513 		a_1 = A[1+lda*2];
514 		a_2 = A[2+lda*2];
515 		a_3 = A[3+lda*2];
516 
517 		b_0 = B[0+ldb*2];
518 		b_1 = B[1+ldb*2];
519 
520 		CC[0+bs*0] += a_0 * b_0;
521 		CC[1+bs*0] += a_1 * b_0;
522 		CC[2+bs*0] += a_2 * b_0;
523 		CC[3+bs*0] += a_3 * b_0;
524 
525 		CC[0+bs*1] += a_0 * b_1;
526 		CC[1+bs*1] += a_1 * b_1;
527 		CC[2+bs*1] += a_2 * b_1;
528 		CC[3+bs*1] += a_3 * b_1;
529 
530 
531 		// k = 3
532 
533 		a_0 = A[0+lda*3];
534 		a_1 = A[1+lda*3];
535 		a_2 = A[2+lda*3];
536 		a_3 = A[3+lda*3];
537 
538 		b_0 = B[0+ldb*3];
539 		b_1 = B[1+ldb*3];
540 
541 		CC[0+bs*0] += a_0 * b_0;
542 		CC[1+bs*0] += a_1 * b_0;
543 		CC[2+bs*0] += a_2 * b_0;
544 		CC[3+bs*0] += a_3 * b_0;
545 
546 		CC[0+bs*1] += a_0 * b_1;
547 		CC[1+bs*1] += a_1 * b_1;
548 		CC[2+bs*1] += a_2 * b_1;
549 		CC[3+bs*1] += a_3 * b_1;
550 
551 		A += 4*lda;
552 		B += 4*ldb;
553 
554 		}
555 
556 	for(; k<kmax; k++)
557 		{
558 
559 		// k = 0
560 
561 		a_0 = A[0+lda*0];
562 		a_1 = A[1+lda*0];
563 		a_2 = A[2+lda*0];
564 		a_3 = A[3+lda*0];
565 
566 		b_0 = B[0+ldb*0];
567 		b_1 = B[1+ldb*0];
568 
569 		CC[0+bs*0] += a_0 * b_0;
570 		CC[1+bs*0] += a_1 * b_0;
571 		CC[2+bs*0] += a_2 * b_0;
572 		CC[3+bs*0] += a_3 * b_0;
573 
574 		CC[0+bs*1] += a_0 * b_1;
575 		CC[1+bs*1] += a_1 * b_1;
576 		CC[2+bs*1] += a_2 * b_1;
577 		CC[3+bs*1] += a_3 * b_1;
578 
579 		A += 1*lda;
580 		B += 1*ldb;
581 
582 		}
583 
584 	D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
585 	D[1+ldd*0] = beta[0]*C[1+ldc*0] + alpha[0]*CC[1+bs*0];
586 	D[2+ldd*0] = beta[0]*C[2+ldc*0] + alpha[0]*CC[2+bs*0];
587 	D[3+ldd*0] = beta[0]*C[3+ldc*0] + alpha[0]*CC[3+bs*0];
588 
589 	D[0+ldd*1] = beta[0]*C[0+ldc*1] + alpha[0]*CC[0+bs*1];
590 	D[1+ldd*1] = beta[0]*C[1+ldc*1] + alpha[0]*CC[1+bs*1];
591 	D[2+ldd*1] = beta[0]*C[2+ldc*1] + alpha[0]*CC[2+bs*1];
592 	D[3+ldd*1] = beta[0]*C[3+ldc*1] + alpha[0]*CC[3+bs*1];
593 
594 	return;
595 
596 	}
597 
598 
599 
600 static void kernel_dgemm_nt_4x1_libcccc(int kmax, double *alpha, double *A, int lda, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd)
601 	{
602 
603 	const int bs = 4;
604 
605 	double
606 		a_0, a_1, a_2, a_3,
607 		b_0;
608 
609 #if defined(TARGET_GENERIC)
610 	double CC[16] = {0};
611 #else
612 	ALIGNED( double CC[16], 64 ) = {0};
613 #endif
614 
615 	int k;
616 
617 	for(k=0; k<kmax-3; k+=4)
618 		{
619 
620 		// k = 0
621 
622 		a_0 = A[0+lda*0];
623 		a_1 = A[1+lda*0];
624 		a_2 = A[2+lda*0];
625 		a_3 = A[3+lda*0];
626 
627 		b_0 = B[0+ldb*0];
628 
629 		CC[0+bs*0] += a_0 * b_0;
630 		CC[1+bs*0] += a_1 * b_0;
631 		CC[2+bs*0] += a_2 * b_0;
632 		CC[3+bs*0] += a_3 * b_0;
633 
634 
635 		// k = 1
636 
637 		a_0 = A[0+lda*1];
638 		a_1 = A[1+lda*1];
639 		a_2 = A[2+lda*1];
640 		a_3 = A[3+lda*1];
641 
642 		b_0 = B[0+ldb*1];
643 
644 		CC[0+bs*0] += a_0 * b_0;
645 		CC[1+bs*0] += a_1 * b_0;
646 		CC[2+bs*0] += a_2 * b_0;
647 		CC[3+bs*0] += a_3 * b_0;
648 
649 
650 		// k = 2
651 
652 		a_0 = A[0+lda*2];
653 		a_1 = A[1+lda*2];
654 		a_2 = A[2+lda*2];
655 		a_3 = A[3+lda*2];
656 
657 		b_0 = B[0+ldb*2];
658 
659 		CC[0+bs*0] += a_0 * b_0;
660 		CC[1+bs*0] += a_1 * b_0;
661 		CC[2+bs*0] += a_2 * b_0;
662 		CC[3+bs*0] += a_3 * b_0;
663 
664 
665 		// k = 3
666 
667 		a_0 = A[0+lda*3];
668 		a_1 = A[1+lda*3];
669 		a_2 = A[2+lda*3];
670 		a_3 = A[3+lda*3];
671 
672 		b_0 = B[0+ldb*3];
673 
674 		CC[0+bs*0] += a_0 * b_0;
675 		CC[1+bs*0] += a_1 * b_0;
676 		CC[2+bs*0] += a_2 * b_0;
677 		CC[3+bs*0] += a_3 * b_0;
678 
679 		A += 4*lda;
680 		B += 4*ldb;
681 
682 		}
683 
684 	for(; k<kmax; k++)
685 		{
686 
687 		// k = 0
688 
689 		a_0 = A[0+lda*0];
690 		a_1 = A[1+lda*0];
691 		a_2 = A[2+lda*0];
692 		a_3 = A[3+lda*0];
693 
694 		b_0 = B[0+ldb*0];
695 
696 		CC[0+bs*0] += a_0 * b_0;
697 		CC[1+bs*0] += a_1 * b_0;
698 		CC[2+bs*0] += a_2 * b_0;
699 		CC[3+bs*0] += a_3 * b_0;
700 
701 		A += 1*lda;
702 		B += 1*ldb;
703 
704 		}
705 
706 	D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
707 	D[1+ldd*0] = beta[0]*C[1+ldc*0] + alpha[0]*CC[1+bs*0];
708 	D[2+ldd*0] = beta[0]*C[2+ldc*0] + alpha[0]*CC[2+bs*0];
709 	D[3+ldd*0] = beta[0]*C[3+ldc*0] + alpha[0]*CC[3+bs*0];
710 
711 	return;
712 
713 	}
714 
715 
716 
717 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
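// Variable-size variant for edge blocks: accumulates into the local buffer CC
// (with alpha = 1, beta = 0) using the kernel matching n1 (4x1 ... 4x4), then
// applies the caller's alpha/beta and stores only the top-left m1 x n1 part
// of the 4x4 block into D.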
718 void kernel_dgemm_nt_4x4_vs_libcccc(int kmax, double *alpha, double *A, int lda, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1)
719 	{
720 
721 	const int bs = 4;
722 
723 #if defined(TARGET_GENERIC)
724 	double CC[16] = {0};
725 #else
726 	ALIGNED( double CC[16], 64 ) = {0};
727 #endif
728 
729 	double alpha1 = 1.0;
730 	double beta1 = 0.0;
731 
732 	if(n1<=1)
733 		{
734 		kernel_dgemm_nt_4x1_libcccc(kmax, &alpha1, A, lda, B, ldb, &beta1, CC, bs, CC, bs);
735 		}
736 	else if(n1==2)
737 		{
738 		kernel_dgemm_nt_4x2_libcccc(kmax, &alpha1, A, lda, B, ldb, &beta1, CC, bs, CC, bs);
739 		}
740 	else if(n1==3)
741 		{
742 		kernel_dgemm_nt_4x3_libcccc(kmax, &alpha1, A, lda, B, ldb, &beta1, CC, bs, CC, bs);
743 		}
744 	else //if(n1==4)
745 		{
746 		kernel_dgemm_nt_4x4_libcccc(kmax, &alpha1, A, lda, B, ldb, &beta1, CC, bs, CC, bs);
747 		}
748 
749 	if(m1>=4)
750 		{
751 		D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
752 		D[1+ldd*0] = beta[0]*C[1+ldc*0] + alpha[0]*CC[1+bs*0];
753 		D[2+ldd*0] = beta[0]*C[2+ldc*0] + alpha[0]*CC[2+bs*0];
754 		D[3+ldd*0] = beta[0]*C[3+ldc*0] + alpha[0]*CC[3+bs*0];
755 
756 		if(n1==1)
757 			return;
758 
759 		D[0+ldd*1] = beta[0]*C[0+ldc*1] + alpha[0]*CC[0+bs*1];
760 		D[1+ldd*1] = beta[0]*C[1+ldc*1] + alpha[0]*CC[1+bs*1];
761 		D[2+ldd*1] = beta[0]*C[2+ldc*1] + alpha[0]*CC[2+bs*1];
762 		D[3+ldd*1] = beta[0]*C[3+ldc*1] + alpha[0]*CC[3+bs*1];
763 
764 		if(n1==2)
765 			return;
766 
767 		D[0+ldd*2] = beta[0]*C[0+ldc*2] + alpha[0]*CC[0+bs*2];
768 		D[1+ldd*2] = beta[0]*C[1+ldc*2] + alpha[0]*CC[1+bs*2];
769 		D[2+ldd*2] = beta[0]*C[2+ldc*2] + alpha[0]*CC[2+bs*2];
770 		D[3+ldd*2] = beta[0]*C[3+ldc*2] + alpha[0]*CC[3+bs*2];
771 
772 		if(n1==3)
773 			return;
774 
775 		D[0+ldd*3] = beta[0]*C[0+ldc*3] + alpha[0]*CC[0+bs*3];
776 		D[1+ldd*3] = beta[0]*C[1+ldc*3] + alpha[0]*CC[1+bs*3];
777 		D[2+ldd*3] = beta[0]*C[2+ldc*3] + alpha[0]*CC[2+bs*3];
778 		D[3+ldd*3] = beta[0]*C[3+ldc*3] + alpha[0]*CC[3+bs*3];
779 		}
780 	else if(m1>=3)
781 		{
782 		D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
783 		D[1+ldd*0] = beta[0]*C[1+ldc*0] + alpha[0]*CC[1+bs*0];
784 		D[2+ldd*0] = beta[0]*C[2+ldc*0] + alpha[0]*CC[2+bs*0];
785 
786 		if(n1==1)
787 			return;
788 
789 		D[0+ldd*1] = beta[0]*C[0+ldc*1] + alpha[0]*CC[0+bs*1];
790 		D[1+ldd*1] = beta[0]*C[1+ldc*1] + alpha[0]*CC[1+bs*1];
791 		D[2+ldd*1] = beta[0]*C[2+ldc*1] + alpha[0]*CC[2+bs*1];
792 
793 		if(n1==2)
794 			return;
795 
796 		D[0+ldd*2] = beta[0]*C[0+ldc*2] + alpha[0]*CC[0+bs*2];
797 		D[1+ldd*2] = beta[0]*C[1+ldc*2] + alpha[0]*CC[1+bs*2];
798 		D[2+ldd*2] = beta[0]*C[2+ldc*2] + alpha[0]*CC[2+bs*2];
799 
800 		if(n1==3)
801 			return;
802 
803 		D[0+ldd*3] = beta[0]*C[0+ldc*3] + alpha[0]*CC[0+bs*3];
804 		D[1+ldd*3] = beta[0]*C[1+ldc*3] + alpha[0]*CC[1+bs*3];
805 		D[2+ldd*3] = beta[0]*C[2+ldc*3] + alpha[0]*CC[2+bs*3];
806 		}
807 	else if(m1>=2)
808 		{
809 		D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
810 		D[1+ldd*0] = beta[0]*C[1+ldc*0] + alpha[0]*CC[1+bs*0];
811 
812 		if(n1==1)
813 			return;
814 
815 		D[0+ldd*1] = beta[0]*C[0+ldc*1] + alpha[0]*CC[0+bs*1];
816 		D[1+ldd*1] = beta[0]*C[1+ldc*1] + alpha[0]*CC[1+bs*1];
817 
818 		if(n1==2)
819 			return;
820 
821 		D[0+ldd*2] = beta[0]*C[0+ldc*2] + alpha[0]*CC[0+bs*2];
822 		D[1+ldd*2] = beta[0]*C[1+ldc*2] + alpha[0]*CC[1+bs*2];
823 
824 		if(n1==3)
825 			return;
826 
827 		D[0+ldd*3] = beta[0]*C[0+ldc*3] + alpha[0]*CC[0+bs*3];
828 		D[1+ldd*3] = beta[0]*C[1+ldc*3] + alpha[0]*CC[1+bs*3];
829 		}
830 	else //if(m1>=1)
831 		{
832 		D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
833 
834 		if(n1==1)
835 			return;
836 
837 		D[0+ldd*1] = beta[0]*C[0+ldc*1] + alpha[0]*CC[0+bs*1];
838 
839 		if(n1==2)
840 			return;
841 
842 		D[0+ldd*2] = beta[0]*C[0+ldc*2] + alpha[0]*CC[0+bs*2];
843 
844 		if(n1==3)
845 			return;
846 
847 		D[0+ldd*3] = beta[0]*C[0+ldc*3] + alpha[0]*CC[0+bs*3];
848 		}
849 
850 	return;
851 
852 	}
853 #endif
854 
855 
856 
857 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9)
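// Wrapper: A is stored as a packed 4-row panel (equivalent to column-major
// with lda = 4); B, C and D are plain column-major.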
858 void kernel_dgemm_nt_4x4_lib4ccc(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd)
859 	{
860 
861 	kernel_dgemm_nt_4x4_libcccc(kmax, alpha, A, 4, B, ldb, beta, C, ldc, D, ldd);
862 
863 	return;
864 
865 	}
866 #endif
867 
868 
869 
870 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9)
871 void kernel_dgemm_nt_4x4_vs_lib4ccc(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1)
872 	{
873 
874 	kernel_dgemm_nt_4x4_vs_libcccc(kmax, alpha, A, 4, B, ldb, beta, C, ldc, D, ldd, m1, n1);
875 
876 	return;
877 
878 	}
879 #endif
880 
881 
882 
883 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9)
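// Wrapper: B is stored as a packed 4-row panel (ldb = 4); A, C and D are
// plain column-major.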
884 void kernel_dgemm_nt_4x4_libc4cc(int kmax, double *alpha, double *A, int lda, double *B, double *beta, double *C, int ldc, double *D, int ldd)
885 	{
886 
887 	kernel_dgemm_nt_4x4_libcccc(kmax, alpha, A, lda, B, 4, beta, C, ldc, D, ldd);
888 
889 	return;
890 
891 	}
892 #endif
893 
894 
895 
896 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9)
897 void kernel_dgemm_nt_4x4_vs_libc4cc(int kmax, double *alpha, double *A, int lda, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1)
898 	{
899 
900 	kernel_dgemm_nt_4x4_vs_libcccc(kmax, alpha, A, lda, B, 4, beta, C, ldc, D, ldd, m1, n1);
901 
902 	return;
903 
904 	}
905 #endif
906 
907 
908 
909 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA)
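// Both A and B are stored as packed 4-row panels: the accumulation is done by
// the panel-major kernel kernel_dgemm_nt_4x4_lib4 (with alpha = 1, beta = 0),
// and the alpha/beta update is applied here against column-major C and D. On
// TARGET_X86_AMD_BARCELONA the call is split into two 4x2 panel kernels.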
910 void kernel_dgemm_nt_4x4_lib44cc(int kmax, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd)
911 	{
912 
913 #if defined(TARGET_X86_AMD_BARCELONA)
914 	kernel_dgemm_nt_4x2_lib44cc(kmax, alpha, A, B+0, beta, C+0*ldc, ldc, D+0*ldd, ldd);
915 	kernel_dgemm_nt_4x2_lib44cc(kmax, alpha, A, B+2, beta, C+2*ldc, ldc, D+2*ldd, ldd);
916 	return;
917 #endif
918 
919 	const int bs = 4;
920 
921 #if defined(TARGET_GENERIC)
922 	double CC[16] = {0};
923 #else
924 	ALIGNED( double CC[16], 64 ) = {0};
925 #endif
926 
927 	double alpha1 = 1.0;
928 	double beta1 = 0.0;
929 
930 	kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, &beta1, CC, CC);
931 
932 	D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
933 	D[1+ldd*0] = beta[0]*C[1+ldc*0] + alpha[0]*CC[1+bs*0];
934 	D[2+ldd*0] = beta[0]*C[2+ldc*0] + alpha[0]*CC[2+bs*0];
935 	D[3+ldd*0] = beta[0]*C[3+ldc*0] + alpha[0]*CC[3+bs*0];
936 
937 	D[0+ldd*1] = beta[0]*C[0+ldc*1] + alpha[0]*CC[0+bs*1];
938 	D[1+ldd*1] = beta[0]*C[1+ldc*1] + alpha[0]*CC[1+bs*1];
939 	D[2+ldd*1] = beta[0]*C[2+ldc*1] + alpha[0]*CC[2+bs*1];
940 	D[3+ldd*1] = beta[0]*C[3+ldc*1] + alpha[0]*CC[3+bs*1];
941 
942 	D[0+ldd*2] = beta[0]*C[0+ldc*2] + alpha[0]*CC[0+bs*2];
943 	D[1+ldd*2] = beta[0]*C[1+ldc*2] + alpha[0]*CC[1+bs*2];
944 	D[2+ldd*2] = beta[0]*C[2+ldc*2] + alpha[0]*CC[2+bs*2];
945 	D[3+ldd*2] = beta[0]*C[3+ldc*2] + alpha[0]*CC[3+bs*2];
946 
947 	D[0+ldd*3] = beta[0]*C[0+ldc*3] + alpha[0]*CC[0+bs*3];
948 	D[1+ldd*3] = beta[0]*C[1+ldc*3] + alpha[0]*CC[1+bs*3];
949 	D[2+ldd*3] = beta[0]*C[2+ldc*3] + alpha[0]*CC[2+bs*3];
950 	D[3+ldd*3] = beta[0]*C[3+ldc*3] + alpha[0]*CC[3+bs*3];
951 
952 	return;
953 
954 	}
955 #endif
956 
957 
958 
959 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9)
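// Variable-size version of the panel-major kernel above: same accumulation in
// CC, but only the top-left m1 x n1 part of the block is scaled and stored.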
960 void kernel_dgemm_nt_4x4_vs_lib44cc(int kmax, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1)
961 	{
962 
963 	const int bs = 4;
964 
965 #if defined(TARGET_GENERIC)
966 	double CC[16] = {0};
967 #else
968 	ALIGNED( double CC[16], 64 ) = {0};
969 #endif
970 
971 	double alpha1 = 1.0;
972 	double beta1 = 0.0;
973 
974 	kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, &beta1, CC, CC);
975 
976 	if(m1>=4)
977 		{
978 		D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
979 		D[1+ldd*0] = beta[0]*C[1+ldc*0] + alpha[0]*CC[1+bs*0];
980 		D[2+ldd*0] = beta[0]*C[2+ldc*0] + alpha[0]*CC[2+bs*0];
981 		D[3+ldd*0] = beta[0]*C[3+ldc*0] + alpha[0]*CC[3+bs*0];
982 
983 		if(n1==1)
984 			return;
985 
986 		D[0+ldd*1] = beta[0]*C[0+ldc*1] + alpha[0]*CC[0+bs*1];
987 		D[1+ldd*1] = beta[0]*C[1+ldc*1] + alpha[0]*CC[1+bs*1];
988 		D[2+ldd*1] = beta[0]*C[2+ldc*1] + alpha[0]*CC[2+bs*1];
989 		D[3+ldd*1] = beta[0]*C[3+ldc*1] + alpha[0]*CC[3+bs*1];
990 
991 		if(n1==2)
992 			return;
993 
994 		D[0+ldd*2] = beta[0]*C[0+ldc*2] + alpha[0]*CC[0+bs*2];
995 		D[1+ldd*2] = beta[0]*C[1+ldc*2] + alpha[0]*CC[1+bs*2];
996 		D[2+ldd*2] = beta[0]*C[2+ldc*2] + alpha[0]*CC[2+bs*2];
997 		D[3+ldd*2] = beta[0]*C[3+ldc*2] + alpha[0]*CC[3+bs*2];
998 
999 		if(n1==3)
1000 			return;
1001 
1002 		D[0+ldd*3] = beta[0]*C[0+ldc*3] + alpha[0]*CC[0+bs*3];
1003 		D[1+ldd*3] = beta[0]*C[1+ldc*3] + alpha[0]*CC[1+bs*3];
1004 		D[2+ldd*3] = beta[0]*C[2+ldc*3] + alpha[0]*CC[2+bs*3];
1005 		D[3+ldd*3] = beta[0]*C[3+ldc*3] + alpha[0]*CC[3+bs*3];
1006 		}
1007 	else if(m1>=3)
1008 		{
1009 		D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
1010 		D[1+ldd*0] = beta[0]*C[1+ldc*0] + alpha[0]*CC[1+bs*0];
1011 		D[2+ldd*0] = beta[0]*C[2+ldc*0] + alpha[0]*CC[2+bs*0];
1012 
1013 		if(n1==1)
1014 			return;
1015 
1016 		D[0+ldd*1] = beta[0]*C[0+ldc*1] + alpha[0]*CC[0+bs*1];
1017 		D[1+ldd*1] = beta[0]*C[1+ldc*1] + alpha[0]*CC[1+bs*1];
1018 		D[2+ldd*1] = beta[0]*C[2+ldc*1] + alpha[0]*CC[2+bs*1];
1019 
1020 		if(n1==2)
1021 			return;
1022 
1023 		D[0+ldd*2] = beta[0]*C[0+ldc*2] + alpha[0]*CC[0+bs*2];
1024 		D[1+ldd*2] = beta[0]*C[1+ldc*2] + alpha[0]*CC[1+bs*2];
1025 		D[2+ldd*2] = beta[0]*C[2+ldc*2] + alpha[0]*CC[2+bs*2];
1026 
1027 		if(n1==3)
1028 			return;
1029 
1030 		D[0+ldd*3] = beta[0]*C[0+ldc*3] + alpha[0]*CC[0+bs*3];
1031 		D[1+ldd*3] = beta[0]*C[1+ldc*3] + alpha[0]*CC[1+bs*3];
1032 		D[2+ldd*3] = beta[0]*C[2+ldc*3] + alpha[0]*CC[2+bs*3];
1033 		}
1034 	else if(m1>=2)
1035 		{
1036 		D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
1037 		D[1+ldd*0] = beta[0]*C[1+ldc*0] + alpha[0]*CC[1+bs*0];
1038 
1039 		if(n1==1)
1040 			return;
1041 
1042 		D[0+ldd*1] = beta[0]*C[0+ldc*1] + alpha[0]*CC[0+bs*1];
1043 		D[1+ldd*1] = beta[0]*C[1+ldc*1] + alpha[0]*CC[1+bs*1];
1044 
1045 		if(n1==2)
1046 			return;
1047 
1048 		D[0+ldd*2] = beta[0]*C[0+ldc*2] + alpha[0]*CC[0+bs*2];
1049 		D[1+ldd*2] = beta[0]*C[1+ldc*2] + alpha[0]*CC[1+bs*2];
1050 
1051 		if(n1==3)
1052 			return;
1053 
1054 		D[0+ldd*3] = beta[0]*C[0+ldc*3] + alpha[0]*CC[0+bs*3];
1055 		D[1+ldd*3] = beta[0]*C[1+ldc*3] + alpha[0]*CC[1+bs*3];
1056 		}
1057 	else //if(m1>=1)
1058 		{
1059 		D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
1060 
1061 		if(n1==1)
1062 			return;
1063 
1064 		D[0+ldd*1] = beta[0]*C[0+ldc*1] + alpha[0]*CC[0+bs*1];
1065 
1066 		if(n1==2)
1067 			return;
1068 
1069 		D[0+ldd*2] = beta[0]*C[0+ldc*2] + alpha[0]*CC[0+bs*2];
1070 
1071 		if(n1==3)
1072 			return;
1073 
1074 		D[0+ldd*3] = beta[0]*C[0+ldc*3] + alpha[0]*CC[0+bs*3];
1075 		}
1076 
1077 	return;
1078 
1079 	}
1080 #endif
1081 
1082 
1083 
1084 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
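// NN variant: computes D = beta[0]*C + alpha[0] * A * B, where A is 4 x kmax
// (leading dimension lda) and B is kmax x 4 (leading dimension ldb), both
// column-major, with the same unroll-by-4 structure as the NT kernel.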
1085 void kernel_dgemm_nn_4x4_libcccc(int kmax, double *alpha, double *A, int lda, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd)
1086 	{
1087 
1088 	const int bs = 4;
1089 
1090 	double
1091 		a_0, a_1, a_2, a_3,
1092 		b_0, b_1, b_2, b_3;
1093 
1094 #if defined(TARGET_GENERIC)
1095 	double CC[16] = {0};
1096 #else
1097 	ALIGNED( double CC[16], 64 ) = {0};
1098 #endif
1099 
1100 	int k;
1101 
1102 	for(k=0; k<kmax-3; k+=4)
1103 		{
1104 
1105 		// k = 0
1106 
1107 		a_0 = A[0+lda*0];
1108 		a_1 = A[1+lda*0];
1109 		a_2 = A[2+lda*0];
1110 		a_3 = A[3+lda*0];
1111 
1112 		b_0 = B[0+ldb*0];
1113 		b_1 = B[0+ldb*1];
1114 		b_2 = B[0+ldb*2];
1115 		b_3 = B[0+ldb*3];
1116 
1117 		CC[0+bs*0] += a_0 * b_0;
1118 		CC[1+bs*0] += a_1 * b_0;
1119 		CC[2+bs*0] += a_2 * b_0;
1120 		CC[3+bs*0] += a_3 * b_0;
1121 
1122 		CC[0+bs*1] += a_0 * b_1;
1123 		CC[1+bs*1] += a_1 * b_1;
1124 		CC[2+bs*1] += a_2 * b_1;
1125 		CC[3+bs*1] += a_3 * b_1;
1126 
1127 		CC[0+bs*2] += a_0 * b_2;
1128 		CC[1+bs*2] += a_1 * b_2;
1129 		CC[2+bs*2] += a_2 * b_2;
1130 		CC[3+bs*2] += a_3 * b_2;
1131 
1132 		CC[0+bs*3] += a_0 * b_3;
1133 		CC[1+bs*3] += a_1 * b_3;
1134 		CC[2+bs*3] += a_2 * b_3;
1135 		CC[3+bs*3] += a_3 * b_3;
1136 
1137 
1138 		// k = 1
1139 
1140 		a_0 = A[0+lda*1];
1141 		a_1 = A[1+lda*1];
1142 		a_2 = A[2+lda*1];
1143 		a_3 = A[3+lda*1];
1144 
1145 		b_0 = B[1+ldb*0];
1146 		b_1 = B[1+ldb*1];
1147 		b_2 = B[1+ldb*2];
1148 		b_3 = B[1+ldb*3];
1149 
1150 		CC[0+bs*0] += a_0 * b_0;
1151 		CC[1+bs*0] += a_1 * b_0;
1152 		CC[2+bs*0] += a_2 * b_0;
1153 		CC[3+bs*0] += a_3 * b_0;
1154 
1155 		CC[0+bs*1] += a_0 * b_1;
1156 		CC[1+bs*1] += a_1 * b_1;
1157 		CC[2+bs*1] += a_2 * b_1;
1158 		CC[3+bs*1] += a_3 * b_1;
1159 
1160 		CC[0+bs*2] += a_0 * b_2;
1161 		CC[1+bs*2] += a_1 * b_2;
1162 		CC[2+bs*2] += a_2 * b_2;
1163 		CC[3+bs*2] += a_3 * b_2;
1164 
1165 		CC[0+bs*3] += a_0 * b_3;
1166 		CC[1+bs*3] += a_1 * b_3;
1167 		CC[2+bs*3] += a_2 * b_3;
1168 		CC[3+bs*3] += a_3 * b_3;
1169 
1170 
1171 		// k = 2
1172 
1173 		a_0 = A[0+lda*2];
1174 		a_1 = A[1+lda*2];
1175 		a_2 = A[2+lda*2];
1176 		a_3 = A[3+lda*2];
1177 
1178 		b_0 = B[2+ldb*0];
1179 		b_1 = B[2+ldb*1];
1180 		b_2 = B[2+ldb*2];
1181 		b_3 = B[2+ldb*3];
1182 
1183 		CC[0+bs*0] += a_0 * b_0;
1184 		CC[1+bs*0] += a_1 * b_0;
1185 		CC[2+bs*0] += a_2 * b_0;
1186 		CC[3+bs*0] += a_3 * b_0;
1187 
1188 		CC[0+bs*1] += a_0 * b_1;
1189 		CC[1+bs*1] += a_1 * b_1;
1190 		CC[2+bs*1] += a_2 * b_1;
1191 		CC[3+bs*1] += a_3 * b_1;
1192 
1193 		CC[0+bs*2] += a_0 * b_2;
1194 		CC[1+bs*2] += a_1 * b_2;
1195 		CC[2+bs*2] += a_2 * b_2;
1196 		CC[3+bs*2] += a_3 * b_2;
1197 
1198 		CC[0+bs*3] += a_0 * b_3;
1199 		CC[1+bs*3] += a_1 * b_3;
1200 		CC[2+bs*3] += a_2 * b_3;
1201 		CC[3+bs*3] += a_3 * b_3;
1202 
1203 
1204 		// k = 3
1205 
1206 		a_0 = A[0+lda*3];
1207 		a_1 = A[1+lda*3];
1208 		a_2 = A[2+lda*3];
1209 		a_3 = A[3+lda*3];
1210 
1211 		b_0 = B[3+ldb*0];
1212 		b_1 = B[3+ldb*1];
1213 		b_2 = B[3+ldb*2];
1214 		b_3 = B[3+ldb*3];
1215 
1216 		CC[0+bs*0] += a_0 * b_0;
1217 		CC[1+bs*0] += a_1 * b_0;
1218 		CC[2+bs*0] += a_2 * b_0;
1219 		CC[3+bs*0] += a_3 * b_0;
1220 
1221 		CC[0+bs*1] += a_0 * b_1;
1222 		CC[1+bs*1] += a_1 * b_1;
1223 		CC[2+bs*1] += a_2 * b_1;
1224 		CC[3+bs*1] += a_3 * b_1;
1225 
1226 		CC[0+bs*2] += a_0 * b_2;
1227 		CC[1+bs*2] += a_1 * b_2;
1228 		CC[2+bs*2] += a_2 * b_2;
1229 		CC[3+bs*2] += a_3 * b_2;
1230 
1231 		CC[0+bs*3] += a_0 * b_3;
1232 		CC[1+bs*3] += a_1 * b_3;
1233 		CC[2+bs*3] += a_2 * b_3;
1234 		CC[3+bs*3] += a_3 * b_3;
1235 
1236 		A += 4*lda;
1237 		B += 4;
1238 
1239 		}
1240 
1241 	for(; k<kmax; k++)
1242 		{
1243 
1244 		// k = 0
1245 
1246 		a_0 = A[0+lda*0];
1247 		a_1 = A[1+lda*0];
1248 		a_2 = A[2+lda*0];
1249 		a_3 = A[3+lda*0];
1250 
1251 		b_0 = B[0+ldb*0];
1252 		b_1 = B[0+ldb*1];
1253 		b_2 = B[0+ldb*2];
1254 		b_3 = B[0+ldb*3];
1255 
1256 		CC[0+bs*0] += a_0 * b_0;
1257 		CC[1+bs*0] += a_1 * b_0;
1258 		CC[2+bs*0] += a_2 * b_0;
1259 		CC[3+bs*0] += a_3 * b_0;
1260 
1261 		CC[0+bs*1] += a_0 * b_1;
1262 		CC[1+bs*1] += a_1 * b_1;
1263 		CC[2+bs*1] += a_2 * b_1;
1264 		CC[3+bs*1] += a_3 * b_1;
1265 
1266 		CC[0+bs*2] += a_0 * b_2;
1267 		CC[1+bs*2] += a_1 * b_2;
1268 		CC[2+bs*2] += a_2 * b_2;
1269 		CC[3+bs*2] += a_3 * b_2;
1270 
1271 		CC[0+bs*3] += a_0 * b_3;
1272 		CC[1+bs*3] += a_1 * b_3;
1273 		CC[2+bs*3] += a_2 * b_3;
1274 		CC[3+bs*3] += a_3 * b_3;
1275 
1276 		A += 1*lda;
1277 		B += 1;
1278 
1279 		}
1280 
1281 	D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
1282 	D[1+ldd*0] = beta[0]*C[1+ldc*0] + alpha[0]*CC[1+bs*0];
1283 	D[2+ldd*0] = beta[0]*C[2+ldc*0] + alpha[0]*CC[2+bs*0];
1284 	D[3+ldd*0] = beta[0]*C[3+ldc*0] + alpha[0]*CC[3+bs*0];
1285 
1286 	D[0+ldd*1] = beta[0]*C[0+ldc*1] + alpha[0]*CC[0+bs*1];
1287 	D[1+ldd*1] = beta[0]*C[1+ldc*1] + alpha[0]*CC[1+bs*1];
1288 	D[2+ldd*1] = beta[0]*C[2+ldc*1] + alpha[0]*CC[2+bs*1];
1289 	D[3+ldd*1] = beta[0]*C[3+ldc*1] + alpha[0]*CC[3+bs*1];
1290 
1291 	D[0+ldd*2] = beta[0]*C[0+ldc*2] + alpha[0]*CC[0+bs*2];
1292 	D[1+ldd*2] = beta[0]*C[1+ldc*2] + alpha[0]*CC[1+bs*2];
1293 	D[2+ldd*2] = beta[0]*C[2+ldc*2] + alpha[0]*CC[2+bs*2];
1294 	D[3+ldd*2] = beta[0]*C[3+ldc*2] + alpha[0]*CC[3+bs*2];
1295 
1296 	D[0+ldd*3] = beta[0]*C[0+ldc*3] + alpha[0]*CC[0+bs*3];
1297 	D[1+ldd*3] = beta[0]*C[1+ldc*3] + alpha[0]*CC[1+bs*3];
1298 	D[2+ldd*3] = beta[0]*C[2+ldc*3] + alpha[0]*CC[2+bs*3];
1299 	D[3+ldd*3] = beta[0]*C[3+ldc*3] + alpha[0]*CC[3+bs*3];
1300 
1301 	return;
1302 
1303 	}
1304 #endif
1305 
1306 
1307 
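// Narrower static NN kernels (4x3, 4x2, 4x1), used by
// kernel_dgemm_nn_4x4_vs_libcccc for blocks with n1 < 4.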
1308 static void kernel_dgemm_nn_4x3_libcccc(int kmax, double *alpha, double *A, int lda, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd)
1309 	{
1310 
1311 	const int bs = 4;
1312 
1313 	double
1314 		a_0, a_1, a_2, a_3,
1315 		b_0, b_1, b_2;
1316 
1317 #if defined(TARGET_GENERIC)
1318 	double CC[16] = {0};
1319 #else
1320 	ALIGNED( double CC[16], 64 ) = {0};
1321 #endif
1322 
1323 	int k;
1324 
1325 	for(k=0; k<kmax-3; k+=4)
1326 		{
1327 
1328 		// k = 0
1329 
1330 		a_0 = A[0+lda*0];
1331 		a_1 = A[1+lda*0];
1332 		a_2 = A[2+lda*0];
1333 		a_3 = A[3+lda*0];
1334 
1335 		b_0 = B[0+ldb*0];
1336 		b_1 = B[0+ldb*1];
1337 		b_2 = B[0+ldb*2];
1338 
1339 		CC[0+bs*0] += a_0 * b_0;
1340 		CC[1+bs*0] += a_1 * b_0;
1341 		CC[2+bs*0] += a_2 * b_0;
1342 		CC[3+bs*0] += a_3 * b_0;
1343 
1344 		CC[0+bs*1] += a_0 * b_1;
1345 		CC[1+bs*1] += a_1 * b_1;
1346 		CC[2+bs*1] += a_2 * b_1;
1347 		CC[3+bs*1] += a_3 * b_1;
1348 
1349 		CC[0+bs*2] += a_0 * b_2;
1350 		CC[1+bs*2] += a_1 * b_2;
1351 		CC[2+bs*2] += a_2 * b_2;
1352 		CC[3+bs*2] += a_3 * b_2;
1353 
1354 
1355 		// k = 1
1356 
1357 		a_0 = A[0+lda*1];
1358 		a_1 = A[1+lda*1];
1359 		a_2 = A[2+lda*1];
1360 		a_3 = A[3+lda*1];
1361 
1362 		b_0 = B[1+ldb*0];
1363 		b_1 = B[1+ldb*1];
1364 		b_2 = B[1+ldb*2];
1365 
1366 		CC[0+bs*0] += a_0 * b_0;
1367 		CC[1+bs*0] += a_1 * b_0;
1368 		CC[2+bs*0] += a_2 * b_0;
1369 		CC[3+bs*0] += a_3 * b_0;
1370 
1371 		CC[0+bs*1] += a_0 * b_1;
1372 		CC[1+bs*1] += a_1 * b_1;
1373 		CC[2+bs*1] += a_2 * b_1;
1374 		CC[3+bs*1] += a_3 * b_1;
1375 
1376 		CC[0+bs*2] += a_0 * b_2;
1377 		CC[1+bs*2] += a_1 * b_2;
1378 		CC[2+bs*2] += a_2 * b_2;
1379 		CC[3+bs*2] += a_3 * b_2;
1380 
1381 
1382 		// k = 2
1383 
1384 		a_0 = A[0+lda*2];
1385 		a_1 = A[1+lda*2];
1386 		a_2 = A[2+lda*2];
1387 		a_3 = A[3+lda*2];
1388 
1389 		b_0 = B[2+ldb*0];
1390 		b_1 = B[2+ldb*1];
1391 		b_2 = B[2+ldb*2];
1392 
1393 		CC[0+bs*0] += a_0 * b_0;
1394 		CC[1+bs*0] += a_1 * b_0;
1395 		CC[2+bs*0] += a_2 * b_0;
1396 		CC[3+bs*0] += a_3 * b_0;
1397 
1398 		CC[0+bs*1] += a_0 * b_1;
1399 		CC[1+bs*1] += a_1 * b_1;
1400 		CC[2+bs*1] += a_2 * b_1;
1401 		CC[3+bs*1] += a_3 * b_1;
1402 
1403 		CC[0+bs*2] += a_0 * b_2;
1404 		CC[1+bs*2] += a_1 * b_2;
1405 		CC[2+bs*2] += a_2 * b_2;
1406 		CC[3+bs*2] += a_3 * b_2;
1407 
1408 
1409 		// k = 3
1410 
1411 		a_0 = A[0+lda*3];
1412 		a_1 = A[1+lda*3];
1413 		a_2 = A[2+lda*3];
1414 		a_3 = A[3+lda*3];
1415 
1416 		b_0 = B[3+ldb*0];
1417 		b_1 = B[3+ldb*1];
1418 		b_2 = B[3+ldb*2];
1419 
1420 		CC[0+bs*0] += a_0 * b_0;
1421 		CC[1+bs*0] += a_1 * b_0;
1422 		CC[2+bs*0] += a_2 * b_0;
1423 		CC[3+bs*0] += a_3 * b_0;
1424 
1425 		CC[0+bs*1] += a_0 * b_1;
1426 		CC[1+bs*1] += a_1 * b_1;
1427 		CC[2+bs*1] += a_2 * b_1;
1428 		CC[3+bs*1] += a_3 * b_1;
1429 
1430 		CC[0+bs*2] += a_0 * b_2;
1431 		CC[1+bs*2] += a_1 * b_2;
1432 		CC[2+bs*2] += a_2 * b_2;
1433 		CC[3+bs*2] += a_3 * b_2;
1434 
1435 		A += 4*lda;
1436 		B += 4;
1437 
1438 		}
1439 
1440 	for(; k<kmax; k++)
1441 		{
1442 
1443 		// k = 0
1444 
1445 		a_0 = A[0+lda*0];
1446 		a_1 = A[1+lda*0];
1447 		a_2 = A[2+lda*0];
1448 		a_3 = A[3+lda*0];
1449 
1450 		b_0 = B[0+ldb*0];
1451 		b_1 = B[0+ldb*1];
1452 		b_2 = B[0+ldb*2];
1453 
1454 		CC[0+bs*0] += a_0 * b_0;
1455 		CC[1+bs*0] += a_1 * b_0;
1456 		CC[2+bs*0] += a_2 * b_0;
1457 		CC[3+bs*0] += a_3 * b_0;
1458 
1459 		CC[0+bs*1] += a_0 * b_1;
1460 		CC[1+bs*1] += a_1 * b_1;
1461 		CC[2+bs*1] += a_2 * b_1;
1462 		CC[3+bs*1] += a_3 * b_1;
1463 
1464 		CC[0+bs*2] += a_0 * b_2;
1465 		CC[1+bs*2] += a_1 * b_2;
1466 		CC[2+bs*2] += a_2 * b_2;
1467 		CC[3+bs*2] += a_3 * b_2;
1468 
1469 		A += 1*lda;
1470 		B += 1;
1471 
1472 		}
1473 
1474 	D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
1475 	D[1+ldd*0] = beta[0]*C[1+ldc*0] + alpha[0]*CC[1+bs*0];
1476 	D[2+ldd*0] = beta[0]*C[2+ldc*0] + alpha[0]*CC[2+bs*0];
1477 	D[3+ldd*0] = beta[0]*C[3+ldc*0] + alpha[0]*CC[3+bs*0];
1478 
1479 	D[0+ldd*1] = beta[0]*C[0+ldc*1] + alpha[0]*CC[0+bs*1];
1480 	D[1+ldd*1] = beta[0]*C[1+ldc*1] + alpha[0]*CC[1+bs*1];
1481 	D[2+ldd*1] = beta[0]*C[2+ldc*1] + alpha[0]*CC[2+bs*1];
1482 	D[3+ldd*1] = beta[0]*C[3+ldc*1] + alpha[0]*CC[3+bs*1];
1483 
1484 	D[0+ldd*2] = beta[0]*C[0+ldc*2] + alpha[0]*CC[0+bs*2];
1485 	D[1+ldd*2] = beta[0]*C[1+ldc*2] + alpha[0]*CC[1+bs*2];
1486 	D[2+ldd*2] = beta[0]*C[2+ldc*2] + alpha[0]*CC[2+bs*2];
1487 	D[3+ldd*2] = beta[0]*C[3+ldc*2] + alpha[0]*CC[3+bs*2];
1488 
1489 	return;
1490 
1491 	}
1492 
1493 
1494 
1495 static void kernel_dgemm_nn_4x2_libcccc(int kmax, double *alpha, double *A, int lda, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd)
1496 	{
1497 
1498 	const int bs = 4;
1499 
1500 	double
1501 		a_0, a_1, a_2, a_3,
1502 		b_0, b_1;
1503 
1504 #if defined(TARGET_GENERIC)
1505 	double CC[16] = {0};
1506 #else
1507 	ALIGNED( double CC[16], 64 ) = {0};
1508 #endif
1509 
1510 	int k;
1511 
1512 	for(k=0; k<kmax-3; k+=4)
1513 		{
1514 
1515 		// k = 0
1516 
1517 		a_0 = A[0+lda*0];
1518 		a_1 = A[1+lda*0];
1519 		a_2 = A[2+lda*0];
1520 		a_3 = A[3+lda*0];
1521 
1522 		b_0 = B[0+ldb*0];
1523 		b_1 = B[0+ldb*1];
1524 
1525 		CC[0+bs*0] += a_0 * b_0;
1526 		CC[1+bs*0] += a_1 * b_0;
1527 		CC[2+bs*0] += a_2 * b_0;
1528 		CC[3+bs*0] += a_3 * b_0;
1529 
1530 		CC[0+bs*1] += a_0 * b_1;
1531 		CC[1+bs*1] += a_1 * b_1;
1532 		CC[2+bs*1] += a_2 * b_1;
1533 		CC[3+bs*1] += a_3 * b_1;
1534 
1535 
1536 		// k = 1
1537 
1538 		a_0 = A[0+lda*1];
1539 		a_1 = A[1+lda*1];
1540 		a_2 = A[2+lda*1];
1541 		a_3 = A[3+lda*1];
1542 
1543 		b_0 = B[1+ldb*0];
1544 		b_1 = B[1+ldb*1];
1545 
1546 		CC[0+bs*0] += a_0 * b_0;
1547 		CC[1+bs*0] += a_1 * b_0;
1548 		CC[2+bs*0] += a_2 * b_0;
1549 		CC[3+bs*0] += a_3 * b_0;
1550 
1551 		CC[0+bs*1] += a_0 * b_1;
1552 		CC[1+bs*1] += a_1 * b_1;
1553 		CC[2+bs*1] += a_2 * b_1;
1554 		CC[3+bs*1] += a_3 * b_1;
1555 
1556 
1557 		// k = 2
1558 
1559 		a_0 = A[0+lda*2];
1560 		a_1 = A[1+lda*2];
1561 		a_2 = A[2+lda*2];
1562 		a_3 = A[3+lda*2];
1563 
1564 		b_0 = B[2+ldb*0];
1565 		b_1 = B[2+ldb*1];
1566 
1567 		CC[0+bs*0] += a_0 * b_0;
1568 		CC[1+bs*0] += a_1 * b_0;
1569 		CC[2+bs*0] += a_2 * b_0;
1570 		CC[3+bs*0] += a_3 * b_0;
1571 
1572 		CC[0+bs*1] += a_0 * b_1;
1573 		CC[1+bs*1] += a_1 * b_1;
1574 		CC[2+bs*1] += a_2 * b_1;
1575 		CC[3+bs*1] += a_3 * b_1;
1576 
1577 
1578 		// k = 3
1579 
1580 		a_0 = A[0+lda*3];
1581 		a_1 = A[1+lda*3];
1582 		a_2 = A[2+lda*3];
1583 		a_3 = A[3+lda*3];
1584 
1585 		b_0 = B[3+ldb*0];
1586 		b_1 = B[3+ldb*1];
1587 
1588 		CC[0+bs*0] += a_0 * b_0;
1589 		CC[1+bs*0] += a_1 * b_0;
1590 		CC[2+bs*0] += a_2 * b_0;
1591 		CC[3+bs*0] += a_3 * b_0;
1592 
1593 		CC[0+bs*1] += a_0 * b_1;
1594 		CC[1+bs*1] += a_1 * b_1;
1595 		CC[2+bs*1] += a_2 * b_1;
1596 		CC[3+bs*1] += a_3 * b_1;
1597 
1598 		A += 4*lda;
1599 		B += 4;
1600 
1601 		}
1602 
1603 	for(; k<kmax; k++)
1604 		{
1605 
1606 		// k = 0
1607 
1608 		a_0 = A[0+lda*0];
1609 		a_1 = A[1+lda*0];
1610 		a_2 = A[2+lda*0];
1611 		a_3 = A[3+lda*0];
1612 
1613 		b_0 = B[0+ldb*0];
1614 		b_1 = B[0+ldb*1];
1615 
1616 		CC[0+bs*0] += a_0 * b_0;
1617 		CC[1+bs*0] += a_1 * b_0;
1618 		CC[2+bs*0] += a_2 * b_0;
1619 		CC[3+bs*0] += a_3 * b_0;
1620 
1621 		CC[0+bs*1] += a_0 * b_1;
1622 		CC[1+bs*1] += a_1 * b_1;
1623 		CC[2+bs*1] += a_2 * b_1;
1624 		CC[3+bs*1] += a_3 * b_1;
1625 
1626 		A += 1*lda;
1627 		B += 1;
1628 
1629 		}
1630 
1631 	D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
1632 	D[1+ldd*0] = beta[0]*C[1+ldc*0] + alpha[0]*CC[1+bs*0];
1633 	D[2+ldd*0] = beta[0]*C[2+ldc*0] + alpha[0]*CC[2+bs*0];
1634 	D[3+ldd*0] = beta[0]*C[3+ldc*0] + alpha[0]*CC[3+bs*0];
1635 
1636 	D[0+ldd*1] = beta[0]*C[0+ldc*1] + alpha[0]*CC[0+bs*1];
1637 	D[1+ldd*1] = beta[0]*C[1+ldc*1] + alpha[0]*CC[1+bs*1];
1638 	D[2+ldd*1] = beta[0]*C[2+ldc*1] + alpha[0]*CC[2+bs*1];
1639 	D[3+ldd*1] = beta[0]*C[3+ldc*1] + alpha[0]*CC[3+bs*1];
1640 
1641 	return;
1642 
1643 	}
1644 
1645 
1646 
1647 static void kernel_dgemm_nn_4x1_libcccc(int kmax, double *alpha, double *A, int lda, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd)
1648 	{
1649 
1650 	const int bs = 4;
1651 
1652 	double
1653 		a_0, a_1, a_2, a_3,
1654 		b_0;
1655 
1656 #if defined(TARGET_GENERIC)
1657 	double CC[16] = {0};
1658 #else
1659 	ALIGNED( double CC[16], 64 ) = {0};
1660 #endif
1661 
1662 	int k;
1663 
1664 	for(k=0; k<kmax-3; k+=4)
1665 		{
1666 
1667 		// k = 0
1668 
1669 		a_0 = A[0+lda*0];
1670 		a_1 = A[1+lda*0];
1671 		a_2 = A[2+lda*0];
1672 		a_3 = A[3+lda*0];
1673 
1674 		b_0 = B[0+ldb*0];
1675 
1676 		CC[0+bs*0] += a_0 * b_0;
1677 		CC[1+bs*0] += a_1 * b_0;
1678 		CC[2+bs*0] += a_2 * b_0;
1679 		CC[3+bs*0] += a_3 * b_0;
1680 
1681 
1682 		// k = 1
1683 
1684 		a_0 = A[0+lda*1];
1685 		a_1 = A[1+lda*1];
1686 		a_2 = A[2+lda*1];
1687 		a_3 = A[3+lda*1];
1688 
1689 		b_0 = B[1+ldb*0];
1690 
1691 		CC[0+bs*0] += a_0 * b_0;
1692 		CC[1+bs*0] += a_1 * b_0;
1693 		CC[2+bs*0] += a_2 * b_0;
1694 		CC[3+bs*0] += a_3 * b_0;
1695 
1696 
1697 		// k = 2
1698 
1699 		a_0 = A[0+lda*2];
1700 		a_1 = A[1+lda*2];
1701 		a_2 = A[2+lda*2];
1702 		a_3 = A[3+lda*2];
1703 
1704 		b_0 = B[2+ldb*0];
1705 
1706 		CC[0+bs*0] += a_0 * b_0;
1707 		CC[1+bs*0] += a_1 * b_0;
1708 		CC[2+bs*0] += a_2 * b_0;
1709 		CC[3+bs*0] += a_3 * b_0;
1710 
1711 
1712 		// k = 3
1713 
1714 		a_0 = A[0+lda*3];
1715 		a_1 = A[1+lda*3];
1716 		a_2 = A[2+lda*3];
1717 		a_3 = A[3+lda*3];
1718 
1719 		b_0 = B[3+ldb*0];
1720 
1721 		CC[0+bs*0] += a_0 * b_0;
1722 		CC[1+bs*0] += a_1 * b_0;
1723 		CC[2+bs*0] += a_2 * b_0;
1724 		CC[3+bs*0] += a_3 * b_0;
1725 
1726 		A += 4*lda;
1727 		B += 4;
1728 
1729 		}
1730 
1731 	for(; k<kmax; k++)
1732 		{
1733 
1734 		// k = 0
1735 
1736 		a_0 = A[0+lda*0];
1737 		a_1 = A[1+lda*0];
1738 		a_2 = A[2+lda*0];
1739 		a_3 = A[3+lda*0];
1740 
1741 		b_0 = B[0+ldb*0];
1742 
1743 		CC[0+bs*0] += a_0 * b_0;
1744 		CC[1+bs*0] += a_1 * b_0;
1745 		CC[2+bs*0] += a_2 * b_0;
1746 		CC[3+bs*0] += a_3 * b_0;
1747 
1748 		A += 1*lda;
1749 		B += 1;
1750 
1751 		}
1752 
1753 	D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
1754 	D[1+ldd*0] = beta[0]*C[1+ldc*0] + alpha[0]*CC[1+bs*0];
1755 	D[2+ldd*0] = beta[0]*C[2+ldc*0] + alpha[0]*CC[2+bs*0];
1756 	D[3+ldd*0] = beta[0]*C[3+ldc*0] + alpha[0]*CC[3+bs*0];
1757 
1758 	return;
1759 
1760 	}
1761 
1762 
1763 
1764 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
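// Variable-size NN variant: dispatches on n1 to the fixed-width kernels,
// accumulates into CC with alpha = 1, beta = 0, then scales and stores only
// the top-left m1 x n1 part of the block.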
1765 void kernel_dgemm_nn_4x4_vs_libcccc(int kmax, double *alpha, double *A, int lda, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1)
1766 	{
1767 
1768 	const int bs = 4;
1769 
1770 #if defined(TARGET_GENERIC)
1771 	double CC[16] = {0};
1772 #else
1773 	ALIGNED( double CC[16], 64 ) = {0};
1774 #endif
1775 
1776 	double alpha1 = 1.0;
1777 	double beta1 = 0.0;
1778 
1779 	if(n1<=1)
1780 		{
1781 		kernel_dgemm_nn_4x1_libcccc(kmax, &alpha1, A, lda, B, ldb, &beta1, CC, bs, CC, bs);
1782 		}
1783 	else if(n1==2)
1784 		{
1785 		kernel_dgemm_nn_4x2_libcccc(kmax, &alpha1, A, lda, B, ldb, &beta1, CC, bs, CC, bs);
1786 		}
1787 	else if(n1==3)
1788 		{
1789 		kernel_dgemm_nn_4x3_libcccc(kmax, &alpha1, A, lda, B, ldb, &beta1, CC, bs, CC, bs);
1790 		}
1791 	else //if(n1==4)
1792 		{
1793 		kernel_dgemm_nn_4x4_libcccc(kmax, &alpha1, A, lda, B, ldb, &beta1, CC, bs, CC, bs);
1794 		}
1795 
1796 	if(m1>=4)
1797 		{
1798 		D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
1799 		D[1+ldd*0] = beta[0]*C[1+ldc*0] + alpha[0]*CC[1+bs*0];
1800 		D[2+ldd*0] = beta[0]*C[2+ldc*0] + alpha[0]*CC[2+bs*0];
1801 		D[3+ldd*0] = beta[0]*C[3+ldc*0] + alpha[0]*CC[3+bs*0];
1802 
1803 		if(n1==1)
1804 			return;
1805 
1806 		D[0+ldd*1] = beta[0]*C[0+ldc*1] + alpha[0]*CC[0+bs*1];
1807 		D[1+ldd*1] = beta[0]*C[1+ldc*1] + alpha[0]*CC[1+bs*1];
1808 		D[2+ldd*1] = beta[0]*C[2+ldc*1] + alpha[0]*CC[2+bs*1];
1809 		D[3+ldd*1] = beta[0]*C[3+ldc*1] + alpha[0]*CC[3+bs*1];
1810 
1811 		if(n1==2)
1812 			return;
1813 
1814 		D[0+ldd*2] = beta[0]*C[0+ldc*2] + alpha[0]*CC[0+bs*2];
1815 		D[1+ldd*2] = beta[0]*C[1+ldc*2] + alpha[0]*CC[1+bs*2];
1816 		D[2+ldd*2] = beta[0]*C[2+ldc*2] + alpha[0]*CC[2+bs*2];
1817 		D[3+ldd*2] = beta[0]*C[3+ldc*2] + alpha[0]*CC[3+bs*2];
1818 
1819 		if(n1==3)
1820 			return;
1821 
1822 		D[0+ldd*3] = beta[0]*C[0+ldc*3] + alpha[0]*CC[0+bs*3];
1823 		D[1+ldd*3] = beta[0]*C[1+ldc*3] + alpha[0]*CC[1+bs*3];
1824 		D[2+ldd*3] = beta[0]*C[2+ldc*3] + alpha[0]*CC[2+bs*3];
1825 		D[3+ldd*3] = beta[0]*C[3+ldc*3] + alpha[0]*CC[3+bs*3];
1826 		}
1827 	else if(m1>=3)
1828 		{
1829 		D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
1830 		D[1+ldd*0] = beta[0]*C[1+ldc*0] + alpha[0]*CC[1+bs*0];
1831 		D[2+ldd*0] = beta[0]*C[2+ldc*0] + alpha[0]*CC[2+bs*0];
1832 
1833 		if(n1==1)
1834 			return;
1835 
1836 		D[0+ldd*1] = beta[0]*C[0+ldc*1] + alpha[0]*CC[0+bs*1];
1837 		D[1+ldd*1] = beta[0]*C[1+ldc*1] + alpha[0]*CC[1+bs*1];
1838 		D[2+ldd*1] = beta[0]*C[2+ldc*1] + alpha[0]*CC[2+bs*1];
1839 
1840 		if(n1==2)
1841 			return;
1842 
1843 		D[0+ldd*2] = beta[0]*C[0+ldc*2] + alpha[0]*CC[0+bs*2];
1844 		D[1+ldd*2] = beta[0]*C[1+ldc*2] + alpha[0]*CC[1+bs*2];
1845 		D[2+ldd*2] = beta[0]*C[2+ldc*2] + alpha[0]*CC[2+bs*2];
1846 
1847 		if(n1==3)
1848 			return;
1849 
1850 		D[0+ldd*3] = beta[0]*C[0+ldc*3] + alpha[0]*CC[0+bs*3];
1851 		D[1+ldd*3] = beta[0]*C[1+ldc*3] + alpha[0]*CC[1+bs*3];
1852 		D[2+ldd*3] = beta[0]*C[2+ldc*3] + alpha[0]*CC[2+bs*3];
1853 		}
1854 	else if(m1>=2)
1855 		{
1856 		D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
1857 		D[1+ldd*0] = beta[0]*C[1+ldc*0] + alpha[0]*CC[1+bs*0];
1858 
1859 		if(n1==1)
1860 			return;
1861 
1862 		D[0+ldd*1] = beta[0]*C[0+ldc*1] + alpha[0]*CC[0+bs*1];
1863 		D[1+ldd*1] = beta[0]*C[1+ldc*1] + alpha[0]*CC[1+bs*1];
1864 
1865 		if(n1==2)
1866 			return;
1867 
1868 		D[0+ldd*2] = beta[0]*C[0+ldc*2] + alpha[0]*CC[0+bs*2];
1869 		D[1+ldd*2] = beta[0]*C[1+ldc*2] + alpha[0]*CC[1+bs*2];
1870 
1871 		if(n1==3)
1872 			return;
1873 
1874 		D[0+ldd*3] = beta[0]*C[0+ldc*3] + alpha[0]*CC[0+bs*3];
1875 		D[1+ldd*3] = beta[0]*C[1+ldc*3] + alpha[0]*CC[1+bs*3];
1876 		}
1877 	else //if(m1>=1)
1878 		{
1879 		D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
1880 
1881 		if(n1==1)
1882 			return;
1883 
1884 		D[0+ldd*1] = beta[0]*C[0+ldc*1] + alpha[0]*CC[0+bs*1];
1885 
1886 		if(n1==2)
1887 			return;
1888 
1889 		D[0+ldd*2] = beta[0]*C[0+ldc*2] + alpha[0]*CC[0+bs*2];
1890 
1891 		if(n1==3)
1892 			return;
1893 
1894 		D[0+ldd*3] = beta[0]*C[0+ldc*3] + alpha[0]*CC[0+bs*3];
1895 		}
1896 
1897 	return;
1898 
1899 	}
1900 #endif
1901 
1902 
1903 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9)
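// Wrappers: A is stored as a packed 4-row panel (lda = 4); B, C and D are
// plain column-major. The _vs_ version below handles edge blocks.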
1904 void kernel_dgemm_nn_4x4_lib4ccc(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd)
1905 	{
1906 
1907 	kernel_dgemm_nn_4x4_libcccc(kmax, alpha, A, 4, B, ldb, beta, C, ldc, D, ldd);
1908 
1909 	return;
1910 
1911 	}
1912 #endif
1913 
1914 
1915 
1916 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9)
1917 void kernel_dgemm_nn_4x4_vs_lib4ccc(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1)
1918 	{
1919 
1920 	kernel_dgemm_nn_4x4_vs_libcccc(kmax, alpha, A, 4, B, ldb, beta, C, ldc, D, ldd, m1, n1);
1921 
1922 	return;
1923 
1924 	}
1925 #endif
1926 
1927 
1928 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
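// TT variant: computes D = beta[0]*C + alpha[0] * A^T * B^T, with A of size
// kmax x 4 (leading dimension lda) and B of size 4 x kmax (leading dimension
// ldb). It calls the NN kernel with the operands swapped, which leaves
// B*A = (A^T*B^T)^T in CC, then transposes the 4x4 accumulator in place
// before the alpha/beta update.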
1929 void kernel_dgemm_tt_4x4_libcccc(int kmax, double *alpha, double *A, int lda, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd)
1930 	{
1931 
1932 	const int bs = 4;
1933 
1934 #if defined(TARGET_GENERIC)
1935 	double CC[16] = {0};
1936 #else
1937 	ALIGNED( double CC[16], 64 ) = {0};
1938 #endif
1939 
1940 	double alpha1 = 1.0;
1941 	double beta1 = 0.0;
1942 
1943 	kernel_dgemm_nn_4x4_libcccc(kmax, &alpha1, B, ldb, A, lda, &beta1, CC, bs, CC, bs);
1944 
1945 	double tmp;
1946 	tmp = CC[1+bs*0]; CC[1+bs*0] = CC[0+bs*1]; CC[0+bs*1] = tmp;
1947 	tmp = CC[2+bs*0]; CC[2+bs*0] = CC[0+bs*2]; CC[0+bs*2] = tmp;
1948 	tmp = CC[3+bs*0]; CC[3+bs*0] = CC[0+bs*3]; CC[0+bs*3] = tmp;
1949 	tmp = CC[2+bs*1]; CC[2+bs*1] = CC[1+bs*2]; CC[1+bs*2] = tmp;
1950 	tmp = CC[3+bs*1]; CC[3+bs*1] = CC[1+bs*3]; CC[1+bs*3] = tmp;
1951 	tmp = CC[3+bs*2]; CC[3+bs*2] = CC[2+bs*3]; CC[2+bs*3] = tmp;
1952 
1953 	D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
1954 	D[1+ldd*0] = beta[0]*C[1+ldc*0] + alpha[0]*CC[1+bs*0];
1955 	D[2+ldd*0] = beta[0]*C[2+ldc*0] + alpha[0]*CC[2+bs*0];
1956 	D[3+ldd*0] = beta[0]*C[3+ldc*0] + alpha[0]*CC[3+bs*0];
1957 
1958 	D[0+ldd*1] = beta[0]*C[0+ldc*1] + alpha[0]*CC[0+bs*1];
1959 	D[1+ldd*1] = beta[0]*C[1+ldc*1] + alpha[0]*CC[1+bs*1];
1960 	D[2+ldd*1] = beta[0]*C[2+ldc*1] + alpha[0]*CC[2+bs*1];
1961 	D[3+ldd*1] = beta[0]*C[3+ldc*1] + alpha[0]*CC[3+bs*1];
1962 
1963 	D[0+ldd*2] = beta[0]*C[0+ldc*2] + alpha[0]*CC[0+bs*2];
1964 	D[1+ldd*2] = beta[0]*C[1+ldc*2] + alpha[0]*CC[1+bs*2];
1965 	D[2+ldd*2] = beta[0]*C[2+ldc*2] + alpha[0]*CC[2+bs*2];
1966 	D[3+ldd*2] = beta[0]*C[3+ldc*2] + alpha[0]*CC[3+bs*2];
1967 
1968 	D[0+ldd*3] = beta[0]*C[0+ldc*3] + alpha[0]*CC[0+bs*3];
1969 	D[1+ldd*3] = beta[0]*C[1+ldc*3] + alpha[0]*CC[1+bs*3];
1970 	D[2+ldd*3] = beta[0]*C[2+ldc*3] + alpha[0]*CC[2+bs*3];
1971 	D[3+ldd*3] = beta[0]*C[3+ldc*3] + alpha[0]*CC[3+bs*3];
1972 
1973 	return;
1974 
1975 	}
1976 #endif
1977 
1978 
1979 
1980 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
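// variable-size variant of the tt kernel above: same (B*A)^T trick, storing only the
// top-left m1 x n1 part of the 4x4 block.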
1981 void kernel_dgemm_tt_4x4_vs_libcccc(int kmax, double *alpha, double *A, int lda, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1)
1982 	{
1983 
1984 	const int bs = 4;
1985 
1986 #if defined(TARGET_GENERIC)
1987 	double CC[16] = {0};
1988 #else
1989 	ALIGNED( double CC[16], 64 ) = {0};
1990 #endif
1991 
1992 	double alpha1 = 1.0;
1993 	double beta1 = 0.0;
1994 
1995 	kernel_dgemm_nn_4x4_libcccc(kmax, &alpha1, B, ldb, A, lda, &beta1, CC, bs, CC, bs);
1996 
1997 	double tmp;
1998 	tmp = CC[1+bs*0]; CC[1+bs*0] = CC[0+bs*1]; CC[0+bs*1] = tmp;
1999 	tmp = CC[2+bs*0]; CC[2+bs*0] = CC[0+bs*2]; CC[0+bs*2] = tmp;
2000 	tmp = CC[3+bs*0]; CC[3+bs*0] = CC[0+bs*3]; CC[0+bs*3] = tmp;
2001 	tmp = CC[2+bs*1]; CC[2+bs*1] = CC[1+bs*2]; CC[1+bs*2] = tmp;
2002 	tmp = CC[3+bs*1]; CC[3+bs*1] = CC[1+bs*3]; CC[1+bs*3] = tmp;
2003 	tmp = CC[3+bs*2]; CC[3+bs*2] = CC[2+bs*3]; CC[2+bs*3] = tmp;
2004 
2005 	if(m1>=4)
2006 		{
2007 		D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
2008 		D[1+ldd*0] = beta[0]*C[1+ldc*0] + alpha[0]*CC[1+bs*0];
2009 		D[2+ldd*0] = beta[0]*C[2+ldc*0] + alpha[0]*CC[2+bs*0];
2010 		D[3+ldd*0] = beta[0]*C[3+ldc*0] + alpha[0]*CC[3+bs*0];
2011 
2012 		if(n1==1)
2013 			return;
2014 
2015 		D[0+ldd*1] = beta[0]*C[0+ldc*1] + alpha[0]*CC[0+bs*1];
2016 		D[1+ldd*1] = beta[0]*C[1+ldc*1] + alpha[0]*CC[1+bs*1];
2017 		D[2+ldd*1] = beta[0]*C[2+ldc*1] + alpha[0]*CC[2+bs*1];
2018 		D[3+ldd*1] = beta[0]*C[3+ldc*1] + alpha[0]*CC[3+bs*1];
2019 
2020 		if(n1==2)
2021 			return;
2022 
2023 		D[0+ldd*2] = beta[0]*C[0+ldc*2] + alpha[0]*CC[0+bs*2];
2024 		D[1+ldd*2] = beta[0]*C[1+ldc*2] + alpha[0]*CC[1+bs*2];
2025 		D[2+ldd*2] = beta[0]*C[2+ldc*2] + alpha[0]*CC[2+bs*2];
2026 		D[3+ldd*2] = beta[0]*C[3+ldc*2] + alpha[0]*CC[3+bs*2];
2027 
2028 		if(n1==3)
2029 			return;
2030 
2031 		D[0+ldd*3] = beta[0]*C[0+ldc*3] + alpha[0]*CC[0+bs*3];
2032 		D[1+ldd*3] = beta[0]*C[1+ldc*3] + alpha[0]*CC[1+bs*3];
2033 		D[2+ldd*3] = beta[0]*C[2+ldc*3] + alpha[0]*CC[2+bs*3];
2034 		D[3+ldd*3] = beta[0]*C[3+ldc*3] + alpha[0]*CC[3+bs*3];
2035 		}
2036 	else if(m1>=3)
2037 		{
2038 		D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
2039 		D[1+ldd*0] = beta[0]*C[1+ldc*0] + alpha[0]*CC[1+bs*0];
2040 		D[2+ldd*0] = beta[0]*C[2+ldc*0] + alpha[0]*CC[2+bs*0];
2041 
2042 		if(n1==1)
2043 			return;
2044 
2045 		D[0+ldd*1] = beta[0]*C[0+ldc*1] + alpha[0]*CC[0+bs*1];
2046 		D[1+ldd*1] = beta[0]*C[1+ldc*1] + alpha[0]*CC[1+bs*1];
2047 		D[2+ldd*1] = beta[0]*C[2+ldc*1] + alpha[0]*CC[2+bs*1];
2048 
2049 		if(n1==2)
2050 			return;
2051 
2052 		D[0+ldd*2] = beta[0]*C[0+ldc*2] + alpha[0]*CC[0+bs*2];
2053 		D[1+ldd*2] = beta[0]*C[1+ldc*2] + alpha[0]*CC[1+bs*2];
2054 		D[2+ldd*2] = beta[0]*C[2+ldc*2] + alpha[0]*CC[2+bs*2];
2055 
2056 		if(n1==3)
2057 			return;
2058 
2059 		D[0+ldd*3] = beta[0]*C[0+ldc*3] + alpha[0]*CC[0+bs*3];
2060 		D[1+ldd*3] = beta[0]*C[1+ldc*3] + alpha[0]*CC[1+bs*3];
2061 		D[2+ldd*3] = beta[0]*C[2+ldc*3] + alpha[0]*CC[2+bs*3];
2062 		}
2063 	else if(m1>=2)
2064 		{
2065 		D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
2066 		D[1+ldd*0] = beta[0]*C[1+ldc*0] + alpha[0]*CC[1+bs*0];
2067 
2068 		if(n1==1)
2069 			return;
2070 
2071 		D[0+ldd*1] = beta[0]*C[0+ldc*1] + alpha[0]*CC[0+bs*1];
2072 		D[1+ldd*1] = beta[0]*C[1+ldc*1] + alpha[0]*CC[1+bs*1];
2073 
2074 		if(n1==2)
2075 			return;
2076 
2077 		D[0+ldd*2] = beta[0]*C[0+ldc*2] + alpha[0]*CC[0+bs*2];
2078 		D[1+ldd*2] = beta[0]*C[1+ldc*2] + alpha[0]*CC[1+bs*2];
2079 
2080 		if(n1==3)
2081 			return;
2082 
2083 		D[0+ldd*3] = beta[0]*C[0+ldc*3] + alpha[0]*CC[0+bs*3];
2084 		D[1+ldd*3] = beta[0]*C[1+ldc*3] + alpha[0]*CC[1+bs*3];
2085 		}
2086 	else //if(m1>=1)
2087 		{
2088 		D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
2089 
2090 		if(n1==1)
2091 			return;
2092 
2093 		D[0+ldd*1] = beta[0]*C[0+ldc*1] + alpha[0]*CC[0+bs*1];
2094 
2095 		if(n1==2)
2096 			return;
2097 
2098 		D[0+ldd*2] = beta[0]*C[0+ldc*2] + alpha[0]*CC[0+bs*2];
2099 
2100 		if(n1==3)
2101 			return;
2102 
2103 		D[0+ldd*3] = beta[0]*C[0+ldc*3] + alpha[0]*CC[0+bs*3];
2104 		}
2105 
2106 	return;
2107 
2108 	}
2109 #endif
2110 
2111 
2112 
2113 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9)
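// D = beta*C + alpha*A^T*B^T with B panel-major (lib4) and A, C, D column-major;
// B*A is formed by the lib4ccc nn kernel and transposed in place before storing.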
2114 void kernel_dgemm_tt_4x4_libc4cc(int kmax, double *alpha, double *A, int lda, double *B, double *beta, double *C, int ldc, double *D, int ldd)
2115 	{
2116 
2117 	const int bs = 4;
2118 
2119 #if defined(TARGET_GENERIC)
2120 	double CC[16] = {0};
2121 #else
2122 	ALIGNED( double CC[16], 64 ) = {0};
2123 #endif
2124 
2125 	double alpha1 = 1.0;
2126 	double beta1 = 0.0;
2127 
2128 	kernel_dgemm_nn_4x4_lib4ccc(kmax, &alpha1, B, A, lda, &beta1, CC, bs, CC, bs);
2129 
2130 	double tmp;
2131 	tmp = CC[1+bs*0]; CC[1+bs*0] = CC[0+bs*1]; CC[0+bs*1] = tmp;
2132 	tmp = CC[2+bs*0]; CC[2+bs*0] = CC[0+bs*2]; CC[0+bs*2] = tmp;
2133 	tmp = CC[3+bs*0]; CC[3+bs*0] = CC[0+bs*3]; CC[0+bs*3] = tmp;
2134 	tmp = CC[2+bs*1]; CC[2+bs*1] = CC[1+bs*2]; CC[1+bs*2] = tmp;
2135 	tmp = CC[3+bs*1]; CC[3+bs*1] = CC[1+bs*3]; CC[1+bs*3] = tmp;
2136 	tmp = CC[3+bs*2]; CC[3+bs*2] = CC[2+bs*3]; CC[2+bs*3] = tmp;
2137 
2138 	D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
2139 	D[1+ldd*0] = beta[0]*C[1+ldc*0] + alpha[0]*CC[1+bs*0];
2140 	D[2+ldd*0] = beta[0]*C[2+ldc*0] + alpha[0]*CC[2+bs*0];
2141 	D[3+ldd*0] = beta[0]*C[3+ldc*0] + alpha[0]*CC[3+bs*0];
2142 
2143 	D[0+ldd*1] = beta[0]*C[0+ldc*1] + alpha[0]*CC[0+bs*1];
2144 	D[1+ldd*1] = beta[0]*C[1+ldc*1] + alpha[0]*CC[1+bs*1];
2145 	D[2+ldd*1] = beta[0]*C[2+ldc*1] + alpha[0]*CC[2+bs*1];
2146 	D[3+ldd*1] = beta[0]*C[3+ldc*1] + alpha[0]*CC[3+bs*1];
2147 
2148 	D[0+ldd*2] = beta[0]*C[0+ldc*2] + alpha[0]*CC[0+bs*2];
2149 	D[1+ldd*2] = beta[0]*C[1+ldc*2] + alpha[0]*CC[1+bs*2];
2150 	D[2+ldd*2] = beta[0]*C[2+ldc*2] + alpha[0]*CC[2+bs*2];
2151 	D[3+ldd*2] = beta[0]*C[3+ldc*2] + alpha[0]*CC[3+bs*2];
2152 
2153 	D[0+ldd*3] = beta[0]*C[0+ldc*3] + alpha[0]*CC[0+bs*3];
2154 	D[1+ldd*3] = beta[0]*C[1+ldc*3] + alpha[0]*CC[1+bs*3];
2155 	D[2+ldd*3] = beta[0]*C[2+ldc*3] + alpha[0]*CC[2+bs*3];
2156 	D[3+ldd*3] = beta[0]*C[3+ldc*3] + alpha[0]*CC[3+bs*3];
2157 
2158 	return;
2159 
2160 	}
2161 #endif
2162 
2163 
2164 
2165 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9)
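// variable-size variant of the kernel above.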
2166 void kernel_dgemm_tt_4x4_vs_libc4cc(int kmax, double *alpha, double *A, int lda, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1)
2167 	{
2168 
2169 	const int bs = 4;
2170 
2171 #if defined(TARGET_GENERIC)
2172 	double CC[16] = {0};
2173 #else
2174 	ALIGNED( double CC[16], 64 ) = {0};
2175 #endif
2176 
2177 	double alpha1 = 1.0;
2178 	double beta1 = 0.0;
2179 
2180 	kernel_dgemm_nn_4x4_lib4ccc(kmax, &alpha1, B, A, lda, &beta1, CC, bs, CC, bs);
2181 
2182 	double tmp;
2183 	tmp = CC[1+bs*0]; CC[1+bs*0] = CC[0+bs*1]; CC[0+bs*1] = tmp;
2184 	tmp = CC[2+bs*0]; CC[2+bs*0] = CC[0+bs*2]; CC[0+bs*2] = tmp;
2185 	tmp = CC[3+bs*0]; CC[3+bs*0] = CC[0+bs*3]; CC[0+bs*3] = tmp;
2186 	tmp = CC[2+bs*1]; CC[2+bs*1] = CC[1+bs*2]; CC[1+bs*2] = tmp;
2187 	tmp = CC[3+bs*1]; CC[3+bs*1] = CC[1+bs*3]; CC[1+bs*3] = tmp;
2188 	tmp = CC[3+bs*2]; CC[3+bs*2] = CC[2+bs*3]; CC[2+bs*3] = tmp;
2189 
2190 	if(m1>=4)
2191 		{
2192 		D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
2193 		D[1+ldd*0] = beta[0]*C[1+ldc*0] + alpha[0]*CC[1+bs*0];
2194 		D[2+ldd*0] = beta[0]*C[2+ldc*0] + alpha[0]*CC[2+bs*0];
2195 		D[3+ldd*0] = beta[0]*C[3+ldc*0] + alpha[0]*CC[3+bs*0];
2196 
2197 		if(n1==1)
2198 			return;
2199 
2200 		D[0+ldd*1] = beta[0]*C[0+ldc*1] + alpha[0]*CC[0+bs*1];
2201 		D[1+ldd*1] = beta[0]*C[1+ldc*1] + alpha[0]*CC[1+bs*1];
2202 		D[2+ldd*1] = beta[0]*C[2+ldc*1] + alpha[0]*CC[2+bs*1];
2203 		D[3+ldd*1] = beta[0]*C[3+ldc*1] + alpha[0]*CC[3+bs*1];
2204 
2205 		if(n1==2)
2206 			return;
2207 
2208 		D[0+ldd*2] = beta[0]*C[0+ldc*2] + alpha[0]*CC[0+bs*2];
2209 		D[1+ldd*2] = beta[0]*C[1+ldc*2] + alpha[0]*CC[1+bs*2];
2210 		D[2+ldd*2] = beta[0]*C[2+ldc*2] + alpha[0]*CC[2+bs*2];
2211 		D[3+ldd*2] = beta[0]*C[3+ldc*2] + alpha[0]*CC[3+bs*2];
2212 
2213 		if(n1==3)
2214 			return;
2215 
2216 		D[0+ldd*3] = beta[0]*C[0+ldc*3] + alpha[0]*CC[0+bs*3];
2217 		D[1+ldd*3] = beta[0]*C[1+ldc*3] + alpha[0]*CC[1+bs*3];
2218 		D[2+ldd*3] = beta[0]*C[2+ldc*3] + alpha[0]*CC[2+bs*3];
2219 		D[3+ldd*3] = beta[0]*C[3+ldc*3] + alpha[0]*CC[3+bs*3];
2220 		}
2221 	else if(m1>=3)
2222 		{
2223 		D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
2224 		D[1+ldd*0] = beta[0]*C[1+ldc*0] + alpha[0]*CC[1+bs*0];
2225 		D[2+ldd*0] = beta[0]*C[2+ldc*0] + alpha[0]*CC[2+bs*0];
2226 
2227 		if(n1==1)
2228 			return;
2229 
2230 		D[0+ldd*1] = beta[0]*C[0+ldc*1] + alpha[0]*CC[0+bs*1];
2231 		D[1+ldd*1] = beta[0]*C[1+ldc*1] + alpha[0]*CC[1+bs*1];
2232 		D[2+ldd*1] = beta[0]*C[2+ldc*1] + alpha[0]*CC[2+bs*1];
2233 
2234 		if(n1==2)
2235 			return;
2236 
2237 		D[0+ldd*2] = beta[0]*C[0+ldc*2] + alpha[0]*CC[0+bs*2];
2238 		D[1+ldd*2] = beta[0]*C[1+ldc*2] + alpha[0]*CC[1+bs*2];
2239 		D[2+ldd*2] = beta[0]*C[2+ldc*2] + alpha[0]*CC[2+bs*2];
2240 
2241 		if(n1==3)
2242 			return;
2243 
2244 		D[0+ldd*3] = beta[0]*C[0+ldc*3] + alpha[0]*CC[0+bs*3];
2245 		D[1+ldd*3] = beta[0]*C[1+ldc*3] + alpha[0]*CC[1+bs*3];
2246 		D[2+ldd*3] = beta[0]*C[2+ldc*3] + alpha[0]*CC[2+bs*3];
2247 		}
2248 	else if(m1>=2)
2249 		{
2250 		D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
2251 		D[1+ldd*0] = beta[0]*C[1+ldc*0] + alpha[0]*CC[1+bs*0];
2252 
2253 		if(n1==1)
2254 			return;
2255 
2256 		D[0+ldd*1] = beta[0]*C[0+ldc*1] + alpha[0]*CC[0+bs*1];
2257 		D[1+ldd*1] = beta[0]*C[1+ldc*1] + alpha[0]*CC[1+bs*1];
2258 
2259 		if(n1==2)
2260 			return;
2261 
2262 		D[0+ldd*2] = beta[0]*C[0+ldc*2] + alpha[0]*CC[0+bs*2];
2263 		D[1+ldd*2] = beta[0]*C[1+ldc*2] + alpha[0]*CC[1+bs*2];
2264 
2265 		if(n1==3)
2266 			return;
2267 
2268 		D[0+ldd*3] = beta[0]*C[0+ldc*3] + alpha[0]*CC[0+bs*3];
2269 		D[1+ldd*3] = beta[0]*C[1+ldc*3] + alpha[0]*CC[1+bs*3];
2270 		}
2271 	else //if(m1>=1)
2272 		{
2273 		D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
2274 
2275 		if(n1==1)
2276 			return;
2277 
2278 		D[0+ldd*1] = beta[0]*C[0+ldc*1] + alpha[0]*CC[0+bs*1];
2279 
2280 		if(n1==2)
2281 			return;
2282 
2283 		D[0+ldd*2] = beta[0]*C[0+ldc*2] + alpha[0]*CC[0+bs*2];
2284 
2285 		if(n1==3)
2286 			return;
2287 
2288 		D[0+ldd*3] = beta[0]*C[0+ldc*3] + alpha[0]*CC[0+bs*3];
2289 		}
2290 
2291 	return;
2292 
2293 	}
2294 #endif
2295 
2296 
2297 
2298 #if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9)
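// lower-triangular dsyrk: D = beta*C + alpha*A*B^T with A and B panel-major and C, D
// column-major; the full 4x4 product is accumulated in CC, but only the lower triangle
// of the block is scaled and stored.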
2299 void kernel_dsyrk_nt_l_4x4_lib44cc(int kmax, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd)
2300 	{
2301 
2302 	const int bs = 4;
2303 
2304 #if defined(TARGET_GENERIC)
2305 	double CC[16] = {0};
2306 #else
2307 	ALIGNED( double CC[16], 64 ) = {0};
2308 #endif
2309 
2310 	double alpha1 = 1.0;
2311 	double beta1 = 0.0;
2312 
2313 	kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, &beta1, CC, CC);
2314 
2315 	D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
2316 	D[1+ldd*0] = beta[0]*C[1+ldc*0] + alpha[0]*CC[1+bs*0];
2317 	D[2+ldd*0] = beta[0]*C[2+ldc*0] + alpha[0]*CC[2+bs*0];
2318 	D[3+ldd*0] = beta[0]*C[3+ldc*0] + alpha[0]*CC[3+bs*0];
2319 
2320 	D[1+ldd*1] = beta[0]*C[1+ldc*1] + alpha[0]*CC[1+bs*1];
2321 	D[2+ldd*1] = beta[0]*C[2+ldc*1] + alpha[0]*CC[2+bs*1];
2322 	D[3+ldd*1] = beta[0]*C[3+ldc*1] + alpha[0]*CC[3+bs*1];
2323 
2324 	D[2+ldd*2] = beta[0]*C[2+ldc*2] + alpha[0]*CC[2+bs*2];
2325 	D[3+ldd*2] = beta[0]*C[3+ldc*2] + alpha[0]*CC[3+bs*2];
2326 
2327 	D[3+ldd*3] = beta[0]*C[3+ldc*3] + alpha[0]*CC[3+bs*3];
2328 
2329 	return;
2330 
2331 	}
2332 #endif
2333 
2334 
2335 
2336 #if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_AMD_BULLDOZER)  || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9)
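// variable-size lower-triangular dsyrk: stores only the lower-triangular entries of the
// top-left m1 x n1 part of the block.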
2337 void kernel_dsyrk_nt_l_4x4_vs_lib44cc(int kmax, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1)
2338 	{
2339 
2340 	const int bs = 4;
2341 
2342 #if defined(TARGET_GENERIC)
2343 	double CC[16] = {0};
2344 #else
2345 	ALIGNED( double CC[16], 64 ) = {0};
2346 #endif
2347 
2348 	double alpha1 = 1.0;
2349 	double beta1 = 0.0;
2350 
2351 	kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, &beta1, CC, CC);
2352 
2353 	if(m1>=4)
2354 		{
2355 		D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
2356 		D[1+ldd*0] = beta[0]*C[1+ldc*0] + alpha[0]*CC[1+bs*0];
2357 		D[2+ldd*0] = beta[0]*C[2+ldc*0] + alpha[0]*CC[2+bs*0];
2358 		D[3+ldd*0] = beta[0]*C[3+ldc*0] + alpha[0]*CC[3+bs*0];
2359 
2360 		if(n1==1)
2361 			return;
2362 
2363 		D[1+ldd*1] = beta[0]*C[1+ldc*1] + alpha[0]*CC[1+bs*1];
2364 		D[2+ldd*1] = beta[0]*C[2+ldc*1] + alpha[0]*CC[2+bs*1];
2365 		D[3+ldd*1] = beta[0]*C[3+ldc*1] + alpha[0]*CC[3+bs*1];
2366 
2367 		if(n1==2)
2368 			return;
2369 
2370 		D[2+ldd*2] = beta[0]*C[2+ldc*2] + alpha[0]*CC[2+bs*2];
2371 		D[3+ldd*2] = beta[0]*C[3+ldc*2] + alpha[0]*CC[3+bs*2];
2372 
2373 		if(n1==3)
2374 			return;
2375 
2376 		D[3+ldd*3] = beta[0]*C[3+ldc*3] + alpha[0]*CC[3+bs*3];
2377 		}
2378 	else if(m1>=3)
2379 		{
2380 		D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
2381 		D[1+ldd*0] = beta[0]*C[1+ldc*0] + alpha[0]*CC[1+bs*0];
2382 		D[2+ldd*0] = beta[0]*C[2+ldc*0] + alpha[0]*CC[2+bs*0];
2383 
2384 		if(n1==1)
2385 			return;
2386 
2387 		D[1+ldd*1] = beta[0]*C[1+ldc*1] + alpha[0]*CC[1+bs*1];
2388 		D[2+ldd*1] = beta[0]*C[2+ldc*1] + alpha[0]*CC[2+bs*1];
2389 
2390 		if(n1==2)
2391 			return;
2392 
2393 		D[2+ldd*2] = beta[0]*C[2+ldc*2] + alpha[0]*CC[2+bs*2];
2394 		}
2395 	else if(m1>=2)
2396 		{
2397 		D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
2398 		D[1+ldd*0] = beta[0]*C[1+ldc*0] + alpha[0]*CC[1+bs*0];
2399 
2400 		if(n1==1)
2401 			return;
2402 
2403 		D[1+ldd*1] = beta[0]*C[1+ldc*1] + alpha[0]*CC[1+bs*1];
2404 		}
2405 	else //if(m1>=1)
2406 		{
2407 		D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
2408 		}
2409 
2410 	return;
2411 
2412 	}
2413 #endif
2414 
2415 
2416 
2417 #if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9)
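// upper-triangular dsyrk: same accumulation as the lower kernel, but only the upper
// triangle of the 4x4 block is scaled and stored (the commented-out lines mark the
// skipped strictly-lower entries).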
2418 void kernel_dsyrk_nt_u_4x4_lib44cc(int kmax, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd)
2419 	{
2420 
2421 	const int bs = 4;
2422 
2423 #if defined(TARGET_GENERIC)
2424 	double CC[16] = {0};
2425 #else
2426 	ALIGNED( double CC[16], 64 ) = {0};
2427 #endif
2428 
2429 	double alpha1 = 1.0;
2430 	double beta1 = 0.0;
2431 
2432 	kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, &beta1, CC, CC);
2433 
2434 	D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
2435 //	D[1+ldd*0] = beta[0]*C[1+ldc*0] + alpha[0]*CC[1+bs*0];
2436 //	D[2+ldd*0] = beta[0]*C[2+ldc*0] + alpha[0]*CC[2+bs*0];
2437 //	D[3+ldd*0] = beta[0]*C[3+ldc*0] + alpha[0]*CC[3+bs*0];
2438 
2439 	D[0+ldd*1] = beta[0]*C[0+ldc*1] + alpha[0]*CC[0+bs*1];
2440 	D[1+ldd*1] = beta[0]*C[1+ldc*1] + alpha[0]*CC[1+bs*1];
2441 //	D[2+ldd*1] = beta[0]*C[2+ldc*1] + alpha[0]*CC[2+bs*1];
2442 //	D[3+ldd*1] = beta[0]*C[3+ldc*1] + alpha[0]*CC[3+bs*1];
2443 
2444 	D[0+ldd*2] = beta[0]*C[0+ldc*2] + alpha[0]*CC[0+bs*2];
2445 	D[1+ldd*2] = beta[0]*C[1+ldc*2] + alpha[0]*CC[1+bs*2];
2446 	D[2+ldd*2] = beta[0]*C[2+ldc*2] + alpha[0]*CC[2+bs*2];
2447 //	D[3+ldd*2] = beta[0]*C[3+ldc*2] + alpha[0]*CC[3+bs*2];
2448 
2449 	D[0+ldd*3] = beta[0]*C[0+ldc*3] + alpha[0]*CC[0+bs*3];
2450 	D[1+ldd*3] = beta[0]*C[1+ldc*3] + alpha[0]*CC[1+bs*3];
2451 	D[2+ldd*3] = beta[0]*C[2+ldc*3] + alpha[0]*CC[2+bs*3];
2452 	D[3+ldd*3] = beta[0]*C[3+ldc*3] + alpha[0]*CC[3+bs*3];
2453 
2454 	return;
2455 
2456 	}
2457 #endif
2458 
2459 
2460 
2461 #if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9)
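// variable-size upper-triangular dsyrk.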
2462 void kernel_dsyrk_nt_u_4x4_vs_lib44cc(int kmax, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1)
2463 	{
2464 
2465 	const int bs = 4;
2466 
2467 #if defined(TARGET_GENERIC)
2468 	double CC[16] = {0};
2469 #else
2470 	ALIGNED( double CC[16], 64 ) = {0};
2471 #endif
2472 
2473 	double alpha1 = 1.0;
2474 	double beta1 = 0.0;
2475 
2476 	kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, &beta1, CC, CC);
2477 
2478 	if(m1>=4)
2479 		{
2480 		D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
2481 //		D[1+ldd*0] = beta[0]*C[1+ldc*0] + alpha[0]*CC[1+bs*0];
2482 //		D[2+ldd*0] = beta[0]*C[2+ldc*0] + alpha[0]*CC[2+bs*0];
2483 //		D[3+ldd*0] = beta[0]*C[3+ldc*0] + alpha[0]*CC[3+bs*0];
2484 
2485 		if(n1==1)
2486 			return;
2487 
2488 		D[0+ldd*1] = beta[0]*C[0+ldc*1] + alpha[0]*CC[0+bs*1];
2489 		D[1+ldd*1] = beta[0]*C[1+ldc*1] + alpha[0]*CC[1+bs*1];
2490 //		D[2+ldd*1] = beta[0]*C[2+ldc*1] + alpha[0]*CC[2+bs*1];
2491 //		D[3+ldd*1] = beta[0]*C[3+ldc*1] + alpha[0]*CC[3+bs*1];
2492 
2493 		if(n1==2)
2494 			return;
2495 
2496 		D[0+ldd*2] = beta[0]*C[0+ldc*2] + alpha[0]*CC[0+bs*2];
2497 		D[1+ldd*2] = beta[0]*C[1+ldc*2] + alpha[0]*CC[1+bs*2];
2498 		D[2+ldd*2] = beta[0]*C[2+ldc*2] + alpha[0]*CC[2+bs*2];
2499 //		D[3+ldd*2] = beta[0]*C[3+ldc*2] + alpha[0]*CC[3+bs*2];
2500 
2501 		if(n1==3)
2502 			return;
2503 
2504 		D[0+ldd*3] = beta[0]*C[0+ldc*3] + alpha[0]*CC[0+bs*3];
2505 		D[1+ldd*3] = beta[0]*C[1+ldc*3] + alpha[0]*CC[1+bs*3];
2506 		D[2+ldd*3] = beta[0]*C[2+ldc*3] + alpha[0]*CC[2+bs*3];
2507 		D[3+ldd*3] = beta[0]*C[3+ldc*3] + alpha[0]*CC[3+bs*3];
2508 		}
2509 	else if(m1>=3)
2510 		{
2511 		D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
2512 //		D[1+ldd*0] = beta[0]*C[1+ldc*0] + alpha[0]*CC[1+bs*0];
2513 //		D[2+ldd*0] = beta[0]*C[2+ldc*0] + alpha[0]*CC[2+bs*0];
2514 
2515 		if(n1==1)
2516 			return;
2517 
2518 		D[0+ldd*1] = beta[0]*C[0+ldc*1] + alpha[0]*CC[0+bs*1];
2519 		D[1+ldd*1] = beta[0]*C[1+ldc*1] + alpha[0]*CC[1+bs*1];
2520 //		D[2+ldd*1] = beta[0]*C[2+ldc*1] + alpha[0]*CC[2+bs*1];
2521 
2522 		if(n1==2)
2523 			return;
2524 
2525 		D[0+ldd*2] = beta[0]*C[0+ldc*2] + alpha[0]*CC[0+bs*2];
2526 		D[1+ldd*2] = beta[0]*C[1+ldc*2] + alpha[0]*CC[1+bs*2];
2527 		D[2+ldd*2] = beta[0]*C[2+ldc*2] + alpha[0]*CC[2+bs*2];
2528 
2529 		if(n1==3)
2530 			return;
2531 
2532 		D[0+ldd*3] = beta[0]*C[0+ldc*3] + alpha[0]*CC[0+bs*3];
2533 		D[1+ldd*3] = beta[0]*C[1+ldc*3] + alpha[0]*CC[1+bs*3];
2534 		D[2+ldd*3] = beta[0]*C[2+ldc*3] + alpha[0]*CC[2+bs*3];
2535 		}
2536 	else if(m1>=2)
2537 		{
2538 		D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
2539 //		D[1+ldd*0] = beta[0]*C[1+ldc*0] + alpha[0]*CC[1+bs*0];
2540 
2541 		if(n1==1)
2542 			return;
2543 
2544 		D[0+ldd*1] = beta[0]*C[0+ldc*1] + alpha[0]*CC[0+bs*1];
2545 		D[1+ldd*1] = beta[0]*C[1+ldc*1] + alpha[0]*CC[1+bs*1];
2546 
2547 		if(n1==2)
2548 			return;
2549 
2550 		D[0+ldd*2] = beta[0]*C[0+ldc*2] + alpha[0]*CC[0+bs*2];
2551 		D[1+ldd*2] = beta[0]*C[1+ldc*2] + alpha[0]*CC[1+bs*2];
2552 
2553 		if(n1==3)
2554 			return;
2555 
2556 		D[0+ldd*3] = beta[0]*C[0+ldc*3] + alpha[0]*CC[0+bs*3];
2557 		D[1+ldd*3] = beta[0]*C[1+ldc*3] + alpha[0]*CC[1+bs*3];
2558 		}
2559 	else //if(m1>=1)
2560 		{
2561 		D[0+ldd*0] = beta[0]*C[0+ldc*0] + alpha[0]*CC[0+bs*0];
2562 
2563 		if(n1==1)
2564 			return;
2565 
2566 		D[0+ldd*1] = beta[0]*C[0+ldc*1] + alpha[0]*CC[0+bs*1];
2567 
2568 		if(n1==2)
2569 			return;
2570 
2571 		D[0+ldd*2] = beta[0]*C[0+ldc*2] + alpha[0]*CC[0+bs*2];
2572 
2573 		if(n1==3)
2574 			return;
2575 
2576 		D[0+ldd*3] = beta[0]*C[0+ldc*3] + alpha[0]*CC[0+bs*3];
2577 		}
2578 
2579 	return;
2580 
2581 	}
2582 #endif
2583 
2584 
2585 
2586 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
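// dtrmm, right-lower-not-transposed: D = beta*C + alpha*A*B with B lower triangular.
// The first min(kmax,4) iterations are unrolled so that column j of CC only picks up
// contributions from rows k>=j of B (the zero upper part is skipped); the remaining
// kmax-k rows of B are dense and are delegated to the general nn kernel, which adds
// onto the already scaled partial result in CC and writes straight to D.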
2587 void kernel_dtrmm_nn_rl_4x4_lib4ccc(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd)
2588 	{
2589 
2590 	const int bs = 4;
2591 
2592 	double
2593 		a_0, a_1, a_2, a_3,
2594 		b_0, b_1, b_2, b_3;
2595 
2596 #if defined(TARGET_GENERIC)
2597 	double CC[16] = {0};
2598 #else
2599 	ALIGNED( double CC[16], 64 ) = {0};
2600 #endif
2601 
2602 	int k;
2603 
2604 	k = 0;
2605 
2606 	// k = 0
2607 
2608 	a_0 = A[0];
2609 	a_1 = A[1];
2610 	a_2 = A[2];
2611 	a_3 = A[3];
2612 
2613 	b_0 = B[0+0*ldb];
2614 	CC[0+bs*0] += a_0 * b_0;
2615 	CC[1+bs*0] += a_1 * b_0;
2616 	CC[2+bs*0] += a_2 * b_0;
2617 	CC[3+bs*0] += a_3 * b_0;
2618 
2619 	A += bs;
2620 	B += 1;
2621 	k += 1;
2622 
2623 	if(k>=kmax)
2624 		goto store;
2625 
2626 	// k = 1
2627 
2628 	a_0 = A[0];
2629 	a_1 = A[1];
2630 	a_2 = A[2];
2631 	a_3 = A[3];
2632 
2633 	b_0 = B[0+0*ldb];
2634 	CC[0+bs*0] += a_0 * b_0;
2635 	CC[1+bs*0] += a_1 * b_0;
2636 	CC[2+bs*0] += a_2 * b_0;
2637 	CC[3+bs*0] += a_3 * b_0;
2638 
2639 	b_1 = B[0+1*ldb];
2640 	CC[0+bs*1] += a_0 * b_1;
2641 	CC[1+bs*1] += a_1 * b_1;
2642 	CC[2+bs*1] += a_2 * b_1;
2643 	CC[3+bs*1] += a_3 * b_1;
2644 
2645 	A += bs;
2646 	B += 1;
2647 	k += 1;
2648 
2649 	if(k>=kmax)
2650 		goto store;
2651 
2652 	// k = 2
2653 
2654 	a_0 = A[0];
2655 	a_1 = A[1];
2656 	a_2 = A[2];
2657 	a_3 = A[3];
2658 
2659 	b_0 = B[0+0*ldb];
2660 	CC[0+bs*0] += a_0 * b_0;
2661 	CC[1+bs*0] += a_1 * b_0;
2662 	CC[2+bs*0] += a_2 * b_0;
2663 	CC[3+bs*0] += a_3 * b_0;
2664 
2665 	b_1 = B[0+1*ldb];
2666 	CC[0+bs*1] += a_0 * b_1;
2667 	CC[1+bs*1] += a_1 * b_1;
2668 	CC[2+bs*1] += a_2 * b_1;
2669 	CC[3+bs*1] += a_3 * b_1;
2670 
2671 	b_2 = B[0+2*ldb];
2672 	CC[0+bs*2] += a_0 * b_2;
2673 	CC[1+bs*2] += a_1 * b_2;
2674 	CC[2+bs*2] += a_2 * b_2;
2675 	CC[3+bs*2] += a_3 * b_2;
2676 
2677 	A += bs;
2678 	B += 1;
2679 	k += 1;
2680 
2681 	if(k>=kmax)
2682 		goto store;
2683 
2684 	// k = 3
2685 
2686 	a_0 = A[0];
2687 	a_1 = A[1];
2688 	a_2 = A[2];
2689 	a_3 = A[3];
2690 
2691 	b_0 = B[0+0*ldb];
2692 	CC[0+bs*0] += a_0 * b_0;
2693 	CC[1+bs*0] += a_1 * b_0;
2694 	CC[2+bs*0] += a_2 * b_0;
2695 	CC[3+bs*0] += a_3 * b_0;
2696 
2697 	b_1 = B[0+1*ldb];
2698 	CC[0+bs*1] += a_0 * b_1;
2699 	CC[1+bs*1] += a_1 * b_1;
2700 	CC[2+bs*1] += a_2 * b_1;
2701 	CC[3+bs*1] += a_3 * b_1;
2702 
2703 	b_2 = B[0+2*ldb];
2704 	CC[0+bs*2] += a_0 * b_2;
2705 	CC[1+bs*2] += a_1 * b_2;
2706 	CC[2+bs*2] += a_2 * b_2;
2707 	CC[3+bs*2] += a_3 * b_2;
2708 
2709 	b_3 = B[0+3*ldb];
2710 	CC[0+bs*3] += a_0 * b_3;
2711 	CC[1+bs*3] += a_1 * b_3;
2712 	CC[2+bs*3] += a_2 * b_3;
2713 	CC[3+bs*3] += a_3 * b_3;
2714 
2715 	A += bs;
2716 	B += 1;
2717 	k += 1;
2718 
2719 	store:
2720 
2721 	CC[0+bs*0] = alpha[0]*CC[0+bs*0] + beta[0]*C[0+ldc*0];
2722 	CC[1+bs*0] = alpha[0]*CC[1+bs*0] + beta[0]*C[1+ldc*0];
2723 	CC[2+bs*0] = alpha[0]*CC[2+bs*0] + beta[0]*C[2+ldc*0];
2724 	CC[3+bs*0] = alpha[0]*CC[3+bs*0] + beta[0]*C[3+ldc*0];
2725 
2726 	CC[0+bs*1] = alpha[0]*CC[0+bs*1] + beta[0]*C[0+ldc*1];
2727 	CC[1+bs*1] = alpha[0]*CC[1+bs*1] + beta[0]*C[1+ldc*1];
2728 	CC[2+bs*1] = alpha[0]*CC[2+bs*1] + beta[0]*C[2+ldc*1];
2729 	CC[3+bs*1] = alpha[0]*CC[3+bs*1] + beta[0]*C[3+ldc*1];
2730 
2731 	CC[0+bs*2] = alpha[0]*CC[0+bs*2] + beta[0]*C[0+ldc*2];
2732 	CC[1+bs*2] = alpha[0]*CC[1+bs*2] + beta[0]*C[1+ldc*2];
2733 	CC[2+bs*2] = alpha[0]*CC[2+bs*2] + beta[0]*C[2+ldc*2];
2734 	CC[3+bs*2] = alpha[0]*CC[3+bs*2] + beta[0]*C[3+ldc*2];
2735 
2736 	CC[0+bs*3] = alpha[0]*CC[0+bs*3] + beta[0]*C[0+ldc*3];
2737 	CC[1+bs*3] = alpha[0]*CC[1+bs*3] + beta[0]*C[1+ldc*3];
2738 	CC[2+bs*3] = alpha[0]*CC[2+bs*3] + beta[0]*C[2+ldc*3];
2739 	CC[3+bs*3] = alpha[0]*CC[3+bs*3] + beta[0]*C[3+ldc*3];
2740 
2741 	double beta1 = 1.0;
2742 
2743 	kernel_dgemm_nn_4x4_lib4ccc(kmax-k, alpha, A, B, ldb, &beta1, CC, bs, D, ldd);
2744 
2745 	return;
2746 
2747 	}
2748 #endif
2749 
2750 
2751 
2752 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
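// variable-size variant of the dtrmm kernel above; here the tail gemm accumulates back
// into CC so that only the top-left m1 x n1 part of the block is written to D.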
2753 void kernel_dtrmm_nn_rl_4x4_vs_lib4ccc(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1)
2754 	{
2755 
2756 	const int bs = 4;
2757 
2758 	double
2759 		a_0, a_1, a_2, a_3,
2760 		b_0, b_1, b_2, b_3;
2761 
2762 #if defined(TARGET_GENERIC)
2763 	double CC[16] = {0};
2764 #else
2765 	ALIGNED( double CC[16], 64 ) = {0};
2766 #endif
2767 
2768 	int k;
2769 
2770 	k = 0;
2771 
2772 	// k = 0
2773 
2774 	a_0 = A[0];
2775 	a_1 = A[1];
2776 	a_2 = A[2];
2777 	a_3 = A[3];
2778 
2779 	b_0 = B[0+0*ldb];
2780 	CC[0+bs*0] += a_0 * b_0;
2781 	CC[1+bs*0] += a_1 * b_0;
2782 	CC[2+bs*0] += a_2 * b_0;
2783 	CC[3+bs*0] += a_3 * b_0;
2784 
2785 	A += bs;
2786 	B += 1;
2787 	k += 1;
2788 
2789 	if(k>=kmax)
2790 		goto store;
2791 
2792 	// k = 1
2793 
2794 	a_0 = A[0];
2795 	a_1 = A[1];
2796 	a_2 = A[2];
2797 	a_3 = A[3];
2798 
2799 	b_0 = B[0+0*ldb];
2800 	CC[0+bs*0] += a_0 * b_0;
2801 	CC[1+bs*0] += a_1 * b_0;
2802 	CC[2+bs*0] += a_2 * b_0;
2803 	CC[3+bs*0] += a_3 * b_0;
2804 
2805 	b_1 = B[0+1*ldb];
2806 	CC[0+bs*1] += a_0 * b_1;
2807 	CC[1+bs*1] += a_1 * b_1;
2808 	CC[2+bs*1] += a_2 * b_1;
2809 	CC[3+bs*1] += a_3 * b_1;
2810 
2811 	A += bs;
2812 	B += 1;
2813 	k += 1;
2814 
2815 	if(k>=kmax)
2816 		goto store;
2817 
2818 	// k = 2
2819 
2820 	a_0 = A[0];
2821 	a_1 = A[1];
2822 	a_2 = A[2];
2823 	a_3 = A[3];
2824 
2825 	b_0 = B[0+0*ldb];
2826 	CC[0+bs*0] += a_0 * b_0;
2827 	CC[1+bs*0] += a_1 * b_0;
2828 	CC[2+bs*0] += a_2 * b_0;
2829 	CC[3+bs*0] += a_3 * b_0;
2830 
2831 	b_1 = B[0+1*ldb];
2832 	CC[0+bs*1] += a_0 * b_1;
2833 	CC[1+bs*1] += a_1 * b_1;
2834 	CC[2+bs*1] += a_2 * b_1;
2835 	CC[3+bs*1] += a_3 * b_1;
2836 
2837 	b_2 = B[0+2*ldb];
2838 	CC[0+bs*2] += a_0 * b_2;
2839 	CC[1+bs*2] += a_1 * b_2;
2840 	CC[2+bs*2] += a_2 * b_2;
2841 	CC[3+bs*2] += a_3 * b_2;
2842 
2843 	A += bs;
2844 	B += 1;
2845 	k += 1;
2846 
2847 	if(k>=kmax)
2848 		goto store;
2849 
2850 	// k = 3
2851 
2852 	a_0 = A[0];
2853 	a_1 = A[1];
2854 	a_2 = A[2];
2855 	a_3 = A[3];
2856 
2857 	b_0 = B[0+0*ldb];
2858 	CC[0+bs*0] += a_0 * b_0;
2859 	CC[1+bs*0] += a_1 * b_0;
2860 	CC[2+bs*0] += a_2 * b_0;
2861 	CC[3+bs*0] += a_3 * b_0;
2862 
2863 	b_1 = B[0+1*ldb];
2864 	CC[0+bs*1] += a_0 * b_1;
2865 	CC[1+bs*1] += a_1 * b_1;
2866 	CC[2+bs*1] += a_2 * b_1;
2867 	CC[3+bs*1] += a_3 * b_1;
2868 
2869 	b_2 = B[0+2*ldb];
2870 	CC[0+bs*2] += a_0 * b_2;
2871 	CC[1+bs*2] += a_1 * b_2;
2872 	CC[2+bs*2] += a_2 * b_2;
2873 	CC[3+bs*2] += a_3 * b_2;
2874 
2875 	b_3 = B[0+3*ldb];
2876 	CC[0+bs*3] += a_0 * b_3;
2877 	CC[1+bs*3] += a_1 * b_3;
2878 	CC[2+bs*3] += a_2 * b_3;
2879 	CC[3+bs*3] += a_3 * b_3;
2880 
2881 	A += bs;
2882 	B += 1;
2883 	k += 1;
2884 
2885 	store:
2886 
2887 	CC[0+bs*0] = alpha[0]*CC[0+bs*0] + beta[0]*C[0+ldc*0];
2888 	CC[1+bs*0] = alpha[0]*CC[1+bs*0] + beta[0]*C[1+ldc*0];
2889 	CC[2+bs*0] = alpha[0]*CC[2+bs*0] + beta[0]*C[2+ldc*0];
2890 	CC[3+bs*0] = alpha[0]*CC[3+bs*0] + beta[0]*C[3+ldc*0];
2891 
2892 	CC[0+bs*1] = alpha[0]*CC[0+bs*1] + beta[0]*C[0+ldc*1];
2893 	CC[1+bs*1] = alpha[0]*CC[1+bs*1] + beta[0]*C[1+ldc*1];
2894 	CC[2+bs*1] = alpha[0]*CC[2+bs*1] + beta[0]*C[2+ldc*1];
2895 	CC[3+bs*1] = alpha[0]*CC[3+bs*1] + beta[0]*C[3+ldc*1];
2896 
2897 	CC[0+bs*2] = alpha[0]*CC[0+bs*2] + beta[0]*C[0+ldc*2];
2898 	CC[1+bs*2] = alpha[0]*CC[1+bs*2] + beta[0]*C[1+ldc*2];
2899 	CC[2+bs*2] = alpha[0]*CC[2+bs*2] + beta[0]*C[2+ldc*2];
2900 	CC[3+bs*2] = alpha[0]*CC[3+bs*2] + beta[0]*C[3+ldc*2];
2901 
2902 	CC[0+bs*3] = alpha[0]*CC[0+bs*3] + beta[0]*C[0+ldc*3];
2903 	CC[1+bs*3] = alpha[0]*CC[1+bs*3] + beta[0]*C[1+ldc*3];
2904 	CC[2+bs*3] = alpha[0]*CC[2+bs*3] + beta[0]*C[2+ldc*3];
2905 	CC[3+bs*3] = alpha[0]*CC[3+bs*3] + beta[0]*C[3+ldc*3];
2906 
2907 	double beta1 = 1.0;
2908 
2909 	kernel_dgemm_nn_4x4_lib4ccc(kmax-k, alpha, A, B, ldb, &beta1, CC, bs, CC, bs);
2910 
2911 	if(m1>=4)
2912 		{
2913 		D[0+ldd*0] = CC[0+bs*0];
2914 		D[1+ldd*0] = CC[1+bs*0];
2915 		D[2+ldd*0] = CC[2+bs*0];
2916 		D[3+ldd*0] = CC[3+bs*0];
2917 
2918 		if(n1==1)
2919 			return;
2920 
2921 		D[0+ldd*1] = CC[0+bs*1];
2922 		D[1+ldd*1] = CC[1+bs*1];
2923 		D[2+ldd*1] = CC[2+bs*1];
2924 		D[3+ldd*1] = CC[3+bs*1];
2925 
2926 		if(n1==2)
2927 			return;
2928 
2929 		D[0+ldd*2] = CC[0+bs*2];
2930 		D[1+ldd*2] = CC[1+bs*2];
2931 		D[2+ldd*2] = CC[2+bs*2];
2932 		D[3+ldd*2] = CC[3+bs*2];
2933 
2934 		if(n1==3)
2935 			return;
2936 
2937 		D[0+ldd*3] = CC[0+bs*3];
2938 		D[1+ldd*3] = CC[1+bs*3];
2939 		D[2+ldd*3] = CC[2+bs*3];
2940 		D[3+ldd*3] = CC[3+bs*3];
2941 		}
2942 	else if(m1>=3)
2943 		{
2944 		D[0+ldd*0] = CC[0+bs*0];
2945 		D[1+ldd*0] = CC[1+bs*0];
2946 		D[2+ldd*0] = CC[2+bs*0];
2947 
2948 		if(n1==1)
2949 			return;
2950 
2951 		D[0+ldd*1] = CC[0+bs*1];
2952 		D[1+ldd*1] = CC[1+bs*1];
2953 		D[2+ldd*1] = CC[2+bs*1];
2954 
2955 		if(n1==2)
2956 			return;
2957 
2958 		D[0+ldd*2] = CC[0+bs*2];
2959 		D[1+ldd*2] = CC[1+bs*2];
2960 		D[2+ldd*2] = CC[2+bs*2];
2961 
2962 		if(n1==3)
2963 			return;
2964 
2965 		D[0+ldd*3] = CC[0+bs*3];
2966 		D[1+ldd*3] = CC[1+bs*3];
2967 		D[2+ldd*3] = CC[2+bs*3];
2968 		}
2969 	else if(m1>=2)
2970 		{
2971 		D[0+ldd*0] = CC[0+bs*0];
2972 		D[1+ldd*0] = CC[1+bs*0];
2973 
2974 		if(n1==1)
2975 			return;
2976 
2977 		D[0+ldd*1] = CC[0+bs*1];
2978 		D[1+ldd*1] = CC[1+bs*1];
2979 
2980 		if(n1==2)
2981 			return;
2982 
2983 		D[0+ldd*2] = CC[0+bs*2];
2984 		D[1+ldd*2] = CC[1+bs*2];
2985 
2986 		if(n1==3)
2987 			return;
2988 
2989 		D[0+ldd*3] = CC[0+bs*3];
2990 		D[1+ldd*3] = CC[1+bs*3];
2991 		}
2992 	else //if(m1>=1)
2993 		{
2994 		D[0+ldd*0] = CC[0+bs*0];
2995 
2996 		if(n1==1)
2997 			return;
2998 
2999 		D[0+ldd*1] = CC[0+bs*1];
3000 
3001 		if(n1==2)
3002 			return;
3003 
3004 		D[0+ldd*2] = CC[0+bs*2];
3005 
3006 		if(n1==3)
3007 			return;
3008 
3009 		D[0+ldd*3] = CC[0+bs*3];
3010 		}
3011 
3012 	return;
3013 
3014 	}
3015 #endif
3016 
3017 
3018 
3019 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
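// dtrmm with transposed store: alpha*A*B (B lower triangular) plus beta*C is written to D
// with row and column indices swapped; C is panel-major (lib4) here, D is column-major.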
3020 void kernel_dtrmm_nn_rl_4x4_tran_lib4c4c(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd)
3021 	{
3022 
3023 	const int bs = 4;
3024 
3025 	double
3026 		a_0, a_1, a_2, a_3,
3027 		b_0, b_1, b_2, b_3;
3028 
3029 #if defined(TARGET_GENERIC)
3030 	double CC[16] = {0};
3031 #else
3032 	ALIGNED( double CC[16], 64 ) = {0};
3033 #endif
3034 
3035 	int k;
3036 
3037 	k = 0;
3038 
3039 	// k = 0
3040 
3041 	a_0 = A[0];
3042 	a_1 = A[1];
3043 	a_2 = A[2];
3044 	a_3 = A[3];
3045 
3046 	b_0 = B[0+0*ldb];
3047 	CC[0+bs*0] += a_0 * b_0;
3048 	CC[1+bs*0] += a_1 * b_0;
3049 	CC[2+bs*0] += a_2 * b_0;
3050 	CC[3+bs*0] += a_3 * b_0;
3051 
3052 	A += bs;
3053 	B += 1;
3054 	k += 1;
3055 
3056 	if(k>=kmax)
3057 		goto store;
3058 
3059 	// k = 1
3060 
3061 	a_0 = A[0];
3062 	a_1 = A[1];
3063 	a_2 = A[2];
3064 	a_3 = A[3];
3065 
3066 	b_0 = B[0+0*ldb];
3067 	CC[0+bs*0] += a_0 * b_0;
3068 	CC[1+bs*0] += a_1 * b_0;
3069 	CC[2+bs*0] += a_2 * b_0;
3070 	CC[3+bs*0] += a_3 * b_0;
3071 
3072 	b_1 = B[0+1*ldb];
3073 	CC[0+bs*1] += a_0 * b_1;
3074 	CC[1+bs*1] += a_1 * b_1;
3075 	CC[2+bs*1] += a_2 * b_1;
3076 	CC[3+bs*1] += a_3 * b_1;
3077 
3078 	A += bs;
3079 	B += 1;
3080 	k += 1;
3081 
3082 	if(k>=kmax)
3083 		goto store;
3084 
3085 	// k = 2
3086 
3087 	a_0 = A[0];
3088 	a_1 = A[1];
3089 	a_2 = A[2];
3090 	a_3 = A[3];
3091 
3092 	b_0 = B[0+0*ldb];
3093 	CC[0+bs*0] += a_0 * b_0;
3094 	CC[1+bs*0] += a_1 * b_0;
3095 	CC[2+bs*0] += a_2 * b_0;
3096 	CC[3+bs*0] += a_3 * b_0;
3097 
3098 	b_1 = B[0+1*ldb];
3099 	CC[0+bs*1] += a_0 * b_1;
3100 	CC[1+bs*1] += a_1 * b_1;
3101 	CC[2+bs*1] += a_2 * b_1;
3102 	CC[3+bs*1] += a_3 * b_1;
3103 
3104 	b_2 = B[0+2*ldb];
3105 	CC[0+bs*2] += a_0 * b_2;
3106 	CC[1+bs*2] += a_1 * b_2;
3107 	CC[2+bs*2] += a_2 * b_2;
3108 	CC[3+bs*2] += a_3 * b_2;
3109 
3110 	A += bs;
3111 	B += 1;
3112 	k += 1;
3113 
3114 	if(k>=kmax)
3115 		goto store;
3116 
3117 	// k = 3
3118 
3119 	a_0 = A[0];
3120 	a_1 = A[1];
3121 	a_2 = A[2];
3122 	a_3 = A[3];
3123 
3124 	b_0 = B[0+0*ldb];
3125 	CC[0+bs*0] += a_0 * b_0;
3126 	CC[1+bs*0] += a_1 * b_0;
3127 	CC[2+bs*0] += a_2 * b_0;
3128 	CC[3+bs*0] += a_3 * b_0;
3129 
3130 	b_1 = B[0+1*ldb];
3131 	CC[0+bs*1] += a_0 * b_1;
3132 	CC[1+bs*1] += a_1 * b_1;
3133 	CC[2+bs*1] += a_2 * b_1;
3134 	CC[3+bs*1] += a_3 * b_1;
3135 
3136 	b_2 = B[0+2*ldb];
3137 	CC[0+bs*2] += a_0 * b_2;
3138 	CC[1+bs*2] += a_1 * b_2;
3139 	CC[2+bs*2] += a_2 * b_2;
3140 	CC[3+bs*2] += a_3 * b_2;
3141 
3142 	b_3 = B[0+3*ldb];
3143 	CC[0+bs*3] += a_0 * b_3;
3144 	CC[1+bs*3] += a_1 * b_3;
3145 	CC[2+bs*3] += a_2 * b_3;
3146 	CC[3+bs*3] += a_3 * b_3;
3147 
3148 	A += bs;
3149 	B += 1;
3150 	k += 1;
3151 
3152 	double alpha1 = 1.0;
3153 	double beta1 = 1.0;
3154 
3155 	kernel_dgemm_nn_4x4_lib4ccc(kmax-k, &alpha1, A, B, ldb, &beta1, CC, bs, CC, bs);
3156 
3157 	store:
3158 
3159 	// scale & transpose & store
3160 	D[0+ldd*0] = alpha[0]*CC[0+bs*0] + beta[0]*C[0+bs*0];
3161 	D[0+ldd*1] = alpha[0]*CC[1+bs*0] + beta[0]*C[1+bs*0];
3162 	D[0+ldd*2] = alpha[0]*CC[2+bs*0] + beta[0]*C[2+bs*0];
3163 	D[0+ldd*3] = alpha[0]*CC[3+bs*0] + beta[0]*C[3+bs*0];
3164 
3165 	D[1+ldd*0] = alpha[0]*CC[0+bs*1] + beta[0]*C[0+bs*1];
3166 	D[1+ldd*1] = alpha[0]*CC[1+bs*1] + beta[0]*C[1+bs*1];
3167 	D[1+ldd*2] = alpha[0]*CC[2+bs*1] + beta[0]*C[2+bs*1];
3168 	D[1+ldd*3] = alpha[0]*CC[3+bs*1] + beta[0]*C[3+bs*1];
3169 
3170 	D[2+ldd*0] = alpha[0]*CC[0+bs*2] + beta[0]*C[0+bs*2];
3171 	D[2+ldd*1] = alpha[0]*CC[1+bs*2] + beta[0]*C[1+bs*2];
3172 	D[2+ldd*2] = alpha[0]*CC[2+bs*2] + beta[0]*C[2+bs*2];
3173 	D[2+ldd*3] = alpha[0]*CC[3+bs*2] + beta[0]*C[3+bs*2];
3174 
3175 	D[3+ldd*0] = alpha[0]*CC[0+bs*3] + beta[0]*C[0+bs*3];
3176 	D[3+ldd*1] = alpha[0]*CC[1+bs*3] + beta[0]*C[1+bs*3];
3177 	D[3+ldd*2] = alpha[0]*CC[2+bs*3] + beta[0]*C[2+bs*3];
3178 	D[3+ldd*3] = alpha[0]*CC[3+bs*3] + beta[0]*C[3+bs*3];
3179 
3180 	return;
3181 
3182 	}
3183 #endif
3184 
3185 
3186 
3187 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
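// variable-size variant of the transposed-store dtrmm: scale, transpose CC in place,
// then store only the top-left m1 x n1 part.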
3188 void kernel_dtrmm_nn_rl_4x4_tran_vs_lib4c4c(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd, int m1, int n1)
3189 	{
3190 
3191 	const int bs = 4;
3192 
3193 	double
3194 		tmp,
3195 		a_0, a_1, a_2, a_3,
3196 		b_0, b_1, b_2, b_3;
3197 
3198 #if defined(TARGET_GENERIC)
3199 	double CC[16] = {0};
3200 #else
3201 	ALIGNED( double CC[16], 64 ) = {0};
3202 #endif
3203 
3204 	int k;
3205 
3206 	k = 0;
3207 
3208 	// k = 0
3209 
3210 	a_0 = A[0];
3211 	a_1 = A[1];
3212 	a_2 = A[2];
3213 	a_3 = A[3];
3214 
3215 	b_0 = B[0+0*ldb];
3216 	CC[0+bs*0] += a_0 * b_0;
3217 	CC[1+bs*0] += a_1 * b_0;
3218 	CC[2+bs*0] += a_2 * b_0;
3219 	CC[3+bs*0] += a_3 * b_0;
3220 
3221 	A += bs;
3222 	B += 1;
3223 	k += 1;
3224 
3225 	if(k>=kmax)
3226 		goto store;
3227 
3228 	// k = 1
3229 
3230 	a_0 = A[0];
3231 	a_1 = A[1];
3232 	a_2 = A[2];
3233 	a_3 = A[3];
3234 
3235 	b_0 = B[0+0*ldb];
3236 	CC[0+bs*0] += a_0 * b_0;
3237 	CC[1+bs*0] += a_1 * b_0;
3238 	CC[2+bs*0] += a_2 * b_0;
3239 	CC[3+bs*0] += a_3 * b_0;
3240 
3241 	b_1 = B[0+1*ldb];
3242 	CC[0+bs*1] += a_0 * b_1;
3243 	CC[1+bs*1] += a_1 * b_1;
3244 	CC[2+bs*1] += a_2 * b_1;
3245 	CC[3+bs*1] += a_3 * b_1;
3246 
3247 	A += bs;
3248 	B += 1;
3249 	k += 1;
3250 
3251 	if(k>=kmax)
3252 		goto store;
3253 
3254 	// k = 2
3255 
3256 	a_0 = A[0];
3257 	a_1 = A[1];
3258 	a_2 = A[2];
3259 	a_3 = A[3];
3260 
3261 	b_0 = B[0+0*ldb];
3262 	CC[0+bs*0] += a_0 * b_0;
3263 	CC[1+bs*0] += a_1 * b_0;
3264 	CC[2+bs*0] += a_2 * b_0;
3265 	CC[3+bs*0] += a_3 * b_0;
3266 
3267 	b_1 = B[0+1*ldb];
3268 	CC[0+bs*1] += a_0 * b_1;
3269 	CC[1+bs*1] += a_1 * b_1;
3270 	CC[2+bs*1] += a_2 * b_1;
3271 	CC[3+bs*1] += a_3 * b_1;
3272 
3273 	b_2 = B[0+2*ldb];
3274 	CC[0+bs*2] += a_0 * b_2;
3275 	CC[1+bs*2] += a_1 * b_2;
3276 	CC[2+bs*2] += a_2 * b_2;
3277 	CC[3+bs*2] += a_3 * b_2;
3278 
3279 	A += bs;
3280 	B += 1;
3281 	k += 1;
3282 
3283 	if(k>=kmax)
3284 		goto store;
3285 
3286 	// k = 3
3287 
3288 	a_0 = A[0];
3289 	a_1 = A[1];
3290 	a_2 = A[2];
3291 	a_3 = A[3];
3292 
3293 	b_0 = B[0+0*ldb];
3294 	CC[0+bs*0] += a_0 * b_0;
3295 	CC[1+bs*0] += a_1 * b_0;
3296 	CC[2+bs*0] += a_2 * b_0;
3297 	CC[3+bs*0] += a_3 * b_0;
3298 
3299 	b_1 = B[0+1*ldb];
3300 	CC[0+bs*1] += a_0 * b_1;
3301 	CC[1+bs*1] += a_1 * b_1;
3302 	CC[2+bs*1] += a_2 * b_1;
3303 	CC[3+bs*1] += a_3 * b_1;
3304 
3305 	b_2 = B[0+2*ldb];
3306 	CC[0+bs*2] += a_0 * b_2;
3307 	CC[1+bs*2] += a_1 * b_2;
3308 	CC[2+bs*2] += a_2 * b_2;
3309 	CC[3+bs*2] += a_3 * b_2;
3310 
3311 	b_3 = B[0+3*ldb];
3312 	CC[0+bs*3] += a_0 * b_3;
3313 	CC[1+bs*3] += a_1 * b_3;
3314 	CC[2+bs*3] += a_2 * b_3;
3315 	CC[3+bs*3] += a_3 * b_3;
3316 
3317 	A += bs;
3318 	B += 1;
3319 	k += 1;
3320 
3321 	double alpha1 = 1.0;
3322 	double beta1 = 1.0;
3323 
3324 	kernel_dgemm_nn_4x4_lib4ccc(kmax-k, &alpha1, A, B, ldb, &beta1, CC, bs, CC, bs);
3325 
3326 	store:
3327 
3328 	// scale
3329 	CC[0+bs*0] = alpha[0]*CC[0+bs*0] + beta[0]*C[0+bs*0];
3330 	CC[1+bs*0] = alpha[0]*CC[1+bs*0] + beta[0]*C[1+bs*0];
3331 	CC[2+bs*0] = alpha[0]*CC[2+bs*0] + beta[0]*C[2+bs*0];
3332 	CC[3+bs*0] = alpha[0]*CC[3+bs*0] + beta[0]*C[3+bs*0];
3333 
3334 	CC[0+bs*1] = alpha[0]*CC[0+bs*1] + beta[0]*C[0+bs*1];
3335 	CC[1+bs*1] = alpha[0]*CC[1+bs*1] + beta[0]*C[1+bs*1];
3336 	CC[2+bs*1] = alpha[0]*CC[2+bs*1] + beta[0]*C[2+bs*1];
3337 	CC[3+bs*1] = alpha[0]*CC[3+bs*1] + beta[0]*C[3+bs*1];
3338 
3339 	CC[0+bs*2] = alpha[0]*CC[0+bs*2] + beta[0]*C[0+bs*2];
3340 	CC[1+bs*2] = alpha[0]*CC[1+bs*2] + beta[0]*C[1+bs*2];
3341 	CC[2+bs*2] = alpha[0]*CC[2+bs*2] + beta[0]*C[2+bs*2];
3342 	CC[3+bs*2] = alpha[0]*CC[3+bs*2] + beta[0]*C[3+bs*2];
3343 
3344 	CC[0+bs*3] = alpha[0]*CC[0+bs*3] + beta[0]*C[0+bs*3];
3345 	CC[1+bs*3] = alpha[0]*CC[1+bs*3] + beta[0]*C[1+bs*3];
3346 	CC[2+bs*3] = alpha[0]*CC[2+bs*3] + beta[0]*C[2+bs*3];
3347 	CC[3+bs*3] = alpha[0]*CC[3+bs*3] + beta[0]*C[3+bs*3];
3348 
3349 	// transpose
3350 	tmp = CC[1+bs*0]; CC[1+bs*0] = CC[0+bs*1]; CC[0+bs*1] = tmp;
3351 	tmp = CC[2+bs*0]; CC[2+bs*0] = CC[0+bs*2]; CC[0+bs*2] = tmp;
3352 	tmp = CC[3+bs*0]; CC[3+bs*0] = CC[0+bs*3]; CC[0+bs*3] = tmp;
3353 
3354 	tmp = CC[2+bs*1]; CC[2+bs*1] = CC[1+bs*2]; CC[1+bs*2] = tmp;
3355 	tmp = CC[3+bs*1]; CC[3+bs*1] = CC[1+bs*3]; CC[1+bs*3] = tmp;
3356 
3357 	tmp = CC[3+bs*2]; CC[3+bs*2] = CC[2+bs*3]; CC[2+bs*3] = tmp;
3358 
3359 	// store
3360 	if(m1>=4)
3361 		{
3362 		D[0+ldd*0] = CC[0+bs*0];
3363 		D[1+ldd*0] = CC[1+bs*0];
3364 		D[2+ldd*0] = CC[2+bs*0];
3365 		D[3+ldd*0] = CC[3+bs*0];
3366 
3367 		if(n1==1)
3368 			return;
3369 
3370 		D[0+ldd*1] = CC[0+bs*1];
3371 		D[1+ldd*1] = CC[1+bs*1];
3372 		D[2+ldd*1] = CC[2+bs*1];
3373 		D[3+ldd*1] = CC[3+bs*1];
3374 
3375 		if(n1==2)
3376 			return;
3377 
3378 		D[0+ldd*2] = CC[0+bs*2];
3379 		D[1+ldd*2] = CC[1+bs*2];
3380 		D[2+ldd*2] = CC[2+bs*2];
3381 		D[3+ldd*2] = CC[3+bs*2];
3382 
3383 		if(n1==3)
3384 			return;
3385 
3386 		D[0+ldd*3] = CC[0+bs*3];
3387 		D[1+ldd*3] = CC[1+bs*3];
3388 		D[2+ldd*3] = CC[2+bs*3];
3389 		D[3+ldd*3] = CC[3+bs*3];
3390 		}
3391 	else if(m1>=3)
3392 		{
3393 		D[0+ldd*0] = CC[0+bs*0];
3394 		D[1+ldd*0] = CC[1+bs*0];
3395 		D[2+ldd*0] = CC[2+bs*0];
3396 
3397 		if(n1==1)
3398 			return;
3399 
3400 		D[0+ldd*1] = CC[0+bs*1];
3401 		D[1+ldd*1] = CC[1+bs*1];
3402 		D[2+ldd*1] = CC[2+bs*1];
3403 
3404 		if(n1==2)
3405 			return;
3406 
3407 		D[0+ldd*2] = CC[0+bs*2];
3408 		D[1+ldd*2] = CC[1+bs*2];
3409 		D[2+ldd*2] = CC[2+bs*2];
3410 
3411 		if(n1==3)
3412 			return;
3413 
3414 		D[0+ldd*3] = CC[0+bs*3];
3415 		D[1+ldd*3] = CC[1+bs*3];
3416 		D[2+ldd*3] = CC[2+bs*3];
3417 		}
3418 	else if(m1>=2)
3419 		{
3420 		D[0+ldd*0] = CC[0+bs*0];
3421 		D[1+ldd*0] = CC[1+bs*0];
3422 
3423 		if(n1==1)
3424 			return;
3425 
3426 		D[0+ldd*1] = CC[0+bs*1];
3427 		D[1+ldd*1] = CC[1+bs*1];
3428 
3429 		if(n1==2)
3430 			return;
3431 
3432 		D[0+ldd*2] = CC[0+bs*2];
3433 		D[1+ldd*2] = CC[1+bs*2];
3434 
3435 		if(n1==3)
3436 			return;
3437 
3438 		D[0+ldd*3] = CC[0+bs*3];
3439 		D[1+ldd*3] = CC[1+bs*3];
3440 		}
3441 	else //if(m1>=1)
3442 		{
3443 		D[0+ldd*0] = CC[0+bs*0];
3444 
3445 		if(n1==1)
3446 			return;
3447 
3448 		D[0+ldd*1] = CC[0+bs*1];
3449 
3450 		if(n1==2)
3451 			return;
3452 
3453 		D[0+ldd*2] = CC[0+bs*2];
3454 
3455 		if(n1==3)
3456 			return;
3457 
3458 		D[0+ldd*3] = CC[0+bs*3];
3459 		}
3460 
3461 	return;
3462 
3463 	}
3464 #endif
3465 
3466 
3467 
3468 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
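// unit-diagonal dtrmm (right-lower-not-transposed): the diagonal entry of B is taken
// as 1 in the unrolled iterations, so the corresponding column of A is added to CC
// directly instead of being multiplied by a loaded diagonal element.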
3469 void kernel_dtrmm_nn_rl_one_4x4_lib4ccc(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd)
3470 	{
3471 
3472 	const int bs = 4;
3473 
3474 	double
3475 		a_0, a_1, a_2, a_3,
3476 		b_0, b_1, b_2, b_3;
3477 
3478 #if defined(TARGET_GENERIC)
3479 	double CC[16] = {0};
3480 #else
3481 	ALIGNED( double CC[16], 64 ) = {0};
3482 #endif
3483 
3484 	int k;
3485 
3486 	k = 0;
3487 
3488 	// k = 0
3489 
3490 	a_0 = A[0];
3491 	a_1 = A[1];
3492 	a_2 = A[2];
3493 	a_3 = A[3];
3494 
3495 	CC[0+bs*0] += a_0;
3496 	CC[1+bs*0] += a_1;
3497 	CC[2+bs*0] += a_2;
3498 	CC[3+bs*0] += a_3;
3499 
3500 	A += bs;
3501 	B += 1;
3502 	k += 1;
3503 
3504 	if(k>=kmax)
3505 		goto store;
3506 
3507 	// k = 1
3508 
3509 	a_0 = A[0];
3510 	a_1 = A[1];
3511 	a_2 = A[2];
3512 	a_3 = A[3];
3513 
3514 	b_0 = B[0+0*ldb];
3515 	CC[0+bs*0] += a_0 * b_0;
3516 	CC[1+bs*0] += a_1 * b_0;
3517 	CC[2+bs*0] += a_2 * b_0;
3518 	CC[3+bs*0] += a_3 * b_0;
3519 
3520 	CC[0+bs*1] += a_0;
3521 	CC[1+bs*1] += a_1;
3522 	CC[2+bs*1] += a_2;
3523 	CC[3+bs*1] += a_3;
3524 
3525 	A += bs;
3526 	B += 1;
3527 	k += 1;
3528 
3529 	if(k>=kmax)
3530 		goto store;
3531 
3532 	// k = 2
3533 
3534 	a_0 = A[0];
3535 	a_1 = A[1];
3536 	a_2 = A[2];
3537 	a_3 = A[3];
3538 
3539 	b_0 = B[0+0*ldb];
3540 	CC[0+bs*0] += a_0 * b_0;
3541 	CC[1+bs*0] += a_1 * b_0;
3542 	CC[2+bs*0] += a_2 * b_0;
3543 	CC[3+bs*0] += a_3 * b_0;
3544 
3545 	b_1 = B[0+1*ldb];
3546 	CC[0+bs*1] += a_0 * b_1;
3547 	CC[1+bs*1] += a_1 * b_1;
3548 	CC[2+bs*1] += a_2 * b_1;
3549 	CC[3+bs*1] += a_3 * b_1;
3550 
3551 	CC[0+bs*2] += a_0;
3552 	CC[1+bs*2] += a_1;
3553 	CC[2+bs*2] += a_2;
3554 	CC[3+bs*2] += a_3;
3555 
3556 	A += bs;
3557 	B += 1;
3558 	k += 1;
3559 
3560 	if(k>=kmax)
3561 		goto store;
3562 
3563 	// k = 3
3564 
3565 	a_0 = A[0];
3566 	a_1 = A[1];
3567 	a_2 = A[2];
3568 	a_3 = A[3];
3569 
3570 	b_0 = B[0+0*ldb];
3571 	CC[0+bs*0] += a_0 * b_0;
3572 	CC[1+bs*0] += a_1 * b_0;
3573 	CC[2+bs*0] += a_2 * b_0;
3574 	CC[3+bs*0] += a_3 * b_0;
3575 
3576 	b_1 = B[0+1*ldb];
3577 	CC[0+bs*1] += a_0 * b_1;
3578 	CC[1+bs*1] += a_1 * b_1;
3579 	CC[2+bs*1] += a_2 * b_1;
3580 	CC[3+bs*1] += a_3 * b_1;
3581 
3582 	b_2 = B[0+2*ldb];
3583 	CC[0+bs*2] += a_0 * b_2;
3584 	CC[1+bs*2] += a_1 * b_2;
3585 	CC[2+bs*2] += a_2 * b_2;
3586 	CC[3+bs*2] += a_3 * b_2;
3587 
3588 	CC[0+bs*3] += a_0;
3589 	CC[1+bs*3] += a_1;
3590 	CC[2+bs*3] += a_2;
3591 	CC[3+bs*3] += a_3;
3592 
3593 	A += bs;
3594 	B += 1;
3595 	k += 1;
3596 
3597 	store:
3598 
3599 	CC[0+bs*0] = alpha[0]*CC[0+bs*0] + beta[0]*C[0+ldc*0];
3600 	CC[1+bs*0] = alpha[0]*CC[1+bs*0] + beta[0]*C[1+ldc*0];
3601 	CC[2+bs*0] = alpha[0]*CC[2+bs*0] + beta[0]*C[2+ldc*0];
3602 	CC[3+bs*0] = alpha[0]*CC[3+bs*0] + beta[0]*C[3+ldc*0];
3603 
3604 	CC[0+bs*1] = alpha[0]*CC[0+bs*1] + beta[0]*C[0+ldc*1];
3605 	CC[1+bs*1] = alpha[0]*CC[1+bs*1] + beta[0]*C[1+ldc*1];
3606 	CC[2+bs*1] = alpha[0]*CC[2+bs*1] + beta[0]*C[2+ldc*1];
3607 	CC[3+bs*1] = alpha[0]*CC[3+bs*1] + beta[0]*C[3+ldc*1];
3608 
3609 	CC[0+bs*2] = alpha[0]*CC[0+bs*2] + beta[0]*C[0+ldc*2];
3610 	CC[1+bs*2] = alpha[0]*CC[1+bs*2] + beta[0]*C[1+ldc*2];
3611 	CC[2+bs*2] = alpha[0]*CC[2+bs*2] + beta[0]*C[2+ldc*2];
3612 	CC[3+bs*2] = alpha[0]*CC[3+bs*2] + beta[0]*C[3+ldc*2];
3613 
3614 	CC[0+bs*3] = alpha[0]*CC[0+bs*3] + beta[0]*C[0+ldc*3];
3615 	CC[1+bs*3] = alpha[0]*CC[1+bs*3] + beta[0]*C[1+ldc*3];
3616 	CC[2+bs*3] = alpha[0]*CC[2+bs*3] + beta[0]*C[2+ldc*3];
3617 	CC[3+bs*3] = alpha[0]*CC[3+bs*3] + beta[0]*C[3+ldc*3];
3618 
3619 	double beta1 = 1.0;
3620 
3621 	kernel_dgemm_nn_4x4_lib4ccc(kmax-k, alpha, A, B, ldb, &beta1, CC, bs, D, ldd);
3622 
3623 	return;
3624 
3625 	}
3626 #endif
3627 
3628 
3629 
3630 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
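// variable-size unit-diagonal dtrmm.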
3631 void kernel_dtrmm_nn_rl_one_4x4_vs_lib4ccc(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1)
3632 	{
3633 
3634 	const int bs = 4;
3635 
3636 	double
3637 		a_0, a_1, a_2, a_3,
3638 		b_0, b_1, b_2, b_3;
3639 
3640 #if defined(TARGET_GENERIC)
3641 	double CC[16] = {0};
3642 #else
3643 	ALIGNED( double CC[16], 64 ) = {0};
3644 #endif
3645 
3646 	int k;
3647 
3648 	k = 0;
3649 
3650 	// k = 0
3651 
3652 	a_0 = A[0];
3653 	a_1 = A[1];
3654 	a_2 = A[2];
3655 	a_3 = A[3];
3656 
3657 	CC[0+bs*0] += a_0;
3658 	CC[1+bs*0] += a_1;
3659 	CC[2+bs*0] += a_2;
3660 	CC[3+bs*0] += a_3;
3661 
3662 	A += bs;
3663 	B += 1;
3664 	k += 1;
3665 
3666 	if(k>=kmax)
3667 		goto store;
3668 
3669 	// k = 1
3670 
3671 	a_0 = A[0];
3672 	a_1 = A[1];
3673 	a_2 = A[2];
3674 	a_3 = A[3];
3675 
3676 	b_0 = B[0+0*ldb];
3677 	CC[0+bs*0] += a_0 * b_0;
3678 	CC[1+bs*0] += a_1 * b_0;
3679 	CC[2+bs*0] += a_2 * b_0;
3680 	CC[3+bs*0] += a_3 * b_0;
3681 
3682 	CC[0+bs*1] += a_0;
3683 	CC[1+bs*1] += a_1;
3684 	CC[2+bs*1] += a_2;
3685 	CC[3+bs*1] += a_3;
3686 
3687 	A += bs;
3688 	B += 1;
3689 	k += 1;
3690 
3691 	if(k>=kmax)
3692 		goto store;
3693 
3694 	// k = 2
3695 
3696 	a_0 = A[0];
3697 	a_1 = A[1];
3698 	a_2 = A[2];
3699 	a_3 = A[3];
3700 
3701 	b_0 = B[0+0*ldb];
3702 	CC[0+bs*0] += a_0 * b_0;
3703 	CC[1+bs*0] += a_1 * b_0;
3704 	CC[2+bs*0] += a_2 * b_0;
3705 	CC[3+bs*0] += a_3 * b_0;
3706 
3707 	b_1 = B[0+1*ldb];
3708 	CC[0+bs*1] += a_0 * b_1;
3709 	CC[1+bs*1] += a_1 * b_1;
3710 	CC[2+bs*1] += a_2 * b_1;
3711 	CC[3+bs*1] += a_3 * b_1;
3712 
3713 	CC[0+bs*2] += a_0;
3714 	CC[1+bs*2] += a_1;
3715 	CC[2+bs*2] += a_2;
3716 	CC[3+bs*2] += a_3;
3717 
3718 	A += bs;
3719 	B += 1;
3720 	k += 1;
3721 
3722 	if(k>=kmax)
3723 		goto store;
3724 
3725 	// k = 3
3726 
3727 	a_0 = A[0];
3728 	a_1 = A[1];
3729 	a_2 = A[2];
3730 	a_3 = A[3];
3731 
3732 	b_0 = B[0+0*ldb];
3733 	CC[0+bs*0] += a_0 * b_0;
3734 	CC[1+bs*0] += a_1 * b_0;
3735 	CC[2+bs*0] += a_2 * b_0;
3736 	CC[3+bs*0] += a_3 * b_0;
3737 
3738 	b_1 = B[0+1*ldb];
3739 	CC[0+bs*1] += a_0 * b_1;
3740 	CC[1+bs*1] += a_1 * b_1;
3741 	CC[2+bs*1] += a_2 * b_1;
3742 	CC[3+bs*1] += a_3 * b_1;
3743 
3744 	b_2 = B[0+2*ldb];
3745 	CC[0+bs*2] += a_0 * b_2;
3746 	CC[1+bs*2] += a_1 * b_2;
3747 	CC[2+bs*2] += a_2 * b_2;
3748 	CC[3+bs*2] += a_3 * b_2;
3749 
3750 	CC[0+bs*3] += a_0;
3751 	CC[1+bs*3] += a_1;
3752 	CC[2+bs*3] += a_2;
3753 	CC[3+bs*3] += a_3;
3754 
3755 	A += bs;
3756 	B += 1;
3757 	k += 1;
3758 
3759 	store:
3760 
3761 	CC[0+bs*0] = alpha[0]*CC[0+bs*0] + beta[0]*C[0+ldc*0];
3762 	CC[1+bs*0] = alpha[0]*CC[1+bs*0] + beta[0]*C[1+ldc*0];
3763 	CC[2+bs*0] = alpha[0]*CC[2+bs*0] + beta[0]*C[2+ldc*0];
3764 	CC[3+bs*0] = alpha[0]*CC[3+bs*0] + beta[0]*C[3+ldc*0];
3765 
3766 	CC[0+bs*1] = alpha[0]*CC[0+bs*1] + beta[0]*C[0+ldc*1];
3767 	CC[1+bs*1] = alpha[0]*CC[1+bs*1] + beta[0]*C[1+ldc*1];
3768 	CC[2+bs*1] = alpha[0]*CC[2+bs*1] + beta[0]*C[2+ldc*1];
3769 	CC[3+bs*1] = alpha[0]*CC[3+bs*1] + beta[0]*C[3+ldc*1];
3770 
3771 	CC[0+bs*2] = alpha[0]*CC[0+bs*2] + beta[0]*C[0+ldc*2];
3772 	CC[1+bs*2] = alpha[0]*CC[1+bs*2] + beta[0]*C[1+ldc*2];
3773 	CC[2+bs*2] = alpha[0]*CC[2+bs*2] + beta[0]*C[2+ldc*2];
3774 	CC[3+bs*2] = alpha[0]*CC[3+bs*2] + beta[0]*C[3+ldc*2];
3775 
3776 	CC[0+bs*3] = alpha[0]*CC[0+bs*3] + beta[0]*C[0+ldc*3];
3777 	CC[1+bs*3] = alpha[0]*CC[1+bs*3] + beta[0]*C[1+ldc*3];
3778 	CC[2+bs*3] = alpha[0]*CC[2+bs*3] + beta[0]*C[2+ldc*3];
3779 	CC[3+bs*3] = alpha[0]*CC[3+bs*3] + beta[0]*C[3+ldc*3];
3780 
3781 	double beta1 = 1.0;
3782 
3783 	kernel_dgemm_nn_4x4_lib4ccc(kmax-k, alpha, A, B, ldb, &beta1, CC, bs, CC, bs);
3784 
3785 	if(m1>=4)
3786 		{
3787 		D[0+ldd*0] = CC[0+bs*0];
3788 		D[1+ldd*0] = CC[1+bs*0];
3789 		D[2+ldd*0] = CC[2+bs*0];
3790 		D[3+ldd*0] = CC[3+bs*0];
3791 
3792 		if(n1==1)
3793 			return;
3794 
3795 		D[0+ldd*1] = CC[0+bs*1];
3796 		D[1+ldd*1] = CC[1+bs*1];
3797 		D[2+ldd*1] = CC[2+bs*1];
3798 		D[3+ldd*1] = CC[3+bs*1];
3799 
3800 		if(n1==2)
3801 			return;
3802 
3803 		D[0+ldd*2] = CC[0+bs*2];
3804 		D[1+ldd*2] = CC[1+bs*2];
3805 		D[2+ldd*2] = CC[2+bs*2];
3806 		D[3+ldd*2] = CC[3+bs*2];
3807 
3808 		if(n1==3)
3809 			return;
3810 
3811 		D[0+ldd*3] = CC[0+bs*3];
3812 		D[1+ldd*3] = CC[1+bs*3];
3813 		D[2+ldd*3] = CC[2+bs*3];
3814 		D[3+ldd*3] = CC[3+bs*3];
3815 		}
3816 	else if(m1>=3)
3817 		{
3818 		D[0+ldd*0] = CC[0+bs*0];
3819 		D[1+ldd*0] = CC[1+bs*0];
3820 		D[2+ldd*0] = CC[2+bs*0];
3821 
3822 		if(n1==1)
3823 			return;
3824 
3825 		D[0+ldd*1] = CC[0+bs*1];
3826 		D[1+ldd*1] = CC[1+bs*1];
3827 		D[2+ldd*1] = CC[2+bs*1];
3828 
3829 		if(n1==2)
3830 			return;
3831 
3832 		D[0+ldd*2] = CC[0+bs*2];
3833 		D[1+ldd*2] = CC[1+bs*2];
3834 		D[2+ldd*2] = CC[2+bs*2];
3835 
3836 		if(n1==3)
3837 			return;
3838 
3839 		D[0+ldd*3] = CC[0+bs*3];
3840 		D[1+ldd*3] = CC[1+bs*3];
3841 		D[2+ldd*3] = CC[2+bs*3];
3842 		}
3843 	else if(m1>=2)
3844 		{
3845 		D[0+ldd*0] = CC[0+bs*0];
3846 		D[1+ldd*0] = CC[1+bs*0];
3847 
3848 		if(n1==1)
3849 			return;
3850 
3851 		D[0+ldd*1] = CC[0+bs*1];
3852 		D[1+ldd*1] = CC[1+bs*1];
3853 
3854 		if(n1==2)
3855 			return;
3856 
3857 		D[0+ldd*2] = CC[0+bs*2];
3858 		D[1+ldd*2] = CC[1+bs*2];
3859 
3860 		if(n1==3)
3861 			return;
3862 
3863 		D[0+ldd*3] = CC[0+bs*3];
3864 		D[1+ldd*3] = CC[1+bs*3];
3865 		}
3866 	else //if(m1>=1)
3867 		{
3868 		D[0+ldd*0] = CC[0+bs*0];
3869 
3870 		if(n1==1)
3871 			return;
3872 
3873 		D[0+ldd*1] = CC[0+bs*1];
3874 
3875 		if(n1==2)
3876 			return;
3877 
3878 		D[0+ldd*2] = CC[0+bs*2];
3879 
3880 		if(n1==3)
3881 			return;
3882 
3883 		D[0+ldd*3] = CC[0+bs*3];
3884 		}
3885 
3886 	return;
3887 
3888 	}
3889 #endif
3890 
3891 
3892 
3893 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
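// unit-diagonal dtrmm with transposed store, analogous to
// kernel_dtrmm_nn_rl_4x4_tran_lib4c4c above.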
3894 void kernel_dtrmm_nn_rl_one_4x4_tran_lib4c4c(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd)
3895 	{
3896 
3897 	const int bs = 4;
3898 
3899 	double
3900 		a_0, a_1, a_2, a_3,
3901 		b_0, b_1, b_2, b_3;
3902 
3903 #if defined(TARGET_GENERIC)
3904 	double CC[16] = {0};
3905 #else
3906 	ALIGNED( double CC[16], 64 ) = {0};
3907 #endif
3908 
3909 	int k;
3910 
3911 	k = 0;
3912 
3913 	// k = 0
3914 
3915 	a_0 = A[0];
3916 	a_1 = A[1];
3917 	a_2 = A[2];
3918 	a_3 = A[3];
3919 
3920 	CC[0+bs*0] += a_0;
3921 	CC[1+bs*0] += a_1;
3922 	CC[2+bs*0] += a_2;
3923 	CC[3+bs*0] += a_3;
3924 
3925 	A += bs;
3926 	B += 1;
3927 	k += 1;
3928 
3929 	if(k>=kmax)
3930 		goto store;
3931 
3932 	// k = 1
3933 
3934 	a_0 = A[0];
3935 	a_1 = A[1];
3936 	a_2 = A[2];
3937 	a_3 = A[3];
3938 
3939 	b_0 = B[0+0*ldb];
3940 	CC[0+bs*0] += a_0 * b_0;
3941 	CC[1+bs*0] += a_1 * b_0;
3942 	CC[2+bs*0] += a_2 * b_0;
3943 	CC[3+bs*0] += a_3 * b_0;
3944 
3945 	CC[0+bs*1] += a_0;
3946 	CC[1+bs*1] += a_1;
3947 	CC[2+bs*1] += a_2;
3948 	CC[3+bs*1] += a_3;
3949 
3950 	A += bs;
3951 	B += 1;
3952 	k += 1;
3953 
3954 	if(k>=kmax)
3955 		goto store;
3956 
3957 	// k = 2
3958 
3959 	a_0 = A[0];
3960 	a_1 = A[1];
3961 	a_2 = A[2];
3962 	a_3 = A[3];
3963 
3964 	b_0 = B[0+0*ldb];
3965 	CC[0+bs*0] += a_0 * b_0;
3966 	CC[1+bs*0] += a_1 * b_0;
3967 	CC[2+bs*0] += a_2 * b_0;
3968 	CC[3+bs*0] += a_3 * b_0;
3969 
3970 	b_1 = B[0+1*ldb];
3971 	CC[0+bs*1] += a_0 * b_1;
3972 	CC[1+bs*1] += a_1 * b_1;
3973 	CC[2+bs*1] += a_2 * b_1;
3974 	CC[3+bs*1] += a_3 * b_1;
3975 
3976 	CC[0+bs*2] += a_0;
3977 	CC[1+bs*2] += a_1;
3978 	CC[2+bs*2] += a_2;
3979 	CC[3+bs*2] += a_3;
3980 
3981 	A += bs;
3982 	B += 1;
3983 	k += 1;
3984 
3985 	if(k>=kmax)
3986 		goto store;
3987 
3988 	// k = 3
3989 
3990 	a_0 = A[0];
3991 	a_1 = A[1];
3992 	a_2 = A[2];
3993 	a_3 = A[3];
3994 
3995 	b_0 = B[0+0*ldb];
3996 	CC[0+bs*0] += a_0 * b_0;
3997 	CC[1+bs*0] += a_1 * b_0;
3998 	CC[2+bs*0] += a_2 * b_0;
3999 	CC[3+bs*0] += a_3 * b_0;
4000 
4001 	b_1 = B[0+1*ldb];
4002 	CC[0+bs*1] += a_0 * b_1;
4003 	CC[1+bs*1] += a_1 * b_1;
4004 	CC[2+bs*1] += a_2 * b_1;
4005 	CC[3+bs*1] += a_3 * b_1;
4006 
4007 	b_2 = B[0+2*ldb];
4008 	CC[0+bs*2] += a_0 * b_2;
4009 	CC[1+bs*2] += a_1 * b_2;
4010 	CC[2+bs*2] += a_2 * b_2;
4011 	CC[3+bs*2] += a_3 * b_2;
4012 
4013 	CC[0+bs*3] += a_0;
4014 	CC[1+bs*3] += a_1;
4015 	CC[2+bs*3] += a_2;
4016 	CC[3+bs*3] += a_3;
4017 
4018 	A += bs;
4019 	B += 1;
4020 	k += 1;
4021 
4022 	double alpha1 = 1.0;
4023 	double beta1 = 1.0;
4024 
4025 	kernel_dgemm_nn_4x4_lib4ccc(kmax-k, &alpha1, A, B, ldb, &beta1, CC, bs, CC, bs);
4026 
4027 	store:
4028 
4029 	// scale & transpose & store
4030 	D[0+ldd*0] = alpha[0]*CC[0+bs*0] + beta[0]*C[0+bs*0];
4031 	D[0+ldd*1] = alpha[0]*CC[1+bs*0] + beta[0]*C[1+bs*0];
4032 	D[0+ldd*2] = alpha[0]*CC[2+bs*0] + beta[0]*C[2+bs*0];
4033 	D[0+ldd*3] = alpha[0]*CC[3+bs*0] + beta[0]*C[3+bs*0];
4034 
4035 	D[1+ldd*0] = alpha[0]*CC[0+bs*1] + beta[0]*C[0+bs*1];
4036 	D[1+ldd*1] = alpha[0]*CC[1+bs*1] + beta[0]*C[1+bs*1];
4037 	D[1+ldd*2] = alpha[0]*CC[2+bs*1] + beta[0]*C[2+bs*1];
4038 	D[1+ldd*3] = alpha[0]*CC[3+bs*1] + beta[0]*C[3+bs*1];
4039 
4040 	D[2+ldd*0] = alpha[0]*CC[0+bs*2] + beta[0]*C[0+bs*2];
4041 	D[2+ldd*1] = alpha[0]*CC[1+bs*2] + beta[0]*C[1+bs*2];
4042 	D[2+ldd*2] = alpha[0]*CC[2+bs*2] + beta[0]*C[2+bs*2];
4043 	D[2+ldd*3] = alpha[0]*CC[3+bs*2] + beta[0]*C[3+bs*2];
4044 
4045 	D[3+ldd*0] = alpha[0]*CC[0+bs*3] + beta[0]*C[0+bs*3];
4046 	D[3+ldd*1] = alpha[0]*CC[1+bs*3] + beta[0]*C[1+bs*3];
4047 	D[3+ldd*2] = alpha[0]*CC[2+bs*3] + beta[0]*C[2+bs*3];
4048 	D[3+ldd*3] = alpha[0]*CC[3+bs*3] + beta[0]*C[3+bs*3];
4049 
4050 	return;
4051 
4052 	}
4053 #endif
4054 
4055 
4056 
4057 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
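// variable-size variant of the kernel above: same computation, but CC is scaled,
// transposed in place, and only the leading m1 x n1 block is stored into D.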
4058 void kernel_dtrmm_nn_rl_one_4x4_tran_vs_lib4c4c(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd, int m1, int n1)
4059 	{
4060 
4061 	const int bs = 4;
4062 
4063 	double
4064 		tmp,
4065 		a_0, a_1, a_2, a_3,
4066 		b_0, b_1, b_2, b_3;
4067 
4068 #if defined(TARGET_GENERIC)
4069 	double CC[16] = {0};
4070 #else
4071 	ALIGNED( double CC[16], 64 ) = {0};
4072 #endif
4073 
4074 	int k;
4075 
4076 	k = 0;
4077 
4078 	// k = 0
4079 
4080 	a_0 = A[0];
4081 	a_1 = A[1];
4082 	a_2 = A[2];
4083 	a_3 = A[3];
4084 
4085 	CC[0+bs*0] += a_0;
4086 	CC[1+bs*0] += a_1;
4087 	CC[2+bs*0] += a_2;
4088 	CC[3+bs*0] += a_3;
4089 
4090 	A += bs;
4091 	B += 1;
4092 	k += 1;
4093 
4094 	if(k>=kmax)
4095 		goto store;
4096 
4097 	// k = 1
4098 
4099 	a_0 = A[0];
4100 	a_1 = A[1];
4101 	a_2 = A[2];
4102 	a_3 = A[3];
4103 
4104 	b_0 = B[0+0*ldb];
4105 	CC[0+bs*0] += a_0 * b_0;
4106 	CC[1+bs*0] += a_1 * b_0;
4107 	CC[2+bs*0] += a_2 * b_0;
4108 	CC[3+bs*0] += a_3 * b_0;
4109 
4110 	CC[0+bs*1] += a_0;
4111 	CC[1+bs*1] += a_1;
4112 	CC[2+bs*1] += a_2;
4113 	CC[3+bs*1] += a_3;
4114 
4115 	A += bs;
4116 	B += 1;
4117 	k += 1;
4118 
4119 	if(k>=kmax)
4120 		goto store;
4121 
4122 	// k = 2
4123 
4124 	a_0 = A[0];
4125 	a_1 = A[1];
4126 	a_2 = A[2];
4127 	a_3 = A[3];
4128 
4129 	b_0 = B[0+0*ldb];
4130 	CC[0+bs*0] += a_0 * b_0;
4131 	CC[1+bs*0] += a_1 * b_0;
4132 	CC[2+bs*0] += a_2 * b_0;
4133 	CC[3+bs*0] += a_3 * b_0;
4134 
4135 	b_1 = B[0+1*ldb];
4136 	CC[0+bs*1] += a_0 * b_1;
4137 	CC[1+bs*1] += a_1 * b_1;
4138 	CC[2+bs*1] += a_2 * b_1;
4139 	CC[3+bs*1] += a_3 * b_1;
4140 
4141 	CC[0+bs*2] += a_0;
4142 	CC[1+bs*2] += a_1;
4143 	CC[2+bs*2] += a_2;
4144 	CC[3+bs*2] += a_3;
4145 
4146 	A += bs;
4147 	B += 1;
4148 	k += 1;
4149 
4150 	if(k>=kmax)
4151 		goto store;
4152 
4153 	// k = 3
4154 
4155 	a_0 = A[0];
4156 	a_1 = A[1];
4157 	a_2 = A[2];
4158 	a_3 = A[3];
4159 
4160 	b_0 = B[0+0*ldb];
4161 	CC[0+bs*0] += a_0 * b_0;
4162 	CC[1+bs*0] += a_1 * b_0;
4163 	CC[2+bs*0] += a_2 * b_0;
4164 	CC[3+bs*0] += a_3 * b_0;
4165 
4166 	b_1 = B[0+1*ldb];
4167 	CC[0+bs*1] += a_0 * b_1;
4168 	CC[1+bs*1] += a_1 * b_1;
4169 	CC[2+bs*1] += a_2 * b_1;
4170 	CC[3+bs*1] += a_3 * b_1;
4171 
4172 	b_2 = B[0+2*ldb];
4173 	CC[0+bs*2] += a_0 * b_2;
4174 	CC[1+bs*2] += a_1 * b_2;
4175 	CC[2+bs*2] += a_2 * b_2;
4176 	CC[3+bs*2] += a_3 * b_2;
4177 
4178 	CC[0+bs*3] += a_0;
4179 	CC[1+bs*3] += a_1;
4180 	CC[2+bs*3] += a_2;
4181 	CC[3+bs*3] += a_3;
4182 
4183 	A += bs;
4184 	B += 1;
4185 	k += 1;
4186 
4187 	double alpha1 = 1.0;
4188 	double beta1 = 1.0;
4189 
4190 	kernel_dgemm_nn_4x4_lib4ccc(kmax-k, &alpha1, A, B, ldb, &beta1, CC, bs, CC, bs);
4191 
4192 	store:
4193 
4194 	// scale
4195 	CC[0+bs*0] = alpha[0]*CC[0+bs*0] + beta[0]*C[0+bs*0];
4196 	CC[1+bs*0] = alpha[0]*CC[1+bs*0] + beta[0]*C[1+bs*0];
4197 	CC[2+bs*0] = alpha[0]*CC[2+bs*0] + beta[0]*C[2+bs*0];
4198 	CC[3+bs*0] = alpha[0]*CC[3+bs*0] + beta[0]*C[3+bs*0];
4199 
4200 	CC[0+bs*1] = alpha[0]*CC[0+bs*1] + beta[0]*C[0+bs*1];
4201 	CC[1+bs*1] = alpha[0]*CC[1+bs*1] + beta[0]*C[1+bs*1];
4202 	CC[2+bs*1] = alpha[0]*CC[2+bs*1] + beta[0]*C[2+bs*1];
4203 	CC[3+bs*1] = alpha[0]*CC[3+bs*1] + beta[0]*C[3+bs*1];
4204 
4205 	CC[0+bs*2] = alpha[0]*CC[0+bs*2] + beta[0]*C[0+bs*2];
4206 	CC[1+bs*2] = alpha[0]*CC[1+bs*2] + beta[0]*C[1+bs*2];
4207 	CC[2+bs*2] = alpha[0]*CC[2+bs*2] + beta[0]*C[2+bs*2];
4208 	CC[3+bs*2] = alpha[0]*CC[3+bs*2] + beta[0]*C[3+bs*2];
4209 
4210 	CC[0+bs*3] = alpha[0]*CC[0+bs*3] + beta[0]*C[0+bs*3];
4211 	CC[1+bs*3] = alpha[0]*CC[1+bs*3] + beta[0]*C[1+bs*3];
4212 	CC[2+bs*3] = alpha[0]*CC[2+bs*3] + beta[0]*C[2+bs*3];
4213 	CC[3+bs*3] = alpha[0]*CC[3+bs*3] + beta[0]*C[3+bs*3];
4214 
4215 	// transpose
4216 	tmp = CC[1+bs*0]; CC[1+bs*0] = CC[0+bs*1]; CC[0+bs*1] = tmp;
4217 	tmp = CC[2+bs*0]; CC[2+bs*0] = CC[0+bs*2]; CC[0+bs*2] = tmp;
4218 	tmp = CC[3+bs*0]; CC[3+bs*0] = CC[0+bs*3]; CC[0+bs*3] = tmp;
4219 
4220 	tmp = CC[2+bs*1]; CC[2+bs*1] = CC[1+bs*2]; CC[1+bs*2] = tmp;
4221 	tmp = CC[3+bs*1]; CC[3+bs*1] = CC[1+bs*3]; CC[1+bs*3] = tmp;
4222 
4223 	tmp = CC[3+bs*2]; CC[3+bs*2] = CC[2+bs*3]; CC[2+bs*3] = tmp;
4224 
4225 	// store
4226 	if(m1>=4)
4227 		{
4228 		D[0+ldd*0] = CC[0+bs*0];
4229 		D[1+ldd*0] = CC[1+bs*0];
4230 		D[2+ldd*0] = CC[2+bs*0];
4231 		D[3+ldd*0] = CC[3+bs*0];
4232 
4233 		if(n1==1)
4234 			return;
4235 
4236 		D[0+ldd*1] = CC[0+bs*1];
4237 		D[1+ldd*1] = CC[1+bs*1];
4238 		D[2+ldd*1] = CC[2+bs*1];
4239 		D[3+ldd*1] = CC[3+bs*1];
4240 
4241 		if(n1==2)
4242 			return;
4243 
4244 		D[0+ldd*2] = CC[0+bs*2];
4245 		D[1+ldd*2] = CC[1+bs*2];
4246 		D[2+ldd*2] = CC[2+bs*2];
4247 		D[3+ldd*2] = CC[3+bs*2];
4248 
4249 		if(n1==3)
4250 			return;
4251 
4252 		D[0+ldd*3] = CC[0+bs*3];
4253 		D[1+ldd*3] = CC[1+bs*3];
4254 		D[2+ldd*3] = CC[2+bs*3];
4255 		D[3+ldd*3] = CC[3+bs*3];
4256 		}
4257 	else if(m1>=3)
4258 		{
4259 		D[0+ldd*0] = CC[0+bs*0];
4260 		D[1+ldd*0] = CC[1+bs*0];
4261 		D[2+ldd*0] = CC[2+bs*0];
4262 
4263 		if(n1==1)
4264 			return;
4265 
4266 		D[0+ldd*1] = CC[0+bs*1];
4267 		D[1+ldd*1] = CC[1+bs*1];
4268 		D[2+ldd*1] = CC[2+bs*1];
4269 
4270 		if(n1==2)
4271 			return;
4272 
4273 		D[0+ldd*2] = CC[0+bs*2];
4274 		D[1+ldd*2] = CC[1+bs*2];
4275 		D[2+ldd*2] = CC[2+bs*2];
4276 
4277 		if(n1==3)
4278 			return;
4279 
4280 		D[0+ldd*3] = CC[0+bs*3];
4281 		D[1+ldd*3] = CC[1+bs*3];
4282 		D[2+ldd*3] = CC[2+bs*3];
4283 		}
4284 	else if(m1>=2)
4285 		{
4286 		D[0+ldd*0] = CC[0+bs*0];
4287 		D[1+ldd*0] = CC[1+bs*0];
4288 
4289 		if(n1==1)
4290 			return;
4291 
4292 		D[0+ldd*1] = CC[0+bs*1];
4293 		D[1+ldd*1] = CC[1+bs*1];
4294 
4295 		if(n1==2)
4296 			return;
4297 
4298 		D[0+ldd*2] = CC[0+bs*2];
4299 		D[1+ldd*2] = CC[1+bs*2];
4300 
4301 		if(n1==3)
4302 			return;
4303 
4304 		D[0+ldd*3] = CC[0+bs*3];
4305 		D[1+ldd*3] = CC[1+bs*3];
4306 		}
4307 	else //if(m1>=1)
4308 		{
4309 		D[0+ldd*0] = CC[0+bs*0];
4310 
4311 		if(n1==1)
4312 			return;
4313 
4314 		D[0+ldd*1] = CC[0+bs*1];
4315 
4316 		if(n1==2)
4317 			return;
4318 
4319 		D[0+ldd*2] = CC[0+bs*2];
4320 
4321 		if(n1==3)
4322 			return;
4323 
4324 		D[0+ldd*3] = CC[0+bs*3];
4325 		}
4326 
4327 	return;
4328 
4329 	}
4330 #endif
4331 
4332 
4333 
4334 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
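// computes CC = A*B + beta*C via the dense gemm kernel over the rectangular part of B
// (first kmax rows), adds the contribution of the trailing 4x4 upper-triangular block of B
// (non-unit diagonal), then stores D = alpha*CC with leading dimension ldd.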
4335 void kernel_dtrmm_nn_ru_4x4_lib4ccc(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd)
4336 	{
4337 
4338 	const int bs = 4;
4339 
4340 	double
4341 		a_0, a_1, a_2, a_3,
4342 		b_0, b_1, b_2, b_3;
4343 
4344 #if defined(TARGET_GENERIC)
4345 	double CC[16] = {0};
4346 #else
4347 	ALIGNED( double CC[16], 64 ) = {0};
4348 #endif
4349 
4350 	double alpha1 = 1.0;
4351 
4352 	// assume always kmax>=4 !!!
4353 
4354 	kernel_dgemm_nn_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, ldc, CC, bs);
4355 
4356 	A += kmax*bs;
4357 	B += kmax;
4358 
4359 	// k = 0
4360 
4361 	a_0 = A[0];
4362 	a_1 = A[1];
4363 	a_2 = A[2];
4364 	a_3 = A[3];
4365 
4366 	b_0 = B[0+0*ldb];
4367 	CC[0+bs*0] += a_0 * b_0;
4368 	CC[1+bs*0] += a_1 * b_0;
4369 	CC[2+bs*0] += a_2 * b_0;
4370 	CC[3+bs*0] += a_3 * b_0;
4371 
4372 	b_0 = B[0+1*ldb];
4373 	CC[0+bs*1] += a_0 * b_0;
4374 	CC[1+bs*1] += a_1 * b_0;
4375 	CC[2+bs*1] += a_2 * b_0;
4376 	CC[3+bs*1] += a_3 * b_0;
4377 
4378 	b_0 = B[0+2*ldb];
4379 	CC[0+bs*2] += a_0 * b_0;
4380 	CC[1+bs*2] += a_1 * b_0;
4381 	CC[2+bs*2] += a_2 * b_0;
4382 	CC[3+bs*2] += a_3 * b_0;
4383 
4384 	b_0 = B[0+3*ldb];
4385 	CC[0+bs*3] += a_0 * b_0;
4386 	CC[1+bs*3] += a_1 * b_0;
4387 	CC[2+bs*3] += a_2 * b_0;
4388 	CC[3+bs*3] += a_3 * b_0;
4389 
4390 	A += bs;
4391 	B += 1;
4392 //	k += 1;
4393 
4394 	// k = 1
4395 
4396 	a_0 = A[0];
4397 	a_1 = A[1];
4398 	a_2 = A[2];
4399 	a_3 = A[3];
4400 
4401 	b_0 = B[0+1*ldb];
4402 	CC[0+bs*1] += a_0 * b_0;
4403 	CC[1+bs*1] += a_1 * b_0;
4404 	CC[2+bs*1] += a_2 * b_0;
4405 	CC[3+bs*1] += a_3 * b_0;
4406 
4407 	b_0 = B[0+2*ldb];
4408 	CC[0+bs*2] += a_0 * b_0;
4409 	CC[1+bs*2] += a_1 * b_0;
4410 	CC[2+bs*2] += a_2 * b_0;
4411 	CC[3+bs*2] += a_3 * b_0;
4412 
4413 	b_0 = B[0+3*ldb];
4414 	CC[0+bs*3] += a_0 * b_0;
4415 	CC[1+bs*3] += a_1 * b_0;
4416 	CC[2+bs*3] += a_2 * b_0;
4417 	CC[3+bs*3] += a_3 * b_0;
4418 
4419 	A += bs;
4420 	B += 1;
4421 //	k += 1;
4422 
4423 	// k = 2
4424 
4425 	a_0 = A[0];
4426 	a_1 = A[1];
4427 	a_2 = A[2];
4428 	a_3 = A[3];
4429 
4430 	b_0 = B[0+2*ldb];
4431 	CC[0+bs*2] += a_0 * b_0;
4432 	CC[1+bs*2] += a_1 * b_0;
4433 	CC[2+bs*2] += a_2 * b_0;
4434 	CC[3+bs*2] += a_3 * b_0;
4435 
4436 	b_0 = B[0+3*ldb];
4437 	CC[0+bs*3] += a_0 * b_0;
4438 	CC[1+bs*3] += a_1 * b_0;
4439 	CC[2+bs*3] += a_2 * b_0;
4440 	CC[3+bs*3] += a_3 * b_0;
4441 
4442 	A += bs;
4443 	B += 1;
4444 //	k += 1;
4445 
4446 	// k = 3
4447 
4448 	a_0 = A[0];
4449 	a_1 = A[1];
4450 	a_2 = A[2];
4451 	a_3 = A[3];
4452 
4453 	b_0 = B[0+3*ldb];
4454 	CC[0+bs*3] += a_0 * b_0;
4455 	CC[1+bs*3] += a_1 * b_0;
4456 	CC[2+bs*3] += a_2 * b_0;
4457 	CC[3+bs*3] += a_3 * b_0;
4458 
4459 	A += bs;
4460 	B += 1;
4461 //	k += 1;
4462 
4463 	store:
4464 
4465 	D[0+ldd*0] = alpha[0]*CC[0+bs*0];
4466 	D[1+ldd*0] = alpha[0]*CC[1+bs*0];
4467 	D[2+ldd*0] = alpha[0]*CC[2+bs*0];
4468 	D[3+ldd*0] = alpha[0]*CC[3+bs*0];
4469 
4470 	D[0+ldd*1] = alpha[0]*CC[0+bs*1];
4471 	D[1+ldd*1] = alpha[0]*CC[1+bs*1];
4472 	D[2+ldd*1] = alpha[0]*CC[2+bs*1];
4473 	D[3+ldd*1] = alpha[0]*CC[3+bs*1];
4474 
4475 	D[0+ldd*2] = alpha[0]*CC[0+bs*2];
4476 	D[1+ldd*2] = alpha[0]*CC[1+bs*2];
4477 	D[2+ldd*2] = alpha[0]*CC[2+bs*2];
4478 	D[3+ldd*2] = alpha[0]*CC[3+bs*2];
4479 
4480 	D[0+ldd*3] = alpha[0]*CC[0+bs*3];
4481 	D[1+ldd*3] = alpha[0]*CC[1+bs*3];
4482 	D[2+ldd*3] = alpha[0]*CC[2+bs*3];
4483 	D[3+ldd*3] = alpha[0]*CC[3+bs*3];
4484 
4485 	return;
4486 
4487 	}
4488 #endif
4489 
4490 
4491 
4492 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
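// variable-size variant: the unrolled triangular part skips columns beyond n1,
// and only the leading m1 x n1 block of alpha*CC is stored into D.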
4493 void kernel_dtrmm_nn_ru_4x4_vs_lib4ccc(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1)
4494 	{
4495 
4496 	const int bs = 4;
4497 
4498 	double
4499 		a_0, a_1, a_2, a_3,
4500 		b_0, b_1, b_2, b_3;
4501 
4502 #if defined(TARGET_GENERIC)
4503 	double CC[16] = {0};
4504 #else
4505 	ALIGNED( double CC[16], 64 ) = {0};
4506 #endif
4507 
4508 	double alpha1 = 1.0;
4509 
4510 	kernel_dgemm_nn_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, ldc, CC, bs);
4511 
4512 	A += kmax*bs;
4513 	B += kmax;
4514 
4515 	if(n1>=4)
4516 		{
4517 
4518 		// k = 0
4519 
4520 		a_0 = A[0];
4521 		a_1 = A[1];
4522 		a_2 = A[2];
4523 		a_3 = A[3];
4524 
4525 		b_0 = B[0+0*ldb];
4526 		CC[0+bs*0] += a_0 * b_0;
4527 		CC[1+bs*0] += a_1 * b_0;
4528 		CC[2+bs*0] += a_2 * b_0;
4529 		CC[3+bs*0] += a_3 * b_0;
4530 
4531 		b_0 = B[0+1*ldb];
4532 		CC[0+bs*1] += a_0 * b_0;
4533 		CC[1+bs*1] += a_1 * b_0;
4534 		CC[2+bs*1] += a_2 * b_0;
4535 		CC[3+bs*1] += a_3 * b_0;
4536 
4537 		b_0 = B[0+2*ldb];
4538 		CC[0+bs*2] += a_0 * b_0;
4539 		CC[1+bs*2] += a_1 * b_0;
4540 		CC[2+bs*2] += a_2 * b_0;
4541 		CC[3+bs*2] += a_3 * b_0;
4542 
4543 		b_0 = B[0+3*ldb];
4544 		CC[0+bs*3] += a_0 * b_0;
4545 		CC[1+bs*3] += a_1 * b_0;
4546 		CC[2+bs*3] += a_2 * b_0;
4547 		CC[3+bs*3] += a_3 * b_0;
4548 
4549 		A += bs;
4550 		B += 1;
4551 //		k += 1;
4552 
4553 		// k = 1
4554 
4555 		a_0 = A[0];
4556 		a_1 = A[1];
4557 		a_2 = A[2];
4558 		a_3 = A[3];
4559 
4560 		b_0 = B[0+1*ldb];
4561 		CC[0+bs*1] += a_0 * b_0;
4562 		CC[1+bs*1] += a_1 * b_0;
4563 		CC[2+bs*1] += a_2 * b_0;
4564 		CC[3+bs*1] += a_3 * b_0;
4565 
4566 		b_0 = B[0+2*ldb];
4567 		CC[0+bs*2] += a_0 * b_0;
4568 		CC[1+bs*2] += a_1 * b_0;
4569 		CC[2+bs*2] += a_2 * b_0;
4570 		CC[3+bs*2] += a_3 * b_0;
4571 
4572 		b_0 = B[0+3*ldb];
4573 		CC[0+bs*3] += a_0 * b_0;
4574 		CC[1+bs*3] += a_1 * b_0;
4575 		CC[2+bs*3] += a_2 * b_0;
4576 		CC[3+bs*3] += a_3 * b_0;
4577 
4578 		A += bs;
4579 		B += 1;
4580 //		k += 1;
4581 
4582 		// k = 2
4583 
4584 		a_0 = A[0];
4585 		a_1 = A[1];
4586 		a_2 = A[2];
4587 		a_3 = A[3];
4588 
4589 		b_0 = B[0+2*ldb];
4590 		CC[0+bs*2] += a_0 * b_0;
4591 		CC[1+bs*2] += a_1 * b_0;
4592 		CC[2+bs*2] += a_2 * b_0;
4593 		CC[3+bs*2] += a_3 * b_0;
4594 
4595 		b_0 = B[0+3*ldb];
4596 		CC[0+bs*3] += a_0 * b_0;
4597 		CC[1+bs*3] += a_1 * b_0;
4598 		CC[2+bs*3] += a_2 * b_0;
4599 		CC[3+bs*3] += a_3 * b_0;
4600 
4601 		A += bs;
4602 		B += 1;
4603 //		k += 1;
4604 
4605 		// k = 3
4606 
4607 		a_0 = A[0];
4608 		a_1 = A[1];
4609 		a_2 = A[2];
4610 		a_3 = A[3];
4611 
4612 		b_0 = B[0+3*ldb];
4613 		CC[0+bs*3] += a_0 * b_0;
4614 		CC[1+bs*3] += a_1 * b_0;
4615 		CC[2+bs*3] += a_2 * b_0;
4616 		CC[3+bs*3] += a_3 * b_0;
4617 
4618 		A += bs;
4619 		B += 1;
4620 //		k += 1;
4621 
4622 		}
4623 	else if(n1==3)
4624 		{
4625 
4626 		// k = 0
4627 
4628 		a_0 = A[0];
4629 		a_1 = A[1];
4630 		a_2 = A[2];
4631 		a_3 = A[3];
4632 
4633 		b_0 = B[0+0*ldb];
4634 		CC[0+bs*0] += a_0 * b_0;
4635 		CC[1+bs*0] += a_1 * b_0;
4636 		CC[2+bs*0] += a_2 * b_0;
4637 		CC[3+bs*0] += a_3 * b_0;
4638 
4639 		b_0 = B[0+1*ldb];
4640 		CC[0+bs*1] += a_0 * b_0;
4641 		CC[1+bs*1] += a_1 * b_0;
4642 		CC[2+bs*1] += a_2 * b_0;
4643 		CC[3+bs*1] += a_3 * b_0;
4644 
4645 		b_0 = B[0+2*ldb];
4646 		CC[0+bs*2] += a_0 * b_0;
4647 		CC[1+bs*2] += a_1 * b_0;
4648 		CC[2+bs*2] += a_2 * b_0;
4649 		CC[3+bs*2] += a_3 * b_0;
4650 
4651 		A += bs;
4652 		B += 1;
4653 //		k += 1;
4654 
4655 		// k = 1
4656 
4657 		a_0 = A[0];
4658 		a_1 = A[1];
4659 		a_2 = A[2];
4660 		a_3 = A[3];
4661 
4662 		b_0 = B[0+1*ldb];
4663 		CC[0+bs*1] += a_0 * b_0;
4664 		CC[1+bs*1] += a_1 * b_0;
4665 		CC[2+bs*1] += a_2 * b_0;
4666 		CC[3+bs*1] += a_3 * b_0;
4667 
4668 		b_0 = B[0+2*ldb];
4669 		CC[0+bs*2] += a_0 * b_0;
4670 		CC[1+bs*2] += a_1 * b_0;
4671 		CC[2+bs*2] += a_2 * b_0;
4672 		CC[3+bs*2] += a_3 * b_0;
4673 
4674 		A += bs;
4675 		B += 1;
4676 //		k += 1;
4677 
4678 		// k = 2
4679 
4680 		a_0 = A[0];
4681 		a_1 = A[1];
4682 		a_2 = A[2];
4683 		a_3 = A[3];
4684 
4685 		b_0 = B[0+2*ldb];
4686 		CC[0+bs*2] += a_0 * b_0;
4687 		CC[1+bs*2] += a_1 * b_0;
4688 		CC[2+bs*2] += a_2 * b_0;
4689 		CC[3+bs*2] += a_3 * b_0;
4690 
4691 		A += bs;
4692 		B += 1;
4693 //		k += 1;
4694 
4695 		// k = 3
4696 
4697 		A += bs;
4698 		B += 1;
4699 //		k += 1;
4700 
4701 		}
4702 	else if(n1==2)
4703 		{
4704 
4705 		// k = 0
4706 
4707 		a_0 = A[0];
4708 		a_1 = A[1];
4709 		a_2 = A[2];
4710 		a_3 = A[3];
4711 
4712 		b_0 = B[0+0*ldb];
4713 		CC[0+bs*0] += a_0 * b_0;
4714 		CC[1+bs*0] += a_1 * b_0;
4715 		CC[2+bs*0] += a_2 * b_0;
4716 		CC[3+bs*0] += a_3 * b_0;
4717 
4718 		b_0 = B[0+1*ldb];
4719 		CC[0+bs*1] += a_0 * b_0;
4720 		CC[1+bs*1] += a_1 * b_0;
4721 		CC[2+bs*1] += a_2 * b_0;
4722 		CC[3+bs*1] += a_3 * b_0;
4723 
4724 		A += bs;
4725 		B += 1;
4726 //		k += 1;
4727 
4728 		// k = 1
4729 
4730 		a_0 = A[0];
4731 		a_1 = A[1];
4732 		a_2 = A[2];
4733 		a_3 = A[3];
4734 
4735 		b_0 = B[0+1*ldb];
4736 		CC[0+bs*1] += a_0 * b_0;
4737 		CC[1+bs*1] += a_1 * b_0;
4738 		CC[2+bs*1] += a_2 * b_0;
4739 		CC[3+bs*1] += a_3 * b_0;
4740 
4741 		A += bs;
4742 		B += 1;
4743 //		k += 1;
4744 
4745 		// k = 2
4746 
4747 		A += bs;
4748 		B += 1;
4749 //		k += 1;
4750 
4751 		// k = 3
4752 
4753 		A += bs;
4754 		B += 1;
4755 //		k += 1;
4756 
4757 		}
4758 	else if(n1==1)
4759 		{
4760 
4761 		// k = 0
4762 
4763 		a_0 = A[0];
4764 		a_1 = A[1];
4765 		a_2 = A[2];
4766 		a_3 = A[3];
4767 
4768 		b_0 = B[0+0*ldb];
4769 		CC[0+bs*0] += a_0 * b_0;
4770 		CC[1+bs*0] += a_1 * b_0;
4771 		CC[2+bs*0] += a_2 * b_0;
4772 		CC[3+bs*0] += a_3 * b_0;
4773 
4774 		A += bs;
4775 		B += 1;
4776 //		k += 1;
4777 
4778 		// k = 1
4779 
4780 		A += bs;
4781 		B += 1;
4782 //		k += 1;
4783 
4784 		// k = 2
4785 
4786 		A += bs;
4787 		B += 1;
4788 //		k += 1;
4789 
4790 		// k = 3
4791 
4792 		A += bs;
4793 		B += 1;
4794 //		k += 1;
4795 
4796 		}
4797 
4798 	store:
4799 
4800 	CC[0+bs*0] = alpha[0]*CC[0+bs*0];
4801 	CC[1+bs*0] = alpha[0]*CC[1+bs*0];
4802 	CC[2+bs*0] = alpha[0]*CC[2+bs*0];
4803 	CC[3+bs*0] = alpha[0]*CC[3+bs*0];
4804 
4805 	CC[0+bs*1] = alpha[0]*CC[0+bs*1];
4806 	CC[1+bs*1] = alpha[0]*CC[1+bs*1];
4807 	CC[2+bs*1] = alpha[0]*CC[2+bs*1];
4808 	CC[3+bs*1] = alpha[0]*CC[3+bs*1];
4809 
4810 	CC[0+bs*2] = alpha[0]*CC[0+bs*2];
4811 	CC[1+bs*2] = alpha[0]*CC[1+bs*2];
4812 	CC[2+bs*2] = alpha[0]*CC[2+bs*2];
4813 	CC[3+bs*2] = alpha[0]*CC[3+bs*2];
4814 
4815 	CC[0+bs*3] = alpha[0]*CC[0+bs*3];
4816 	CC[1+bs*3] = alpha[0]*CC[1+bs*3];
4817 	CC[2+bs*3] = alpha[0]*CC[2+bs*3];
4818 	CC[3+bs*3] = alpha[0]*CC[3+bs*3];
4819 
4820 	if(m1>=4)
4821 		{
4822 		D[0+ldd*0] = CC[0+bs*0];
4823 		D[1+ldd*0] = CC[1+bs*0];
4824 		D[2+ldd*0] = CC[2+bs*0];
4825 		D[3+ldd*0] = CC[3+bs*0];
4826 
4827 		if(n1==1)
4828 			return;
4829 
4830 		D[0+ldd*1] = CC[0+bs*1];
4831 		D[1+ldd*1] = CC[1+bs*1];
4832 		D[2+ldd*1] = CC[2+bs*1];
4833 		D[3+ldd*1] = CC[3+bs*1];
4834 
4835 		if(n1==2)
4836 			return;
4837 
4838 		D[0+ldd*2] = CC[0+bs*2];
4839 		D[1+ldd*2] = CC[1+bs*2];
4840 		D[2+ldd*2] = CC[2+bs*2];
4841 		D[3+ldd*2] = CC[3+bs*2];
4842 
4843 		if(n1==3)
4844 			return;
4845 
4846 		D[0+ldd*3] = CC[0+bs*3];
4847 		D[1+ldd*3] = CC[1+bs*3];
4848 		D[2+ldd*3] = CC[2+bs*3];
4849 		D[3+ldd*3] = CC[3+bs*3];
4850 		}
4851 	else if(m1>=3)
4852 		{
4853 		D[0+ldd*0] = CC[0+bs*0];
4854 		D[1+ldd*0] = CC[1+bs*0];
4855 		D[2+ldd*0] = CC[2+bs*0];
4856 
4857 		if(n1==1)
4858 			return;
4859 
4860 		D[0+ldd*1] = CC[0+bs*1];
4861 		D[1+ldd*1] = CC[1+bs*1];
4862 		D[2+ldd*1] = CC[2+bs*1];
4863 
4864 		if(n1==2)
4865 			return;
4866 
4867 		D[0+ldd*2] = CC[0+bs*2];
4868 		D[1+ldd*2] = CC[1+bs*2];
4869 		D[2+ldd*2] = CC[2+bs*2];
4870 
4871 		if(n1==3)
4872 			return;
4873 
4874 		D[0+ldd*3] = CC[0+bs*3];
4875 		D[1+ldd*3] = CC[1+bs*3];
4876 		D[2+ldd*3] = CC[2+bs*3];
4877 		}
4878 	else if(m1>=2)
4879 		{
4880 		D[0+ldd*0] = CC[0+bs*0];
4881 		D[1+ldd*0] = CC[1+bs*0];
4882 
4883 		if(n1==1)
4884 			return;
4885 
4886 		D[0+ldd*1] = CC[0+bs*1];
4887 		D[1+ldd*1] = CC[1+bs*1];
4888 
4889 		if(n1==2)
4890 			return;
4891 
4892 		D[0+ldd*2] = CC[0+bs*2];
4893 		D[1+ldd*2] = CC[1+bs*2];
4894 
4895 		if(n1==3)
4896 			return;
4897 
4898 		D[0+ldd*3] = CC[0+bs*3];
4899 		D[1+ldd*3] = CC[1+bs*3];
4900 		}
4901 	else //if(m1>=1)
4902 		{
4903 		D[0+ldd*0] = CC[0+bs*0];
4904 
4905 		if(n1==1)
4906 			return;
4907 
4908 		D[0+ldd*1] = CC[0+bs*1];
4909 
4910 		if(n1==2)
4911 			return;
4912 
4913 		D[0+ldd*2] = CC[0+bs*2];
4914 
4915 		if(n1==3)
4916 			return;
4917 
4918 		D[0+ldd*3] = CC[0+bs*3];
4919 		}
4920 
4921 	return;
4922 
4923 	}
4924 #endif
4925 
4926 
4927 
4928 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
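// transposed-store variant: same computation as kernel_dtrmm_nn_ru_4x4_lib4ccc, but C is
// read with fixed leading dimension bs=4 and D receives the transposed block, scaled by alpha.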
4929 void kernel_dtrmm_nn_ru_4x4_tran_lib4c4c(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd)
4930 	{
4931 
4932 	const int bs = 4;
4933 
4934 	double
4935 		a_0, a_1, a_2, a_3,
4936 		b_0, b_1, b_2, b_3;
4937 
4938 #if defined(TARGET_GENERIC)
4939 	double CC[16] = {0};
4940 #else
4941 	ALIGNED( double CC[16], 64 ) = {0};
4942 #endif
4943 
4944 	double alpha1 = 1.0;
4945 
4946 	// assume always kmax>=4 !!!
4947 
4948 	kernel_dgemm_nn_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, bs, CC, bs);
4949 
4950 	A += kmax*bs;
4951 	B += kmax;
4952 
4953 	// k = 0
4954 
4955 	a_0 = A[0];
4956 	a_1 = A[1];
4957 	a_2 = A[2];
4958 	a_3 = A[3];
4959 
4960 	b_0 = B[0+0*ldb];
4961 	CC[0+bs*0] += a_0 * b_0;
4962 	CC[1+bs*0] += a_1 * b_0;
4963 	CC[2+bs*0] += a_2 * b_0;
4964 	CC[3+bs*0] += a_3 * b_0;
4965 
4966 	b_0 = B[0+1*ldb];
4967 	CC[0+bs*1] += a_0 * b_0;
4968 	CC[1+bs*1] += a_1 * b_0;
4969 	CC[2+bs*1] += a_2 * b_0;
4970 	CC[3+bs*1] += a_3 * b_0;
4971 
4972 	b_0 = B[0+2*ldb];
4973 	CC[0+bs*2] += a_0 * b_0;
4974 	CC[1+bs*2] += a_1 * b_0;
4975 	CC[2+bs*2] += a_2 * b_0;
4976 	CC[3+bs*2] += a_3 * b_0;
4977 
4978 	b_0 = B[0+3*ldb];
4979 	CC[0+bs*3] += a_0 * b_0;
4980 	CC[1+bs*3] += a_1 * b_0;
4981 	CC[2+bs*3] += a_2 * b_0;
4982 	CC[3+bs*3] += a_3 * b_0;
4983 
4984 	A += bs;
4985 	B += 1;
4986 //	k += 1;
4987 
4988 	// k = 1
4989 
4990 	a_0 = A[0];
4991 	a_1 = A[1];
4992 	a_2 = A[2];
4993 	a_3 = A[3];
4994 
4995 	b_0 = B[0+1*ldb];
4996 	CC[0+bs*1] += a_0 * b_0;
4997 	CC[1+bs*1] += a_1 * b_0;
4998 	CC[2+bs*1] += a_2 * b_0;
4999 	CC[3+bs*1] += a_3 * b_0;
5000 
5001 	b_0 = B[0+2*ldb];
5002 	CC[0+bs*2] += a_0 * b_0;
5003 	CC[1+bs*2] += a_1 * b_0;
5004 	CC[2+bs*2] += a_2 * b_0;
5005 	CC[3+bs*2] += a_3 * b_0;
5006 
5007 	b_0 = B[0+3*ldb];
5008 	CC[0+bs*3] += a_0 * b_0;
5009 	CC[1+bs*3] += a_1 * b_0;
5010 	CC[2+bs*3] += a_2 * b_0;
5011 	CC[3+bs*3] += a_3 * b_0;
5012 
5013 	A += bs;
5014 	B += 1;
5015 //	k += 1;
5016 
5017 	// k = 2
5018 
5019 	a_0 = A[0];
5020 	a_1 = A[1];
5021 	a_2 = A[2];
5022 	a_3 = A[3];
5023 
5024 	b_0 = B[0+2*ldb];
5025 	CC[0+bs*2] += a_0 * b_0;
5026 	CC[1+bs*2] += a_1 * b_0;
5027 	CC[2+bs*2] += a_2 * b_0;
5028 	CC[3+bs*2] += a_3 * b_0;
5029 
5030 	b_0 = B[0+3*ldb];
5031 	CC[0+bs*3] += a_0 * b_0;
5032 	CC[1+bs*3] += a_1 * b_0;
5033 	CC[2+bs*3] += a_2 * b_0;
5034 	CC[3+bs*3] += a_3 * b_0;
5035 
5036 	A += bs;
5037 	B += 1;
5038 //	k += 1;
5039 
5040 	// k = 3
5041 
5042 	a_0 = A[0];
5043 	a_1 = A[1];
5044 	a_2 = A[2];
5045 	a_3 = A[3];
5046 
5047 	b_0 = B[0+3*ldb];
5048 	CC[0+bs*3] += a_0 * b_0;
5049 	CC[1+bs*3] += a_1 * b_0;
5050 	CC[2+bs*3] += a_2 * b_0;
5051 	CC[3+bs*3] += a_3 * b_0;
5052 
5053 	A += bs;
5054 	B += 1;
5055 //	k += 1;
5056 
5057 	store:
5058 
5059 	// scale & transpose & store
5060 	D[0+ldd*0] = alpha[0]*CC[0+bs*0];
5061 	D[0+ldd*1] = alpha[0]*CC[1+bs*0];
5062 	D[0+ldd*2] = alpha[0]*CC[2+bs*0];
5063 	D[0+ldd*3] = alpha[0]*CC[3+bs*0];
5064 
5065 	D[1+ldd*0] = alpha[0]*CC[0+bs*1];
5066 	D[1+ldd*1] = alpha[0]*CC[1+bs*1];
5067 	D[1+ldd*2] = alpha[0]*CC[2+bs*1];
5068 	D[1+ldd*3] = alpha[0]*CC[3+bs*1];
5069 
5070 	D[2+ldd*0] = alpha[0]*CC[0+bs*2];
5071 	D[2+ldd*1] = alpha[0]*CC[1+bs*2];
5072 	D[2+ldd*2] = alpha[0]*CC[2+bs*2];
5073 	D[2+ldd*3] = alpha[0]*CC[3+bs*2];
5074 
5075 	D[3+ldd*0] = alpha[0]*CC[0+bs*3];
5076 	D[3+ldd*1] = alpha[0]*CC[1+bs*3];
5077 	D[3+ldd*2] = alpha[0]*CC[2+bs*3];
5078 	D[3+ldd*3] = alpha[0]*CC[3+bs*3];
5079 
5080 	return;
5081 
5082 	}
5083 #endif
5084 
5085 
5086 
5087 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
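// variable-size, transposed-store variant; the unrolled triangular part branches on m1
// (see the XXX note below), CC is scaled and transposed in place, and the leading
// m1 x n1 block is stored into D.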
5088 void kernel_dtrmm_nn_ru_4x4_tran_vs_lib4c4c(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd, int m1, int n1)
5089 	{
5090 
5091 	const int bs = 4;
5092 
5093 	double
5094 		tmp,
5095 		a_0, a_1, a_2, a_3,
5096 		b_0, b_1, b_2, b_3;
5097 
5098 #if defined(TARGET_GENERIC)
5099 	double CC[16] = {0};
5100 #else
5101 	ALIGNED( double CC[16], 64 ) = {0};
5102 #endif
5103 
5104 	double alpha1 = 1.0;
5105 
5106 	kernel_dgemm_nn_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, bs, CC, bs);
5107 
5108 	A += kmax*bs;
5109 	B += kmax;
5110 
5111 	// XXX m1 and n1 are swapped !!!!!
5112 	if(m1>=4)
5113 		{
5114 
5115 		// k = 0
5116 
5117 		a_0 = A[0];
5118 		a_1 = A[1];
5119 		a_2 = A[2];
5120 		a_3 = A[3];
5121 
5122 		b_0 = B[0+0*ldb];
5123 		CC[0+bs*0] += a_0 * b_0;
5124 		CC[1+bs*0] += a_1 * b_0;
5125 		CC[2+bs*0] += a_2 * b_0;
5126 		CC[3+bs*0] += a_3 * b_0;
5127 
5128 		b_0 = B[0+1*ldb];
5129 		CC[0+bs*1] += a_0 * b_0;
5130 		CC[1+bs*1] += a_1 * b_0;
5131 		CC[2+bs*1] += a_2 * b_0;
5132 		CC[3+bs*1] += a_3 * b_0;
5133 
5134 		b_0 = B[0+2*ldb];
5135 		CC[0+bs*2] += a_0 * b_0;
5136 		CC[1+bs*2] += a_1 * b_0;
5137 		CC[2+bs*2] += a_2 * b_0;
5138 		CC[3+bs*2] += a_3 * b_0;
5139 
5140 		b_0 = B[0+3*ldb];
5141 		CC[0+bs*3] += a_0 * b_0;
5142 		CC[1+bs*3] += a_1 * b_0;
5143 		CC[2+bs*3] += a_2 * b_0;
5144 		CC[3+bs*3] += a_3 * b_0;
5145 
5146 		A += bs;
5147 		B += 1;
5148 //		k += 1;
5149 
5150 		// k = 1
5151 
5152 		a_0 = A[0];
5153 		a_1 = A[1];
5154 		a_2 = A[2];
5155 		a_3 = A[3];
5156 
5157 		b_0 = B[0+1*ldb];
5158 		CC[0+bs*1] += a_0 * b_0;
5159 		CC[1+bs*1] += a_1 * b_0;
5160 		CC[2+bs*1] += a_2 * b_0;
5161 		CC[3+bs*1] += a_3 * b_0;
5162 
5163 		b_0 = B[0+2*ldb];
5164 		CC[0+bs*2] += a_0 * b_0;
5165 		CC[1+bs*2] += a_1 * b_0;
5166 		CC[2+bs*2] += a_2 * b_0;
5167 		CC[3+bs*2] += a_3 * b_0;
5168 
5169 		b_0 = B[0+3*ldb];
5170 		CC[0+bs*3] += a_0 * b_0;
5171 		CC[1+bs*3] += a_1 * b_0;
5172 		CC[2+bs*3] += a_2 * b_0;
5173 		CC[3+bs*3] += a_3 * b_0;
5174 
5175 		A += bs;
5176 		B += 1;
5177 //		k += 1;
5178 
5179 		// k = 2
5180 
5181 		a_0 = A[0];
5182 		a_1 = A[1];
5183 		a_2 = A[2];
5184 		a_3 = A[3];
5185 
5186 		b_0 = B[0+2*ldb];
5187 		CC[0+bs*2] += a_0 * b_0;
5188 		CC[1+bs*2] += a_1 * b_0;
5189 		CC[2+bs*2] += a_2 * b_0;
5190 		CC[3+bs*2] += a_3 * b_0;
5191 
5192 		b_0 = B[0+3*ldb];
5193 		CC[0+bs*3] += a_0 * b_0;
5194 		CC[1+bs*3] += a_1 * b_0;
5195 		CC[2+bs*3] += a_2 * b_0;
5196 		CC[3+bs*3] += a_3 * b_0;
5197 
5198 		A += bs;
5199 		B += 1;
5200 //		k += 1;
5201 
5202 		// k = 3
5203 
5204 		a_0 = A[0];
5205 		a_1 = A[1];
5206 		a_2 = A[2];
5207 		a_3 = A[3];
5208 
5209 		b_0 = B[0+3*ldb];
5210 		CC[0+bs*3] += a_0 * b_0;
5211 		CC[1+bs*3] += a_1 * b_0;
5212 		CC[2+bs*3] += a_2 * b_0;
5213 		CC[3+bs*3] += a_3 * b_0;
5214 
5215 		A += bs;
5216 		B += 1;
5217 //		k += 1;
5218 
5219 		}
5220 	else if(m1==3)
5221 		{
5222 
5223 		// k = 0
5224 
5225 		a_0 = A[0];
5226 		a_1 = A[1];
5227 		a_2 = A[2];
5228 		a_3 = A[3];
5229 
5230 		b_0 = B[0+0*ldb];
5231 		CC[0+bs*0] += a_0 * b_0;
5232 		CC[1+bs*0] += a_1 * b_0;
5233 		CC[2+bs*0] += a_2 * b_0;
5234 		CC[3+bs*0] += a_3 * b_0;
5235 
5236 		b_0 = B[0+1*ldb];
5237 		CC[0+bs*1] += a_0 * b_0;
5238 		CC[1+bs*1] += a_1 * b_0;
5239 		CC[2+bs*1] += a_2 * b_0;
5240 		CC[3+bs*1] += a_3 * b_0;
5241 
5242 		b_0 = B[0+2*ldb];
5243 		CC[0+bs*2] += a_0 * b_0;
5244 		CC[1+bs*2] += a_1 * b_0;
5245 		CC[2+bs*2] += a_2 * b_0;
5246 		CC[3+bs*2] += a_3 * b_0;
5247 
5248 		A += bs;
5249 		B += 1;
5250 //		k += 1;
5251 
5252 		// k = 1
5253 
5254 		a_0 = A[0];
5255 		a_1 = A[1];
5256 		a_2 = A[2];
5257 		a_3 = A[3];
5258 
5259 		b_0 = B[0+1*ldb];
5260 		CC[0+bs*1] += a_0 * b_0;
5261 		CC[1+bs*1] += a_1 * b_0;
5262 		CC[2+bs*1] += a_2 * b_0;
5263 		CC[3+bs*1] += a_3 * b_0;
5264 
5265 		b_0 = B[0+2*ldb];
5266 		CC[0+bs*2] += a_0 * b_0;
5267 		CC[1+bs*2] += a_1 * b_0;
5268 		CC[2+bs*2] += a_2 * b_0;
5269 		CC[3+bs*2] += a_3 * b_0;
5270 
5271 		A += bs;
5272 		B += 1;
5273 //		k += 1;
5274 
5275 		// k = 2
5276 
5277 		a_0 = A[0];
5278 		a_1 = A[1];
5279 		a_2 = A[2];
5280 		a_3 = A[3];
5281 
5282 		b_0 = B[0+2*ldb];
5283 		CC[0+bs*2] += a_0 * b_0;
5284 		CC[1+bs*2] += a_1 * b_0;
5285 		CC[2+bs*2] += a_2 * b_0;
5286 		CC[3+bs*2] += a_3 * b_0;
5287 
5288 		A += bs;
5289 		B += 1;
5290 //		k += 1;
5291 
5292 		// k = 3
5293 
5294 		A += bs;
5295 		B += 1;
5296 //		k += 1;
5297 
5298 		}
5299 	else if(m1==2)
5300 		{
5301 
5302 		// k = 0
5303 
5304 		a_0 = A[0];
5305 		a_1 = A[1];
5306 		a_2 = A[2];
5307 		a_3 = A[3];
5308 
5309 		b_0 = B[0+0*ldb];
5310 		CC[0+bs*0] += a_0 * b_0;
5311 		CC[1+bs*0] += a_1 * b_0;
5312 		CC[2+bs*0] += a_2 * b_0;
5313 		CC[3+bs*0] += a_3 * b_0;
5314 
5315 		b_0 = B[0+1*ldb];
5316 		CC[0+bs*1] += a_0 * b_0;
5317 		CC[1+bs*1] += a_1 * b_0;
5318 		CC[2+bs*1] += a_2 * b_0;
5319 		CC[3+bs*1] += a_3 * b_0;
5320 
5321 		A += bs;
5322 		B += 1;
5323 //		k += 1;
5324 
5325 		// k = 1
5326 
5327 		a_0 = A[0];
5328 		a_1 = A[1];
5329 		a_2 = A[2];
5330 		a_3 = A[3];
5331 
5332 		b_0 = B[0+1*ldb];
5333 		CC[0+bs*1] += a_0 * b_0;
5334 		CC[1+bs*1] += a_1 * b_0;
5335 		CC[2+bs*1] += a_2 * b_0;
5336 		CC[3+bs*1] += a_3 * b_0;
5337 
5338 		A += bs;
5339 		B += 1;
5340 //		k += 1;
5341 
5342 		// k = 2
5343 
5344 		A += bs;
5345 		B += 1;
5346 //		k += 1;
5347 
5348 		// k = 3
5349 
5350 		A += bs;
5351 		B += 1;
5352 //		k += 1;
5353 
5354 		}
5355 	else if(m1==1)
5356 		{
5357 
5358 		// k = 0
5359 
5360 		a_0 = A[0];
5361 		a_1 = A[1];
5362 		a_2 = A[2];
5363 		a_3 = A[3];
5364 
5365 		b_0 = B[0+0*ldb];
5366 		CC[0+bs*0] += a_0 * b_0;
5367 		CC[1+bs*0] += a_1 * b_0;
5368 		CC[2+bs*0] += a_2 * b_0;
5369 		CC[3+bs*0] += a_3 * b_0;
5370 
5371 		A += bs;
5372 		B += 1;
5373 //		k += 1;
5374 
5375 		// k = 1
5376 
5377 		A += bs;
5378 		B += 1;
5379 //		k += 1;
5380 
5381 		// k = 2
5382 
5383 		A += bs;
5384 		B += 1;
5385 //		k += 1;
5386 
5387 		// k = 3
5388 
5389 		A += bs;
5390 		B += 1;
5391 //		k += 1;
5392 
5393 		}
5394 
5395 	store:
5396 
5397 	// scale
5398 	CC[0+bs*0] = alpha[0]*CC[0+bs*0];
5399 	CC[1+bs*0] = alpha[0]*CC[1+bs*0];
5400 	CC[2+bs*0] = alpha[0]*CC[2+bs*0];
5401 	CC[3+bs*0] = alpha[0]*CC[3+bs*0];
5402 
5403 	CC[0+bs*1] = alpha[0]*CC[0+bs*1];
5404 	CC[1+bs*1] = alpha[0]*CC[1+bs*1];
5405 	CC[2+bs*1] = alpha[0]*CC[2+bs*1];
5406 	CC[3+bs*1] = alpha[0]*CC[3+bs*1];
5407 
5408 	CC[0+bs*2] = alpha[0]*CC[0+bs*2];
5409 	CC[1+bs*2] = alpha[0]*CC[1+bs*2];
5410 	CC[2+bs*2] = alpha[0]*CC[2+bs*2];
5411 	CC[3+bs*2] = alpha[0]*CC[3+bs*2];
5412 
5413 	CC[0+bs*3] = alpha[0]*CC[0+bs*3];
5414 	CC[1+bs*3] = alpha[0]*CC[1+bs*3];
5415 	CC[2+bs*3] = alpha[0]*CC[2+bs*3];
5416 	CC[3+bs*3] = alpha[0]*CC[3+bs*3];
5417 
5418 	// transpose
5419 	tmp = CC[1+bs*0]; CC[1+bs*0] = CC[0+bs*1]; CC[0+bs*1] = tmp;
5420 	tmp = CC[2+bs*0]; CC[2+bs*0] = CC[0+bs*2]; CC[0+bs*2] = tmp;
5421 	tmp = CC[3+bs*0]; CC[3+bs*0] = CC[0+bs*3]; CC[0+bs*3] = tmp;
5422 
5423 	tmp = CC[2+bs*1]; CC[2+bs*1] = CC[1+bs*2]; CC[1+bs*2] = tmp;
5424 	tmp = CC[3+bs*1]; CC[3+bs*1] = CC[1+bs*3]; CC[1+bs*3] = tmp;
5425 
5426 	tmp = CC[3+bs*2]; CC[3+bs*2] = CC[2+bs*3]; CC[2+bs*3] = tmp;
5427 
5428 	// store
5429 	if(m1>=4)
5430 		{
5431 		D[0+ldd*0] = CC[0+bs*0];
5432 		D[1+ldd*0] = CC[1+bs*0];
5433 		D[2+ldd*0] = CC[2+bs*0];
5434 		D[3+ldd*0] = CC[3+bs*0];
5435 
5436 		if(n1==1)
5437 			return;
5438 
5439 		D[0+ldd*1] = CC[0+bs*1];
5440 		D[1+ldd*1] = CC[1+bs*1];
5441 		D[2+ldd*1] = CC[2+bs*1];
5442 		D[3+ldd*1] = CC[3+bs*1];
5443 
5444 		if(n1==2)
5445 			return;
5446 
5447 		D[0+ldd*2] = CC[0+bs*2];
5448 		D[1+ldd*2] = CC[1+bs*2];
5449 		D[2+ldd*2] = CC[2+bs*2];
5450 		D[3+ldd*2] = CC[3+bs*2];
5451 
5452 		if(n1==3)
5453 			return;
5454 
5455 		D[0+ldd*3] = CC[0+bs*3];
5456 		D[1+ldd*3] = CC[1+bs*3];
5457 		D[2+ldd*3] = CC[2+bs*3];
5458 		D[3+ldd*3] = CC[3+bs*3];
5459 		}
5460 	else if(m1>=3)
5461 		{
5462 		D[0+ldd*0] = CC[0+bs*0];
5463 		D[1+ldd*0] = CC[1+bs*0];
5464 		D[2+ldd*0] = CC[2+bs*0];
5465 
5466 		if(n1==1)
5467 			return;
5468 
5469 		D[0+ldd*1] = CC[0+bs*1];
5470 		D[1+ldd*1] = CC[1+bs*1];
5471 		D[2+ldd*1] = CC[2+bs*1];
5472 
5473 		if(n1==2)
5474 			return;
5475 
5476 		D[0+ldd*2] = CC[0+bs*2];
5477 		D[1+ldd*2] = CC[1+bs*2];
5478 		D[2+ldd*2] = CC[2+bs*2];
5479 
5480 		if(n1==3)
5481 			return;
5482 
5483 		D[0+ldd*3] = CC[0+bs*3];
5484 		D[1+ldd*3] = CC[1+bs*3];
5485 		D[2+ldd*3] = CC[2+bs*3];
5486 		}
5487 	else if(m1>=2)
5488 		{
5489 		D[0+ldd*0] = CC[0+bs*0];
5490 		D[1+ldd*0] = CC[1+bs*0];
5491 
5492 		if(n1==1)
5493 			return;
5494 
5495 		D[0+ldd*1] = CC[0+bs*1];
5496 		D[1+ldd*1] = CC[1+bs*1];
5497 
5498 		if(n1==2)
5499 			return;
5500 
5501 		D[0+ldd*2] = CC[0+bs*2];
5502 		D[1+ldd*2] = CC[1+bs*2];
5503 
5504 		if(n1==3)
5505 			return;
5506 
5507 		D[0+ldd*3] = CC[0+bs*3];
5508 		D[1+ldd*3] = CC[1+bs*3];
5509 		}
5510 	else //if(m1>=1)
5511 		{
5512 		D[0+ldd*0] = CC[0+bs*0];
5513 
5514 		if(n1==1)
5515 			return;
5516 
5517 		D[0+ldd*1] = CC[0+bs*1];
5518 
5519 		if(n1==2)
5520 			return;
5521 
5522 		D[0+ldd*2] = CC[0+bs*2];
5523 
5524 		if(n1==3)
5525 			return;
5526 
5527 		D[0+ldd*3] = CC[0+bs*3];
5528 		}
5529 
5530 	return;
5531 
5532 	}
5533 #endif
5534 
5535 
5536 
5537 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
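// same as kernel_dtrmm_nn_ru_4x4_lib4ccc, but with unit diagonal: the diagonal entries of
// the trailing 4x4 triangular block of B are taken as 1 instead of being read from memory.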
5538 void kernel_dtrmm_nn_ru_one_4x4_lib4ccc(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd)
5539 	{
5540 
5541 	const int bs = 4;
5542 
5543 	double
5544 		a_0, a_1, a_2, a_3,
5545 		b_0, b_1, b_2, b_3;
5546 
5547 #if defined(TARGET_GENERIC)
5548 	double CC[16] = {0};
5549 #else
5550 	ALIGNED( double CC[16], 64 ) = {0};
5551 #endif
5552 
5553 	double alpha1 = 1.0;
5554 
5555 	// assume always kmax>=4 !!!
5556 
5557 	kernel_dgemm_nn_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, ldc, CC, bs);
5558 
5559 	A += kmax*bs;
5560 	B += kmax;
5561 
5562 	// k = 0
5563 
5564 	a_0 = A[0];
5565 	a_1 = A[1];
5566 	a_2 = A[2];
5567 	a_3 = A[3];
5568 
5569 	CC[0+bs*0] += a_0;
5570 	CC[1+bs*0] += a_1;
5571 	CC[2+bs*0] += a_2;
5572 	CC[3+bs*0] += a_3;
5573 
5574 	b_0 = B[0+1*ldb];
5575 	CC[0+bs*1] += a_0 * b_0;
5576 	CC[1+bs*1] += a_1 * b_0;
5577 	CC[2+bs*1] += a_2 * b_0;
5578 	CC[3+bs*1] += a_3 * b_0;
5579 
5580 	b_0 = B[0+2*ldb];
5581 	CC[0+bs*2] += a_0 * b_0;
5582 	CC[1+bs*2] += a_1 * b_0;
5583 	CC[2+bs*2] += a_2 * b_0;
5584 	CC[3+bs*2] += a_3 * b_0;
5585 
5586 	b_0 = B[0+3*ldb];
5587 	CC[0+bs*3] += a_0 * b_0;
5588 	CC[1+bs*3] += a_1 * b_0;
5589 	CC[2+bs*3] += a_2 * b_0;
5590 	CC[3+bs*3] += a_3 * b_0;
5591 
5592 	A += bs;
5593 	B += 1;
5594 //	k += 1;
5595 
5596 	// k = 1
5597 
5598 	a_0 = A[0];
5599 	a_1 = A[1];
5600 	a_2 = A[2];
5601 	a_3 = A[3];
5602 
5603 	CC[0+bs*1] += a_0;
5604 	CC[1+bs*1] += a_1;
5605 	CC[2+bs*1] += a_2;
5606 	CC[3+bs*1] += a_3;
5607 
5608 	b_0 = B[0+2*ldb];
5609 	CC[0+bs*2] += a_0 * b_0;
5610 	CC[1+bs*2] += a_1 * b_0;
5611 	CC[2+bs*2] += a_2 * b_0;
5612 	CC[3+bs*2] += a_3 * b_0;
5613 
5614 	b_0 = B[0+3*ldb];
5615 	CC[0+bs*3] += a_0 * b_0;
5616 	CC[1+bs*3] += a_1 * b_0;
5617 	CC[2+bs*3] += a_2 * b_0;
5618 	CC[3+bs*3] += a_3 * b_0;
5619 
5620 	A += bs;
5621 	B += 1;
5622 //	k += 1;
5623 
5624 	// k = 2
5625 
5626 	a_0 = A[0];
5627 	a_1 = A[1];
5628 	a_2 = A[2];
5629 	a_3 = A[3];
5630 
5631 	CC[0+bs*2] += a_0;
5632 	CC[1+bs*2] += a_1;
5633 	CC[2+bs*2] += a_2;
5634 	CC[3+bs*2] += a_3;
5635 
5636 	b_0 = B[0+3*ldb];
5637 	CC[0+bs*3] += a_0 * b_0;
5638 	CC[1+bs*3] += a_1 * b_0;
5639 	CC[2+bs*3] += a_2 * b_0;
5640 	CC[3+bs*3] += a_3 * b_0;
5641 
5642 	A += bs;
5643 	B += 1;
5644 //	k += 1;
5645 
5646 	// k = 3
5647 
5648 	a_0 = A[0];
5649 	a_1 = A[1];
5650 	a_2 = A[2];
5651 	a_3 = A[3];
5652 
5653 	CC[0+bs*3] += a_0;
5654 	CC[1+bs*3] += a_1;
5655 	CC[2+bs*3] += a_2;
5656 	CC[3+bs*3] += a_3;
5657 
5658 	A += bs;
5659 	B += 1;
5660 //	k += 1;
5661 
5662 	store:
5663 
5664 	D[0+ldd*0] = alpha[0]*CC[0+bs*0];
5665 	D[1+ldd*0] = alpha[0]*CC[1+bs*0];
5666 	D[2+ldd*0] = alpha[0]*CC[2+bs*0];
5667 	D[3+ldd*0] = alpha[0]*CC[3+bs*0];
5668 
5669 	D[0+ldd*1] = alpha[0]*CC[0+bs*1];
5670 	D[1+ldd*1] = alpha[0]*CC[1+bs*1];
5671 	D[2+ldd*1] = alpha[0]*CC[2+bs*1];
5672 	D[3+ldd*1] = alpha[0]*CC[3+bs*1];
5673 
5674 	D[0+ldd*2] = alpha[0]*CC[0+bs*2];
5675 	D[1+ldd*2] = alpha[0]*CC[1+bs*2];
5676 	D[2+ldd*2] = alpha[0]*CC[2+bs*2];
5677 	D[3+ldd*2] = alpha[0]*CC[3+bs*2];
5678 
5679 	D[0+ldd*3] = alpha[0]*CC[0+bs*3];
5680 	D[1+ldd*3] = alpha[0]*CC[1+bs*3];
5681 	D[2+ldd*3] = alpha[0]*CC[2+bs*3];
5682 	D[3+ldd*3] = alpha[0]*CC[3+bs*3];
5683 
5684 	return;
5685 
5686 	}
5687 #endif
5688 
5689 
5690 
5691 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
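// variable-size variant of the unit-diagonal kernel above: the unrolled triangular part
// skips columns beyond n1, and only the leading m1 x n1 block is stored into D.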
5692 void kernel_dtrmm_nn_ru_one_4x4_vs_lib4ccc(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1)
5693 	{
5694 
5695 	const int bs = 4;
5696 
5697 	double
5698 		a_0, a_1, a_2, a_3,
5699 		b_0, b_1, b_2, b_3;
5700 
5701 #if defined(TARGET_GENERIC)
5702 	double CC[16] = {0};
5703 #else
5704 	ALIGNED( double CC[16], 64 ) = {0};
5705 #endif
5706 
5707 	double alpha1 = 1.0;
5708 
5709 	kernel_dgemm_nn_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, ldc, CC, bs);
5710 
5711 	A += kmax*bs;
5712 	B += kmax;
5713 
5714 	if(n1>=4)
5715 		{
5716 
5717 		// k = 0
5718 
5719 		a_0 = A[0];
5720 		a_1 = A[1];
5721 		a_2 = A[2];
5722 		a_3 = A[3];
5723 
5724 		CC[0+bs*0] += a_0;
5725 		CC[1+bs*0] += a_1;
5726 		CC[2+bs*0] += a_2;
5727 		CC[3+bs*0] += a_3;
5728 
5729 		b_0 = B[0+1*ldb];
5730 		CC[0+bs*1] += a_0 * b_0;
5731 		CC[1+bs*1] += a_1 * b_0;
5732 		CC[2+bs*1] += a_2 * b_0;
5733 		CC[3+bs*1] += a_3 * b_0;
5734 
5735 		b_0 = B[0+2*ldb];
5736 		CC[0+bs*2] += a_0 * b_0;
5737 		CC[1+bs*2] += a_1 * b_0;
5738 		CC[2+bs*2] += a_2 * b_0;
5739 		CC[3+bs*2] += a_3 * b_0;
5740 
5741 		b_0 = B[0+3*ldb];
5742 		CC[0+bs*3] += a_0 * b_0;
5743 		CC[1+bs*3] += a_1 * b_0;
5744 		CC[2+bs*3] += a_2 * b_0;
5745 		CC[3+bs*3] += a_3 * b_0;
5746 
5747 		A += bs;
5748 		B += 1;
5749 //		k += 1;
5750 
5751 		// k = 1
5752 
5753 		a_0 = A[0];
5754 		a_1 = A[1];
5755 		a_2 = A[2];
5756 		a_3 = A[3];
5757 
5758 		CC[0+bs*1] += a_0;
5759 		CC[1+bs*1] += a_1;
5760 		CC[2+bs*1] += a_2;
5761 		CC[3+bs*1] += a_3;
5762 
5763 		b_0 = B[0+2*ldb];
5764 		CC[0+bs*2] += a_0 * b_0;
5765 		CC[1+bs*2] += a_1 * b_0;
5766 		CC[2+bs*2] += a_2 * b_0;
5767 		CC[3+bs*2] += a_3 * b_0;
5768 
5769 		b_0 = B[0+3*ldb];
5770 		CC[0+bs*3] += a_0 * b_0;
5771 		CC[1+bs*3] += a_1 * b_0;
5772 		CC[2+bs*3] += a_2 * b_0;
5773 		CC[3+bs*3] += a_3 * b_0;
5774 
5775 		A += bs;
5776 		B += 1;
5777 //		k += 1;
5778 
5779 		// k = 2
5780 
5781 		a_0 = A[0];
5782 		a_1 = A[1];
5783 		a_2 = A[2];
5784 		a_3 = A[3];
5785 
5786 		CC[0+bs*2] += a_0;
5787 		CC[1+bs*2] += a_1;
5788 		CC[2+bs*2] += a_2;
5789 		CC[3+bs*2] += a_3;
5790 
5791 		b_0 = B[0+3*ldb];
5792 		CC[0+bs*3] += a_0 * b_0;
5793 		CC[1+bs*3] += a_1 * b_0;
5794 		CC[2+bs*3] += a_2 * b_0;
5795 		CC[3+bs*3] += a_3 * b_0;
5796 
5797 		A += bs;
5798 		B += 1;
5799 //		k += 1;
5800 
5801 		// k = 3
5802 
5803 		a_0 = A[0];
5804 		a_1 = A[1];
5805 		a_2 = A[2];
5806 		a_3 = A[3];
5807 
5808 		CC[0+bs*3] += a_0;
5809 		CC[1+bs*3] += a_1;
5810 		CC[2+bs*3] += a_2;
5811 		CC[3+bs*3] += a_3;
5812 
5813 		A += bs;
5814 		B += 1;
5815 //		k += 1;
5816 
5817 		}
5818 	else if(n1==3)
5819 		{
5820 
5821 		// k = 0
5822 
5823 		a_0 = A[0];
5824 		a_1 = A[1];
5825 		a_2 = A[2];
5826 		a_3 = A[3];
5827 
5828 		CC[0+bs*0] += a_0;
5829 		CC[1+bs*0] += a_1;
5830 		CC[2+bs*0] += a_2;
5831 		CC[3+bs*0] += a_3;
5832 
5833 		b_0 = B[0+1*ldb];
5834 		CC[0+bs*1] += a_0 * b_0;
5835 		CC[1+bs*1] += a_1 * b_0;
5836 		CC[2+bs*1] += a_2 * b_0;
5837 		CC[3+bs*1] += a_3 * b_0;
5838 
5839 		b_0 = B[0+2*ldb];
5840 		CC[0+bs*2] += a_0 * b_0;
5841 		CC[1+bs*2] += a_1 * b_0;
5842 		CC[2+bs*2] += a_2 * b_0;
5843 		CC[3+bs*2] += a_3 * b_0;
5844 
5845 		A += bs;
5846 		B += 1;
5847 //		k += 1;
5848 
5849 		// k = 1
5850 
5851 		a_0 = A[0];
5852 		a_1 = A[1];
5853 		a_2 = A[2];
5854 		a_3 = A[3];
5855 
5856 		CC[0+bs*1] += a_0;
5857 		CC[1+bs*1] += a_1;
5858 		CC[2+bs*1] += a_2;
5859 		CC[3+bs*1] += a_3;
5860 
5861 		b_0 = B[0+2*ldb];
5862 		CC[0+bs*2] += a_0 * b_0;
5863 		CC[1+bs*2] += a_1 * b_0;
5864 		CC[2+bs*2] += a_2 * b_0;
5865 		CC[3+bs*2] += a_3 * b_0;
5866 
5867 		A += bs;
5868 		B += 1;
5869 //		k += 1;
5870 
5871 		// k = 2
5872 
5873 		a_0 = A[0];
5874 		a_1 = A[1];
5875 		a_2 = A[2];
5876 		a_3 = A[3];
5877 
5878 		CC[0+bs*2] += a_0;
5879 		CC[1+bs*2] += a_1;
5880 		CC[2+bs*2] += a_2;
5881 		CC[3+bs*2] += a_3;
5882 
5883 		A += bs;
5884 		B += 1;
5885 //		k += 1;
5886 
5887 		// k = 3
5888 
5889 		A += bs;
5890 		B += 1;
5891 //		k += 1;
5892 
5893 		}
5894 	else if(n1==2)
5895 		{
5896 
5897 		// k = 0
5898 
5899 		a_0 = A[0];
5900 		a_1 = A[1];
5901 		a_2 = A[2];
5902 		a_3 = A[3];
5903 
5904 		CC[0+bs*0] += a_0;
5905 		CC[1+bs*0] += a_1;
5906 		CC[2+bs*0] += a_2;
5907 		CC[3+bs*0] += a_3;
5908 
5909 		b_0 = B[0+1*ldb];
5910 		CC[0+bs*1] += a_0 * b_0;
5911 		CC[1+bs*1] += a_1 * b_0;
5912 		CC[2+bs*1] += a_2 * b_0;
5913 		CC[3+bs*1] += a_3 * b_0;
5914 
5915 		A += bs;
5916 		B += 1;
5917 //		k += 1;
5918 
5919 		// k = 1
5920 
5921 		a_0 = A[0];
5922 		a_1 = A[1];
5923 		a_2 = A[2];
5924 		a_3 = A[3];
5925 
5926 		CC[0+bs*1] += a_0;
5927 		CC[1+bs*1] += a_1;
5928 		CC[2+bs*1] += a_2;
5929 		CC[3+bs*1] += a_3;
5930 
5931 		A += bs;
5932 		B += 1;
5933 //		k += 1;
5934 
5935 		// k = 2
5936 
5937 		A += bs;
5938 		B += 1;
5939 //		k += 1;
5940 
5941 		// k = 3
5942 
5943 		A += bs;
5944 		B += 1;
5945 //		k += 1;
5946 
5947 		}
5948 	else if(n1==1)
5949 		{
5950 
5951 		// k = 0
5952 
5953 		a_0 = A[0];
5954 		a_1 = A[1];
5955 		a_2 = A[2];
5956 		a_3 = A[3];
5957 
5958 		CC[0+bs*0] += a_0;
5959 		CC[1+bs*0] += a_1;
5960 		CC[2+bs*0] += a_2;
5961 		CC[3+bs*0] += a_3;
5962 
5963 		A += bs;
5964 		B += 1;
5965 //		k += 1;
5966 
5967 		// k = 1
5968 
5969 		A += bs;
5970 		B += 1;
5971 //		k += 1;
5972 
5973 		// k = 2
5974 
5975 		A += bs;
5976 		B += 1;
5977 //		k += 1;
5978 
5979 		// k = 3
5980 
5981 		A += bs;
5982 		B += 1;
5983 //		k += 1;
5984 
5985 		}
5986 
5987 	store:
5988 
5989 	CC[0+bs*0] = alpha[0]*CC[0+bs*0];
5990 	CC[1+bs*0] = alpha[0]*CC[1+bs*0];
5991 	CC[2+bs*0] = alpha[0]*CC[2+bs*0];
5992 	CC[3+bs*0] = alpha[0]*CC[3+bs*0];
5993 
5994 	CC[0+bs*1] = alpha[0]*CC[0+bs*1];
5995 	CC[1+bs*1] = alpha[0]*CC[1+bs*1];
5996 	CC[2+bs*1] = alpha[0]*CC[2+bs*1];
5997 	CC[3+bs*1] = alpha[0]*CC[3+bs*1];
5998 
5999 	CC[0+bs*2] = alpha[0]*CC[0+bs*2];
6000 	CC[1+bs*2] = alpha[0]*CC[1+bs*2];
6001 	CC[2+bs*2] = alpha[0]*CC[2+bs*2];
6002 	CC[3+bs*2] = alpha[0]*CC[3+bs*2];
6003 
6004 	CC[0+bs*3] = alpha[0]*CC[0+bs*3];
6005 	CC[1+bs*3] = alpha[0]*CC[1+bs*3];
6006 	CC[2+bs*3] = alpha[0]*CC[2+bs*3];
6007 	CC[3+bs*3] = alpha[0]*CC[3+bs*3];
6008 
6009 	if(m1>=4)
6010 		{
6011 		D[0+ldd*0] = CC[0+bs*0];
6012 		D[1+ldd*0] = CC[1+bs*0];
6013 		D[2+ldd*0] = CC[2+bs*0];
6014 		D[3+ldd*0] = CC[3+bs*0];
6015 
6016 		if(n1==1)
6017 			return;
6018 
6019 		D[0+ldd*1] = CC[0+bs*1];
6020 		D[1+ldd*1] = CC[1+bs*1];
6021 		D[2+ldd*1] = CC[2+bs*1];
6022 		D[3+ldd*1] = CC[3+bs*1];
6023 
6024 		if(n1==2)
6025 			return;
6026 
6027 		D[0+ldd*2] = CC[0+bs*2];
6028 		D[1+ldd*2] = CC[1+bs*2];
6029 		D[2+ldd*2] = CC[2+bs*2];
6030 		D[3+ldd*2] = CC[3+bs*2];
6031 
6032 		if(n1==3)
6033 			return;
6034 
6035 		D[0+ldd*3] = CC[0+bs*3];
6036 		D[1+ldd*3] = CC[1+bs*3];
6037 		D[2+ldd*3] = CC[2+bs*3];
6038 		D[3+ldd*3] = CC[3+bs*3];
6039 		}
6040 	else if(m1>=3)
6041 		{
6042 		D[0+ldd*0] = CC[0+bs*0];
6043 		D[1+ldd*0] = CC[1+bs*0];
6044 		D[2+ldd*0] = CC[2+bs*0];
6045 
6046 		if(n1==1)
6047 			return;
6048 
6049 		D[0+ldd*1] = CC[0+bs*1];
6050 		D[1+ldd*1] = CC[1+bs*1];
6051 		D[2+ldd*1] = CC[2+bs*1];
6052 
6053 		if(n1==2)
6054 			return;
6055 
6056 		D[0+ldd*2] = CC[0+bs*2];
6057 		D[1+ldd*2] = CC[1+bs*2];
6058 		D[2+ldd*2] = CC[2+bs*2];
6059 
6060 		if(n1==3)
6061 			return;
6062 
6063 		D[0+ldd*3] = CC[0+bs*3];
6064 		D[1+ldd*3] = CC[1+bs*3];
6065 		D[2+ldd*3] = CC[2+bs*3];
6066 		}
6067 	else if(m1>=2)
6068 		{
6069 		D[0+ldd*0] = CC[0+bs*0];
6070 		D[1+ldd*0] = CC[1+bs*0];
6071 
6072 		if(n1==1)
6073 			return;
6074 
6075 		D[0+ldd*1] = CC[0+bs*1];
6076 		D[1+ldd*1] = CC[1+bs*1];
6077 
6078 		if(n1==2)
6079 			return;
6080 
6081 		D[0+ldd*2] = CC[0+bs*2];
6082 		D[1+ldd*2] = CC[1+bs*2];
6083 
6084 		if(n1==3)
6085 			return;
6086 
6087 		D[0+ldd*3] = CC[0+bs*3];
6088 		D[1+ldd*3] = CC[1+bs*3];
6089 		}
6090 	else //if(m1>=1)
6091 		{
6092 		D[0+ldd*0] = CC[0+bs*0];
6093 
6094 		if(n1==1)
6095 			return;
6096 
6097 		D[0+ldd*1] = CC[0+bs*1];
6098 
6099 		if(n1==2)
6100 			return;
6101 
6102 		D[0+ldd*2] = CC[0+bs*2];
6103 
6104 		if(n1==3)
6105 			return;
6106 
6107 		D[0+ldd*3] = CC[0+bs*3];
6108 		}
6109 
6110 	return;
6111 
6112 	}
6113 #endif
6114 
6115 
6116 
6117 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
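// transposed-store, unit-diagonal variant: C is read with fixed leading dimension bs=4 and
// D receives the transposed block, scaled by alpha.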
6118 void kernel_dtrmm_nn_ru_one_4x4_tran_lib4c4c(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd)
6119 	{
6120 
6121 	const int bs = 4;
6122 
6123 	double
6124 		a_0, a_1, a_2, a_3,
6125 		b_0, b_1, b_2, b_3;
6126 
6127 #if defined(TARGET_GENERIC)
6128 	double CC[16] = {0};
6129 #else
6130 	ALIGNED( double CC[16], 64 ) = {0};
6131 #endif
6132 
6133 	double alpha1 = 1.0;
6134 
6135 	// assume always kmax>=4 !!!
6136 
6137 	kernel_dgemm_nn_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, bs, CC, bs);
6138 
6139 	A += kmax*bs;
6140 	B += kmax;
6141 
6142 	// k = 0
6143 
6144 	a_0 = A[0];
6145 	a_1 = A[1];
6146 	a_2 = A[2];
6147 	a_3 = A[3];
6148 
6149 	CC[0+bs*0] += a_0;
6150 	CC[1+bs*0] += a_1;
6151 	CC[2+bs*0] += a_2;
6152 	CC[3+bs*0] += a_3;
6153 
6154 	b_0 = B[0+1*ldb];
6155 	CC[0+bs*1] += a_0 * b_0;
6156 	CC[1+bs*1] += a_1 * b_0;
6157 	CC[2+bs*1] += a_2 * b_0;
6158 	CC[3+bs*1] += a_3 * b_0;
6159 
6160 	b_0 = B[0+2*ldb];
6161 	CC[0+bs*2] += a_0 * b_0;
6162 	CC[1+bs*2] += a_1 * b_0;
6163 	CC[2+bs*2] += a_2 * b_0;
6164 	CC[3+bs*2] += a_3 * b_0;
6165 
6166 	b_0 = B[0+3*ldb];
6167 	CC[0+bs*3] += a_0 * b_0;
6168 	CC[1+bs*3] += a_1 * b_0;
6169 	CC[2+bs*3] += a_2 * b_0;
6170 	CC[3+bs*3] += a_3 * b_0;
6171 
6172 	A += bs;
6173 	B += 1;
6174 //	k += 1;
6175 
6176 	// k = 1
6177 
6178 	a_0 = A[0];
6179 	a_1 = A[1];
6180 	a_2 = A[2];
6181 	a_3 = A[3];
6182 
6183 	CC[0+bs*1] += a_0;
6184 	CC[1+bs*1] += a_1;
6185 	CC[2+bs*1] += a_2;
6186 	CC[3+bs*1] += a_3;
6187 
6188 	b_0 = B[0+2*ldb];
6189 	CC[0+bs*2] += a_0 * b_0;
6190 	CC[1+bs*2] += a_1 * b_0;
6191 	CC[2+bs*2] += a_2 * b_0;
6192 	CC[3+bs*2] += a_3 * b_0;
6193 
6194 	b_0 = B[0+3*ldb];
6195 	CC[0+bs*3] += a_0 * b_0;
6196 	CC[1+bs*3] += a_1 * b_0;
6197 	CC[2+bs*3] += a_2 * b_0;
6198 	CC[3+bs*3] += a_3 * b_0;
6199 
6200 	A += bs;
6201 	B += 1;
6202 //	k += 1;
6203 
6204 	// k = 2
6205 
6206 	a_0 = A[0];
6207 	a_1 = A[1];
6208 	a_2 = A[2];
6209 	a_3 = A[3];
6210 
6211 	CC[0+bs*2] += a_0;
6212 	CC[1+bs*2] += a_1;
6213 	CC[2+bs*2] += a_2;
6214 	CC[3+bs*2] += a_3;
6215 
6216 	b_0 = B[0+3*ldb];
6217 	CC[0+bs*3] += a_0 * b_0;
6218 	CC[1+bs*3] += a_1 * b_0;
6219 	CC[2+bs*3] += a_2 * b_0;
6220 	CC[3+bs*3] += a_3 * b_0;
6221 
6222 	A += bs;
6223 	B += 1;
6224 //	k += 1;
6225 
6226 	// k = 3
6227 
6228 	a_0 = A[0];
6229 	a_1 = A[1];
6230 	a_2 = A[2];
6231 	a_3 = A[3];
6232 
6233 	CC[0+bs*3] += a_0;
6234 	CC[1+bs*3] += a_1;
6235 	CC[2+bs*3] += a_2;
6236 	CC[3+bs*3] += a_3;
6237 
6238 	A += bs;
6239 	B += 1;
6240 //	k += 1;
6241 
6242 	store:
6243 
6244 	// scale & transpose & store
6245 	D[0+ldd*0] = alpha[0]*CC[0+bs*0];
6246 	D[0+ldd*1] = alpha[0]*CC[1+bs*0];
6247 	D[0+ldd*2] = alpha[0]*CC[2+bs*0];
6248 	D[0+ldd*3] = alpha[0]*CC[3+bs*0];
6249 
6250 	D[1+ldd*0] = alpha[0]*CC[0+bs*1];
6251 	D[1+ldd*1] = alpha[0]*CC[1+bs*1];
6252 	D[1+ldd*2] = alpha[0]*CC[2+bs*1];
6253 	D[1+ldd*3] = alpha[0]*CC[3+bs*1];
6254 
6255 	D[2+ldd*0] = alpha[0]*CC[0+bs*2];
6256 	D[2+ldd*1] = alpha[0]*CC[1+bs*2];
6257 	D[2+ldd*2] = alpha[0]*CC[2+bs*2];
6258 	D[2+ldd*3] = alpha[0]*CC[3+bs*2];
6259 
6260 	D[3+ldd*0] = alpha[0]*CC[0+bs*3];
6261 	D[3+ldd*1] = alpha[0]*CC[1+bs*3];
6262 	D[3+ldd*2] = alpha[0]*CC[2+bs*3];
6263 	D[3+ldd*3] = alpha[0]*CC[3+bs*3];
6264 
6265 	return;
6266 
6267 	}
6268 #endif
6269 
6270 
6271 
6272 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
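// variable-size, transposed-store, unit-diagonal variant; the unrolled triangular part
// branches on m1 (see the XXX note below), and the leading m1 x n1 block is stored into D.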
6273 void kernel_dtrmm_nn_ru_one_4x4_tran_vs_lib4c4c(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd, int m1, int n1)
6274 	{
6275 
6276 	const int bs = 4;
6277 
6278 	double
6279 		tmp,
6280 		a_0, a_1, a_2, a_3,
6281 		b_0, b_1, b_2, b_3;
6282 
6283 #if defined(TARGET_GENERIC)
6284 	double CC[16] = {0};
6285 #else
6286 	ALIGNED( double CC[16], 64 ) = {0};
6287 #endif
6288 
6289 	double alpha1 = 1.0;
6290 
6291 	kernel_dgemm_nn_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, bs, CC, bs);
6292 
6293 	A += kmax*bs;
6294 	B += kmax;
6295 
6296 	// XXX m1 and n1 are swapped !!!!!
6297 	if(m1>=4)
6298 		{
6299 
6300 		// k = 0
6301 
6302 		a_0 = A[0];
6303 		a_1 = A[1];
6304 		a_2 = A[2];
6305 		a_3 = A[3];
6306 
6307 		CC[0+bs*0] += a_0;
6308 		CC[1+bs*0] += a_1;
6309 		CC[2+bs*0] += a_2;
6310 		CC[3+bs*0] += a_3;
6311 
6312 		b_0 = B[0+1*ldb];
6313 		CC[0+bs*1] += a_0 * b_0;
6314 		CC[1+bs*1] += a_1 * b_0;
6315 		CC[2+bs*1] += a_2 * b_0;
6316 		CC[3+bs*1] += a_3 * b_0;
6317 
6318 		b_0 = B[0+2*ldb];
6319 		CC[0+bs*2] += a_0 * b_0;
6320 		CC[1+bs*2] += a_1 * b_0;
6321 		CC[2+bs*2] += a_2 * b_0;
6322 		CC[3+bs*2] += a_3 * b_0;
6323 
6324 		b_0 = B[0+3*ldb];
6325 		CC[0+bs*3] += a_0 * b_0;
6326 		CC[1+bs*3] += a_1 * b_0;
6327 		CC[2+bs*3] += a_2 * b_0;
6328 		CC[3+bs*3] += a_3 * b_0;
6329 
6330 		A += bs;
6331 		B += 1;
6332 //		k += 1;
6333 
6334 		// k = 1
6335 
6336 		a_0 = A[0];
6337 		a_1 = A[1];
6338 		a_2 = A[2];
6339 		a_3 = A[3];
6340 
6341 		CC[0+bs*1] += a_0;
6342 		CC[1+bs*1] += a_1;
6343 		CC[2+bs*1] += a_2;
6344 		CC[3+bs*1] += a_3;
6345 
6346 		b_0 = B[0+2*ldb];
6347 		CC[0+bs*2] += a_0 * b_0;
6348 		CC[1+bs*2] += a_1 * b_0;
6349 		CC[2+bs*2] += a_2 * b_0;
6350 		CC[3+bs*2] += a_3 * b_0;
6351 
6352 		b_0 = B[0+3*ldb];
6353 		CC[0+bs*3] += a_0 * b_0;
6354 		CC[1+bs*3] += a_1 * b_0;
6355 		CC[2+bs*3] += a_2 * b_0;
6356 		CC[3+bs*3] += a_3 * b_0;
6357 
6358 		A += bs;
6359 		B += 1;
6360 //		k += 1;
6361 
6362 		// k = 2
6363 
6364 		a_0 = A[0];
6365 		a_1 = A[1];
6366 		a_2 = A[2];
6367 		a_3 = A[3];
6368 
6369 		CC[0+bs*2] += a_0;
6370 		CC[1+bs*2] += a_1;
6371 		CC[2+bs*2] += a_2;
6372 		CC[3+bs*2] += a_3;
6373 
6374 		b_0 = B[0+3*ldb];
6375 		CC[0+bs*3] += a_0 * b_0;
6376 		CC[1+bs*3] += a_1 * b_0;
6377 		CC[2+bs*3] += a_2 * b_0;
6378 		CC[3+bs*3] += a_3 * b_0;
6379 
6380 		A += bs;
6381 		B += 1;
6382 //		k += 1;
6383 
6384 		// k = 3
6385 
6386 		a_0 = A[0];
6387 		a_1 = A[1];
6388 		a_2 = A[2];
6389 		a_3 = A[3];
6390 
6391 		CC[0+bs*3] += a_0;
6392 		CC[1+bs*3] += a_1;
6393 		CC[2+bs*3] += a_2;
6394 		CC[3+bs*3] += a_3;
6395 
6396 		A += bs;
6397 		B += 1;
6398 //		k += 1;
6399 
6400 		}
6401 	else if(m1==3)
6402 		{
6403 
6404 		// k = 0
6405 
6406 		a_0 = A[0];
6407 		a_1 = A[1];
6408 		a_2 = A[2];
6409 		a_3 = A[3];
6410 
6411 		CC[0+bs*0] += a_0;
6412 		CC[1+bs*0] += a_1;
6413 		CC[2+bs*0] += a_2;
6414 		CC[3+bs*0] += a_3;
6415 
6416 		b_0 = B[0+1*ldb];
6417 		CC[0+bs*1] += a_0 * b_0;
6418 		CC[1+bs*1] += a_1 * b_0;
6419 		CC[2+bs*1] += a_2 * b_0;
6420 		CC[3+bs*1] += a_3 * b_0;
6421 
6422 		b_0 = B[0+2*ldb];
6423 		CC[0+bs*2] += a_0 * b_0;
6424 		CC[1+bs*2] += a_1 * b_0;
6425 		CC[2+bs*2] += a_2 * b_0;
6426 		CC[3+bs*2] += a_3 * b_0;
6427 
6428 		A += bs;
6429 		B += 1;
6430 //		k += 1;
6431 
6432 		// k = 1
6433 
6434 		a_0 = A[0];
6435 		a_1 = A[1];
6436 		a_2 = A[2];
6437 		a_3 = A[3];
6438 
6439 		CC[0+bs*1] += a_0;
6440 		CC[1+bs*1] += a_1;
6441 		CC[2+bs*1] += a_2;
6442 		CC[3+bs*1] += a_3;
6443 
6444 		b_0 = B[0+2*ldb];
6445 		CC[0+bs*2] += a_0 * b_0;
6446 		CC[1+bs*2] += a_1 * b_0;
6447 		CC[2+bs*2] += a_2 * b_0;
6448 		CC[3+bs*2] += a_3 * b_0;
6449 
6450 		A += bs;
6451 		B += 1;
6452 //		k += 1;
6453 
6454 		// k = 2
6455 
6456 		a_0 = A[0];
6457 		a_1 = A[1];
6458 		a_2 = A[2];
6459 		a_3 = A[3];
6460 
6461 		CC[0+bs*2] += a_0;
6462 		CC[1+bs*2] += a_1;
6463 		CC[2+bs*2] += a_2;
6464 		CC[3+bs*2] += a_3;
6465 
6466 		A += bs;
6467 		B += 1;
6468 //		k += 1;
6469 
6470 		// k = 3
6471 
6472 		A += bs;
6473 		B += 1;
6474 //		k += 1;
6475 
6476 		}
6477 	else if(m1==2)
6478 		{
6479 
6480 		// k = 0
6481 
6482 		a_0 = A[0];
6483 		a_1 = A[1];
6484 		a_2 = A[2];
6485 		a_3 = A[3];
6486 
6487 		CC[0+bs*0] += a_0;
6488 		CC[1+bs*0] += a_1;
6489 		CC[2+bs*0] += a_2;
6490 		CC[3+bs*0] += a_3;
6491 
6492 		b_0 = B[0+1*ldb];
6493 		CC[0+bs*1] += a_0 * b_0;
6494 		CC[1+bs*1] += a_1 * b_0;
6495 		CC[2+bs*1] += a_2 * b_0;
6496 		CC[3+bs*1] += a_3 * b_0;
6497 
6498 		A += bs;
6499 		B += 1;
6500 //		k += 1;
6501 
6502 		// k = 1
6503 
6504 		a_0 = A[0];
6505 		a_1 = A[1];
6506 		a_2 = A[2];
6507 		a_3 = A[3];
6508 
6509 		CC[0+bs*1] += a_0;
6510 		CC[1+bs*1] += a_1;
6511 		CC[2+bs*1] += a_2;
6512 		CC[3+bs*1] += a_3;
6513 
6514 		A += bs;
6515 		B += 1;
6516 //		k += 1;
6517 
6518 		// k = 2
6519 
6520 		A += bs;
6521 		B += 1;
6522 //		k += 1;
6523 
6524 		// k = 3
6525 
6526 		A += bs;
6527 		B += 1;
6528 //		k += 1;
6529 
6530 		}
6531 	else if(m1==1)
6532 		{
6533 
6534 		// k = 0
6535 
6536 		a_0 = A[0];
6537 		a_1 = A[1];
6538 		a_2 = A[2];
6539 		a_3 = A[3];
6540 
6541 		CC[0+bs*0] += a_0;
6542 		CC[1+bs*0] += a_1;
6543 		CC[2+bs*0] += a_2;
6544 		CC[3+bs*0] += a_3;
6545 
6546 		A += bs;
6547 		B += 1;
6548 //		k += 1;
6549 
6550 		// k = 1
6551 
6552 		A += bs;
6553 		B += 1;
6554 //		k += 1;
6555 
6556 		// k = 2
6557 
6558 		A += bs;
6559 		B += 1;
6560 //		k += 1;
6561 
6562 		// k = 3
6563 
6564 		A += bs;
6565 		B += 1;
6566 //		k += 1;
6567 
6568 		}
6569 
6570 	store:
6571 
6572 	// scale
6573 	CC[0+bs*0] = alpha[0]*CC[0+bs*0];
6574 	CC[1+bs*0] = alpha[0]*CC[1+bs*0];
6575 	CC[2+bs*0] = alpha[0]*CC[2+bs*0];
6576 	CC[3+bs*0] = alpha[0]*CC[3+bs*0];
6577 
6578 	CC[0+bs*1] = alpha[0]*CC[0+bs*1];
6579 	CC[1+bs*1] = alpha[0]*CC[1+bs*1];
6580 	CC[2+bs*1] = alpha[0]*CC[2+bs*1];
6581 	CC[3+bs*1] = alpha[0]*CC[3+bs*1];
6582 
6583 	CC[0+bs*2] = alpha[0]*CC[0+bs*2];
6584 	CC[1+bs*2] = alpha[0]*CC[1+bs*2];
6585 	CC[2+bs*2] = alpha[0]*CC[2+bs*2];
6586 	CC[3+bs*2] = alpha[0]*CC[3+bs*2];
6587 
6588 	CC[0+bs*3] = alpha[0]*CC[0+bs*3];
6589 	CC[1+bs*3] = alpha[0]*CC[1+bs*3];
6590 	CC[2+bs*3] = alpha[0]*CC[2+bs*3];
6591 	CC[3+bs*3] = alpha[0]*CC[3+bs*3];
6592 
6593 	// transpose
6594 	tmp = CC[1+bs*0]; CC[1+bs*0] = CC[0+bs*1]; CC[0+bs*1] = tmp;
6595 	tmp = CC[2+bs*0]; CC[2+bs*0] = CC[0+bs*2]; CC[0+bs*2] = tmp;
6596 	tmp = CC[3+bs*0]; CC[3+bs*0] = CC[0+bs*3]; CC[0+bs*3] = tmp;
6597 
6598 	tmp = CC[2+bs*1]; CC[2+bs*1] = CC[1+bs*2]; CC[1+bs*2] = tmp;
6599 	tmp = CC[3+bs*1]; CC[3+bs*1] = CC[1+bs*3]; CC[1+bs*3] = tmp;
6600 
6601 	tmp = CC[3+bs*2]; CC[3+bs*2] = CC[2+bs*3]; CC[2+bs*3] = tmp;
6602 
6603 	// store
6604 	if(m1>=4)
6605 		{
6606 		D[0+ldd*0] = CC[0+bs*0];
6607 		D[1+ldd*0] = CC[1+bs*0];
6608 		D[2+ldd*0] = CC[2+bs*0];
6609 		D[3+ldd*0] = CC[3+bs*0];
6610 
6611 		if(n1==1)
6612 			return;
6613 
6614 		D[0+ldd*1] = CC[0+bs*1];
6615 		D[1+ldd*1] = CC[1+bs*1];
6616 		D[2+ldd*1] = CC[2+bs*1];
6617 		D[3+ldd*1] = CC[3+bs*1];
6618 
6619 		if(n1==2)
6620 			return;
6621 
6622 		D[0+ldd*2] = CC[0+bs*2];
6623 		D[1+ldd*2] = CC[1+bs*2];
6624 		D[2+ldd*2] = CC[2+bs*2];
6625 		D[3+ldd*2] = CC[3+bs*2];
6626 
6627 		if(n1==3)
6628 			return;
6629 
6630 		D[0+ldd*3] = CC[0+bs*3];
6631 		D[1+ldd*3] = CC[1+bs*3];
6632 		D[2+ldd*3] = CC[2+bs*3];
6633 		D[3+ldd*3] = CC[3+bs*3];
6634 		}
6635 	else if(m1>=3)
6636 		{
6637 		D[0+ldd*0] = CC[0+bs*0];
6638 		D[1+ldd*0] = CC[1+bs*0];
6639 		D[2+ldd*0] = CC[2+bs*0];
6640 
6641 		if(n1==1)
6642 			return;
6643 
6644 		D[0+ldd*1] = CC[0+bs*1];
6645 		D[1+ldd*1] = CC[1+bs*1];
6646 		D[2+ldd*1] = CC[2+bs*1];
6647 
6648 		if(n1==2)
6649 			return;
6650 
6651 		D[0+ldd*2] = CC[0+bs*2];
6652 		D[1+ldd*2] = CC[1+bs*2];
6653 		D[2+ldd*2] = CC[2+bs*2];
6654 
6655 		if(n1==3)
6656 			return;
6657 
6658 		D[0+ldd*3] = CC[0+bs*3];
6659 		D[1+ldd*3] = CC[1+bs*3];
6660 		D[2+ldd*3] = CC[2+bs*3];
6661 		}
6662 	else if(m1>=2)
6663 		{
6664 		D[0+ldd*0] = CC[0+bs*0];
6665 		D[1+ldd*0] = CC[1+bs*0];
6666 
6667 		if(n1==1)
6668 			return;
6669 
6670 		D[0+ldd*1] = CC[0+bs*1];
6671 		D[1+ldd*1] = CC[1+bs*1];
6672 
6673 		if(n1==2)
6674 			return;
6675 
6676 		D[0+ldd*2] = CC[0+bs*2];
6677 		D[1+ldd*2] = CC[1+bs*2];
6678 
6679 		if(n1==3)
6680 			return;
6681 
6682 		D[0+ldd*3] = CC[0+bs*3];
6683 		D[1+ldd*3] = CC[1+bs*3];
6684 		}
6685 	else //if(m1>=1)
6686 		{
6687 		D[0+ldd*0] = CC[0+bs*0];
6688 
6689 		if(n1==1)
6690 			return;
6691 
6692 		D[0+ldd*1] = CC[0+bs*1];
6693 
6694 		if(n1==2)
6695 			return;
6696 
6697 		D[0+ldd*2] = CC[0+bs*2];
6698 
6699 		if(n1==3)
6700 			return;
6701 
6702 		D[0+ldd*3] = CC[0+bs*3];
6703 		}
6704 
6705 	return;
6706 
6707 	}
6708 #endif
6709 
6710 
6711 
6712 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
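// descriptive note (derived from the code below): this kernel computes the 4x4 block
// D = alpha * A * B^T + beta * C, where only the lower triangle of the trailing 4x4 block of B
// is referenced (non-unit diagonal); that triangular tail is handled explicitly after the dense
// kernel_dgemm_nt_4x4_lib4 call. A and B are panel-major (bs=4), C and D are column-major (ldc, ldd).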
6713 void kernel_dtrmm_nt_rl_4x4_lib44cc(int kmax, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd)
6714 	{
6715 
6716 	const int bs = 4;
6717 
6718 	double
6719 		a_0, a_1, a_2, a_3,
6720 		b_0, b_1, b_2, b_3;
6721 
6722 #if defined(TARGET_GENERIC)
6723 	double CC[16] = {0};
6724 #else
6725 	ALIGNED( double CC[16], 64 ) = {0};
6726 #endif
6727 
6728 	double alpha1 = 1.0;
6729 	double beta1  = 0.0;
6730 
6731 	// NOTE: this kernel assumes kmax>=4 (the caller must guarantee it)
6732 
6733 	kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, &beta1, CC, CC);
6734 
6735 	A += kmax*bs;
6736 	B += kmax*bs;
6737 
6738 	// k = 0
6739 
6740 	a_0 = A[0];
6741 	a_1 = A[1];
6742 	a_2 = A[2];
6743 	a_3 = A[3];
6744 
6745 	b_0 = B[0+0*bs];
6746 	CC[0+bs*0] += a_0 * b_0;
6747 	CC[1+bs*0] += a_1 * b_0;
6748 	CC[2+bs*0] += a_2 * b_0;
6749 	CC[3+bs*0] += a_3 * b_0;
6750 
6751 	b_0 = B[1+0*bs];
6752 	CC[0+bs*1] += a_0 * b_0;
6753 	CC[1+bs*1] += a_1 * b_0;
6754 	CC[2+bs*1] += a_2 * b_0;
6755 	CC[3+bs*1] += a_3 * b_0;
6756 
6757 	b_0 = B[2+0*bs];
6758 	CC[0+bs*2] += a_0 * b_0;
6759 	CC[1+bs*2] += a_1 * b_0;
6760 	CC[2+bs*2] += a_2 * b_0;
6761 	CC[3+bs*2] += a_3 * b_0;
6762 
6763 	b_0 = B[3+0*bs];
6764 	CC[0+bs*3] += a_0 * b_0;
6765 	CC[1+bs*3] += a_1 * b_0;
6766 	CC[2+bs*3] += a_2 * b_0;
6767 	CC[3+bs*3] += a_3 * b_0;
6768 
6769 	A += bs;
6770 	B += bs;
6771 //	k += 1;
6772 
6773 	// k = 1
6774 
6775 	a_0 = A[0];
6776 	a_1 = A[1];
6777 	a_2 = A[2];
6778 	a_3 = A[3];
6779 
6780 	b_0 = B[1+0*bs];
6781 	CC[0+bs*1] += a_0 * b_0;
6782 	CC[1+bs*1] += a_1 * b_0;
6783 	CC[2+bs*1] += a_2 * b_0;
6784 	CC[3+bs*1] += a_3 * b_0;
6785 
6786 	b_0 = B[2+0*bs];
6787 	CC[0+bs*2] += a_0 * b_0;
6788 	CC[1+bs*2] += a_1 * b_0;
6789 	CC[2+bs*2] += a_2 * b_0;
6790 	CC[3+bs*2] += a_3 * b_0;
6791 
6792 	b_0 = B[3+0*bs];
6793 	CC[0+bs*3] += a_0 * b_0;
6794 	CC[1+bs*3] += a_1 * b_0;
6795 	CC[2+bs*3] += a_2 * b_0;
6796 	CC[3+bs*3] += a_3 * b_0;
6797 
6798 	A += bs;
6799 	B += bs;
6800 //	k += 1;
6801 
6802 	// k = 2
6803 
6804 	a_0 = A[0];
6805 	a_1 = A[1];
6806 	a_2 = A[2];
6807 	a_3 = A[3];
6808 
6809 	b_0 = B[2+0*bs];
6810 	CC[0+bs*2] += a_0 * b_0;
6811 	CC[1+bs*2] += a_1 * b_0;
6812 	CC[2+bs*2] += a_2 * b_0;
6813 	CC[3+bs*2] += a_3 * b_0;
6814 
6815 	b_0 = B[3+0*bs];
6816 	CC[0+bs*3] += a_0 * b_0;
6817 	CC[1+bs*3] += a_1 * b_0;
6818 	CC[2+bs*3] += a_2 * b_0;
6819 	CC[3+bs*3] += a_3 * b_0;
6820 
6821 	A += bs;
6822 	B += bs;
6823 //	k += 1;
6824 
6825 	// k = 3
6826 
6827 	a_0 = A[0];
6828 	a_1 = A[1];
6829 	a_2 = A[2];
6830 	a_3 = A[3];
6831 
6832 	b_0 = B[3+0*bs];
6833 	CC[0+bs*3] += a_0 * b_0;
6834 	CC[1+bs*3] += a_1 * b_0;
6835 	CC[2+bs*3] += a_2 * b_0;
6836 	CC[3+bs*3] += a_3 * b_0;
6837 
6838 	A += bs;
6839 	B += bs;
6840 //	k += 1;
6841 
6842 	store:
6843 
6844 	D[0+ldd*0] = alpha[0]*CC[0+bs*0] + beta[0]*C[0+ldc*0];
6845 	D[1+ldd*0] = alpha[0]*CC[1+bs*0] + beta[0]*C[1+ldc*0];
6846 	D[2+ldd*0] = alpha[0]*CC[2+bs*0] + beta[0]*C[2+ldc*0];
6847 	D[3+ldd*0] = alpha[0]*CC[3+bs*0] + beta[0]*C[3+ldc*0];
6848 
6849 	D[0+ldd*1] = alpha[0]*CC[0+bs*1] + beta[0]*C[0+ldc*1];
6850 	D[1+ldd*1] = alpha[0]*CC[1+bs*1] + beta[0]*C[1+ldc*1];
6851 	D[2+ldd*1] = alpha[0]*CC[2+bs*1] + beta[0]*C[2+ldc*1];
6852 	D[3+ldd*1] = alpha[0]*CC[3+bs*1] + beta[0]*C[3+ldc*1];
6853 
6854 	D[0+ldd*2] = alpha[0]*CC[0+bs*2] + beta[0]*C[0+ldc*2];
6855 	D[1+ldd*2] = alpha[0]*CC[1+bs*2] + beta[0]*C[1+ldc*2];
6856 	D[2+ldd*2] = alpha[0]*CC[2+bs*2] + beta[0]*C[2+ldc*2];
6857 	D[3+ldd*2] = alpha[0]*CC[3+bs*2] + beta[0]*C[3+ldc*2];
6858 
6859 	D[0+ldd*3] = alpha[0]*CC[0+bs*3] + beta[0]*C[0+ldc*3];
6860 	D[1+ldd*3] = alpha[0]*CC[1+bs*3] + beta[0]*C[1+ldc*3];
6861 	D[2+ldd*3] = alpha[0]*CC[2+bs*3] + beta[0]*C[2+ldc*3];
6862 	D[3+ldd*3] = alpha[0]*CC[3+bs*3] + beta[0]*C[3+ldc*3];
6863 
6864 	return;
6865 
6866 	}
6867 #endif
6868 
6869 
6870 
6871 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
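// descriptive note (derived from the code below): variable-size variant of
// kernel_dtrmm_nt_rl_4x4_lib44cc; the triangular tail is masked by n1 and only the
// top-left m1 x n1 corner of the 4x4 block is stored to D.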
6872 void kernel_dtrmm_nt_rl_4x4_vs_lib44cc(int kmax, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1)
6873 	{
6874 
6875 	const int bs = 4;
6876 
6877 	double
6878 		a_0, a_1, a_2, a_3,
6879 		b_0, b_1, b_2, b_3;
6880 
6881 #if defined(TARGET_GENERIC)
6882 	double CC[16] = {0};
6883 #else
6884 	ALIGNED( double CC[16], 64 ) = {0};
6885 #endif
6886 
6887 	double alpha1 = 1.0;
6888 	double beta1  = 0.0;
6889 
6890 	kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, &beta1, CC, CC);
6891 
6892 	A += kmax*bs;
6893 	B += kmax*bs;
6894 
6895 	if(n1>=4)
6896 		{
6897 
6898 		// k = 0
6899 
6900 		a_0 = A[0];
6901 		a_1 = A[1];
6902 		a_2 = A[2];
6903 		a_3 = A[3];
6904 
6905 		b_0 = B[0+0*bs];
6906 		CC[0+bs*0] += a_0 * b_0;
6907 		CC[1+bs*0] += a_1 * b_0;
6908 		CC[2+bs*0] += a_2 * b_0;
6909 		CC[3+bs*0] += a_3 * b_0;
6910 
6911 		b_0 = B[1+0*bs];
6912 		CC[0+bs*1] += a_0 * b_0;
6913 		CC[1+bs*1] += a_1 * b_0;
6914 		CC[2+bs*1] += a_2 * b_0;
6915 		CC[3+bs*1] += a_3 * b_0;
6916 
6917 		b_0 = B[2+0*bs];
6918 		CC[0+bs*2] += a_0 * b_0;
6919 		CC[1+bs*2] += a_1 * b_0;
6920 		CC[2+bs*2] += a_2 * b_0;
6921 		CC[3+bs*2] += a_3 * b_0;
6922 
6923 		b_0 = B[3+0*bs];
6924 		CC[0+bs*3] += a_0 * b_0;
6925 		CC[1+bs*3] += a_1 * b_0;
6926 		CC[2+bs*3] += a_2 * b_0;
6927 		CC[3+bs*3] += a_3 * b_0;
6928 
6929 		A += bs;
6930 		B += bs;
6931 //		k += 1;
6932 
6933 		// k = 1
6934 
6935 		a_0 = A[0];
6936 		a_1 = A[1];
6937 		a_2 = A[2];
6938 		a_3 = A[3];
6939 
6940 		b_0 = B[1+0*bs];
6941 		CC[0+bs*1] += a_0 * b_0;
6942 		CC[1+bs*1] += a_1 * b_0;
6943 		CC[2+bs*1] += a_2 * b_0;
6944 		CC[3+bs*1] += a_3 * b_0;
6945 
6946 		b_0 = B[2+0*bs];
6947 		CC[0+bs*2] += a_0 * b_0;
6948 		CC[1+bs*2] += a_1 * b_0;
6949 		CC[2+bs*2] += a_2 * b_0;
6950 		CC[3+bs*2] += a_3 * b_0;
6951 
6952 		b_0 = B[3+0*bs];
6953 		CC[0+bs*3] += a_0 * b_0;
6954 		CC[1+bs*3] += a_1 * b_0;
6955 		CC[2+bs*3] += a_2 * b_0;
6956 		CC[3+bs*3] += a_3 * b_0;
6957 
6958 		A += bs;
6959 		B += bs;
6960 //		k += 1;
6961 
6962 		// k = 2
6963 
6964 		a_0 = A[0];
6965 		a_1 = A[1];
6966 		a_2 = A[2];
6967 		a_3 = A[3];
6968 
6969 		b_0 = B[2+0*bs];
6970 		CC[0+bs*2] += a_0 * b_0;
6971 		CC[1+bs*2] += a_1 * b_0;
6972 		CC[2+bs*2] += a_2 * b_0;
6973 		CC[3+bs*2] += a_3 * b_0;
6974 
6975 		b_0 = B[3+0*bs];
6976 		CC[0+bs*3] += a_0 * b_0;
6977 		CC[1+bs*3] += a_1 * b_0;
6978 		CC[2+bs*3] += a_2 * b_0;
6979 		CC[3+bs*3] += a_3 * b_0;
6980 
6981 		A += bs;
6982 		B += bs;
6983 //		k += 1;
6984 
6985 		// k = 3
6986 
6987 		a_0 = A[0];
6988 		a_1 = A[1];
6989 		a_2 = A[2];
6990 		a_3 = A[3];
6991 
6992 		b_0 = B[3+0*bs];
6993 		CC[0+bs*3] += a_0 * b_0;
6994 		CC[1+bs*3] += a_1 * b_0;
6995 		CC[2+bs*3] += a_2 * b_0;
6996 		CC[3+bs*3] += a_3 * b_0;
6997 
6998 		A += bs;
6999 		B += bs;
7000 //		k += 1;
7001 
7002 		}
7003 	else if(n1==3)
7004 		{
7005 
7006 		// k = 0
7007 
7008 		a_0 = A[0];
7009 		a_1 = A[1];
7010 		a_2 = A[2];
7011 		a_3 = A[3];
7012 
7013 		b_0 = B[0+0*bs];
7014 		CC[0+bs*0] += a_0 * b_0;
7015 		CC[1+bs*0] += a_1 * b_0;
7016 		CC[2+bs*0] += a_2 * b_0;
7017 		CC[3+bs*0] += a_3 * b_0;
7018 
7019 		b_0 = B[1+0*bs];
7020 		CC[0+bs*1] += a_0 * b_0;
7021 		CC[1+bs*1] += a_1 * b_0;
7022 		CC[2+bs*1] += a_2 * b_0;
7023 		CC[3+bs*1] += a_3 * b_0;
7024 
7025 		b_0 = B[2+0*bs];
7026 		CC[0+bs*2] += a_0 * b_0;
7027 		CC[1+bs*2] += a_1 * b_0;
7028 		CC[2+bs*2] += a_2 * b_0;
7029 		CC[3+bs*2] += a_3 * b_0;
7030 
7031 		A += bs;
7032 		B += bs;
7033 //		k += 1;
7034 
7035 		// k = 1
7036 
7037 		a_0 = A[0];
7038 		a_1 = A[1];
7039 		a_2 = A[2];
7040 		a_3 = A[3];
7041 
7042 		b_0 = B[1+0*bs];
7043 		CC[0+bs*1] += a_0 * b_0;
7044 		CC[1+bs*1] += a_1 * b_0;
7045 		CC[2+bs*1] += a_2 * b_0;
7046 		CC[3+bs*1] += a_3 * b_0;
7047 
7048 		b_0 = B[2+0*bs];
7049 		CC[0+bs*2] += a_0 * b_0;
7050 		CC[1+bs*2] += a_1 * b_0;
7051 		CC[2+bs*2] += a_2 * b_0;
7052 		CC[3+bs*2] += a_3 * b_0;
7053 
7054 		A += bs;
7055 		B += bs;
7056 //		k += 1;
7057 
7058 		// k = 2
7059 
7060 		a_0 = A[0];
7061 		a_1 = A[1];
7062 		a_2 = A[2];
7063 		a_3 = A[3];
7064 
7065 		b_0 = B[2+0*bs];
7066 		CC[0+bs*2] += a_0 * b_0;
7067 		CC[1+bs*2] += a_1 * b_0;
7068 		CC[2+bs*2] += a_2 * b_0;
7069 		CC[3+bs*2] += a_3 * b_0;
7070 
7071 		A += bs;
7072 		B += bs;
7073 //		k += 1;
7074 
7075 		// k = 3
7076 
7077 		A += bs;
7078 		B += bs;
7079 //		k += 1;
7080 
7081 		}
7082 	else if(n1==2)
7083 		{
7084 
7085 		// k = 0
7086 
7087 		a_0 = A[0];
7088 		a_1 = A[1];
7089 		a_2 = A[2];
7090 		a_3 = A[3];
7091 
7092 		b_0 = B[0+0*bs];
7093 		CC[0+bs*0] += a_0 * b_0;
7094 		CC[1+bs*0] += a_1 * b_0;
7095 		CC[2+bs*0] += a_2 * b_0;
7096 		CC[3+bs*0] += a_3 * b_0;
7097 
7098 		b_0 = B[1+0*bs];
7099 		CC[0+bs*1] += a_0 * b_0;
7100 		CC[1+bs*1] += a_1 * b_0;
7101 		CC[2+bs*1] += a_2 * b_0;
7102 		CC[3+bs*1] += a_3 * b_0;
7103 
7104 		A += bs;
7105 		B += bs;
7106 //		k += 1;
7107 
7108 		// k = 1
7109 
7110 		a_0 = A[0];
7111 		a_1 = A[1];
7112 		a_2 = A[2];
7113 		a_3 = A[3];
7114 
7115 		b_0 = B[1+0*bs];
7116 		CC[0+bs*1] += a_0 * b_0;
7117 		CC[1+bs*1] += a_1 * b_0;
7118 		CC[2+bs*1] += a_2 * b_0;
7119 		CC[3+bs*1] += a_3 * b_0;
7120 
7121 		A += bs;
7122 		B += bs;
7123 //		k += 1;
7124 
7125 		// k = 2
7126 
7127 		A += bs;
7128 		B += bs;
7129 //		k += 1;
7130 
7131 		// k = 3
7132 
7133 		A += bs;
7134 		B += bs;
7135 //		k += 1;
7136 
7137 		}
7138 	else if(n1==1)
7139 		{
7140 
7141 		// k = 0
7142 
7143 		a_0 = A[0];
7144 		a_1 = A[1];
7145 		a_2 = A[2];
7146 		a_3 = A[3];
7147 
7148 		b_0 = B[0+0*bs];
7149 		CC[0+bs*0] += a_0 * b_0;
7150 		CC[1+bs*0] += a_1 * b_0;
7151 		CC[2+bs*0] += a_2 * b_0;
7152 		CC[3+bs*0] += a_3 * b_0;
7153 
7154 		A += bs;
7155 		B += bs;
7156 //		k += 1;
7157 
7158 		// k = 1
7159 
7160 		A += bs;
7161 		B += bs;
7162 //		k += 1;
7163 
7164 		// k = 2
7165 
7166 		A += bs;
7167 		B += bs;
7168 //		k += 1;
7169 
7170 		// k = 3
7171 
7172 		A += bs;
7173 		B += bs;
7174 //		k += 1;
7175 
7176 		}
7177 
7178 	store:
7179 
7180 	CC[0+bs*0] = alpha[0]*CC[0+bs*0] + beta[0]*C[0+ldc*0];
7181 	CC[1+bs*0] = alpha[0]*CC[1+bs*0] + beta[0]*C[1+ldc*0];
7182 	CC[2+bs*0] = alpha[0]*CC[2+bs*0] + beta[0]*C[2+ldc*0];
7183 	CC[3+bs*0] = alpha[0]*CC[3+bs*0] + beta[0]*C[3+ldc*0];
7184 
7185 	CC[0+bs*1] = alpha[0]*CC[0+bs*1] + beta[0]*C[0+ldc*1];
7186 	CC[1+bs*1] = alpha[0]*CC[1+bs*1] + beta[0]*C[1+ldc*1];
7187 	CC[2+bs*1] = alpha[0]*CC[2+bs*1] + beta[0]*C[2+ldc*1];
7188 	CC[3+bs*1] = alpha[0]*CC[3+bs*1] + beta[0]*C[3+ldc*1];
7189 
7190 	CC[0+bs*2] = alpha[0]*CC[0+bs*2] + beta[0]*C[0+ldc*2];
7191 	CC[1+bs*2] = alpha[0]*CC[1+bs*2] + beta[0]*C[1+ldc*2];
7192 	CC[2+bs*2] = alpha[0]*CC[2+bs*2] + beta[0]*C[2+ldc*2];
7193 	CC[3+bs*2] = alpha[0]*CC[3+bs*2] + beta[0]*C[3+ldc*2];
7194 
7195 	CC[0+bs*3] = alpha[0]*CC[0+bs*3] + beta[0]*C[0+ldc*3];
7196 	CC[1+bs*3] = alpha[0]*CC[1+bs*3] + beta[0]*C[1+ldc*3];
7197 	CC[2+bs*3] = alpha[0]*CC[2+bs*3] + beta[0]*C[2+ldc*3];
7198 	CC[3+bs*3] = alpha[0]*CC[3+bs*3] + beta[0]*C[3+ldc*3];
7199 
7200 	if(m1>=4)
7201 		{
7202 		D[0+ldd*0] = CC[0+bs*0];
7203 		D[1+ldd*0] = CC[1+bs*0];
7204 		D[2+ldd*0] = CC[2+bs*0];
7205 		D[3+ldd*0] = CC[3+bs*0];
7206 
7207 		if(n1==1)
7208 			return;
7209 
7210 		D[0+ldd*1] = CC[0+bs*1];
7211 		D[1+ldd*1] = CC[1+bs*1];
7212 		D[2+ldd*1] = CC[2+bs*1];
7213 		D[3+ldd*1] = CC[3+bs*1];
7214 
7215 		if(n1==2)
7216 			return;
7217 
7218 		D[0+ldd*2] = CC[0+bs*2];
7219 		D[1+ldd*2] = CC[1+bs*2];
7220 		D[2+ldd*2] = CC[2+bs*2];
7221 		D[3+ldd*2] = CC[3+bs*2];
7222 
7223 		if(n1==3)
7224 			return;
7225 
7226 		D[0+ldd*3] = CC[0+bs*3];
7227 		D[1+ldd*3] = CC[1+bs*3];
7228 		D[2+ldd*3] = CC[2+bs*3];
7229 		D[3+ldd*3] = CC[3+bs*3];
7230 		}
7231 	else if(m1>=3)
7232 		{
7233 		D[0+ldd*0] = CC[0+bs*0];
7234 		D[1+ldd*0] = CC[1+bs*0];
7235 		D[2+ldd*0] = CC[2+bs*0];
7236 
7237 		if(n1==1)
7238 			return;
7239 
7240 		D[0+ldd*1] = CC[0+bs*1];
7241 		D[1+ldd*1] = CC[1+bs*1];
7242 		D[2+ldd*1] = CC[2+bs*1];
7243 
7244 		if(n1==2)
7245 			return;
7246 
7247 		D[0+ldd*2] = CC[0+bs*2];
7248 		D[1+ldd*2] = CC[1+bs*2];
7249 		D[2+ldd*2] = CC[2+bs*2];
7250 
7251 		if(n1==3)
7252 			return;
7253 
7254 		D[0+ldd*3] = CC[0+bs*3];
7255 		D[1+ldd*3] = CC[1+bs*3];
7256 		D[2+ldd*3] = CC[2+bs*3];
7257 		}
7258 	else if(m1>=2)
7259 		{
7260 		D[0+ldd*0] = CC[0+bs*0];
7261 		D[1+ldd*0] = CC[1+bs*0];
7262 
7263 		if(n1==1)
7264 			return;
7265 
7266 		D[0+ldd*1] = CC[0+bs*1];
7267 		D[1+ldd*1] = CC[1+bs*1];
7268 
7269 		if(n1==2)
7270 			return;
7271 
7272 		D[0+ldd*2] = CC[0+bs*2];
7273 		D[1+ldd*2] = CC[1+bs*2];
7274 
7275 		if(n1==3)
7276 			return;
7277 
7278 		D[0+ldd*3] = CC[0+bs*3];
7279 		D[1+ldd*3] = CC[1+bs*3];
7280 		}
7281 	else //if(m1>=1)
7282 		{
7283 		D[0+ldd*0] = CC[0+bs*0];
7284 
7285 		if(n1==1)
7286 			return;
7287 
7288 		D[0+ldd*1] = CC[0+bs*1];
7289 
7290 		if(n1==2)
7291 			return;
7292 
7293 		D[0+ldd*2] = CC[0+bs*2];
7294 
7295 		if(n1==3)
7296 			return;
7297 
7298 		D[0+ldd*3] = CC[0+bs*3];
7299 		}
7300 
7301 	return;
7302 
7303 	}
7304 #endif
7305 
7306 
7307 
7308 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
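// descriptive note (derived from the code below): transposed-store variant; the 4x4 block
// alpha * A * B^T + beta * C (lower-triangular, non-unit-diagonal tail of B handled explicitly)
// is computed with A, B and C in panel-major format (bs=4) and stored transposed into the
// column-major D (leading dimension ldd).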
7309 void kernel_dtrmm_nt_rl_4x4_tran_lib444c(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int ldd)
7310 	{
7311 
7312 	const int bs = 4;
7313 
7314 	double
7315 		tmp,
7316 		a_0, a_1, a_2, a_3,
7317 		b_0, b_1, b_2, b_3;
7318 
7319 #if defined(TARGET_GENERIC)
7320 	double CC[16] = {0};
7321 #else
7322 	ALIGNED( double CC[16], 64 ) = {0};
7323 #endif
7324 
7325 	double alpha1 = 1.0;
7326 	double beta1  = 0.0;
7327 
7328 	// NOTE: this kernel assumes kmax>=4 (the caller must guarantee it)
7329 
7330 	kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, &beta1, CC, CC);
7331 
7332 	A += kmax*bs;
7333 	B += kmax*bs;
7334 
7335 	// k = 0
7336 
7337 	a_0 = A[0];
7338 	a_1 = A[1];
7339 	a_2 = A[2];
7340 	a_3 = A[3];
7341 
7342 	b_0 = B[0+0*bs];
7343 	CC[0+bs*0] += a_0 * b_0;
7344 	CC[1+bs*0] += a_1 * b_0;
7345 	CC[2+bs*0] += a_2 * b_0;
7346 	CC[3+bs*0] += a_3 * b_0;
7347 
7348 	b_0 = B[1+0*bs];
7349 	CC[0+bs*1] += a_0 * b_0;
7350 	CC[1+bs*1] += a_1 * b_0;
7351 	CC[2+bs*1] += a_2 * b_0;
7352 	CC[3+bs*1] += a_3 * b_0;
7353 
7354 	b_0 = B[2+0*bs];
7355 	CC[0+bs*2] += a_0 * b_0;
7356 	CC[1+bs*2] += a_1 * b_0;
7357 	CC[2+bs*2] += a_2 * b_0;
7358 	CC[3+bs*2] += a_3 * b_0;
7359 
7360 	b_0 = B[3+0*bs];
7361 	CC[0+bs*3] += a_0 * b_0;
7362 	CC[1+bs*3] += a_1 * b_0;
7363 	CC[2+bs*3] += a_2 * b_0;
7364 	CC[3+bs*3] += a_3 * b_0;
7365 
7366 	A += bs;
7367 	B += bs;
7368 //	k += 1;
7369 
7370 	// k = 1
7371 
7372 	a_0 = A[0];
7373 	a_1 = A[1];
7374 	a_2 = A[2];
7375 	a_3 = A[3];
7376 
7377 	b_0 = B[1+0*bs];
7378 	CC[0+bs*1] += a_0 * b_0;
7379 	CC[1+bs*1] += a_1 * b_0;
7380 	CC[2+bs*1] += a_2 * b_0;
7381 	CC[3+bs*1] += a_3 * b_0;
7382 
7383 	b_0 = B[2+0*bs];
7384 	CC[0+bs*2] += a_0 * b_0;
7385 	CC[1+bs*2] += a_1 * b_0;
7386 	CC[2+bs*2] += a_2 * b_0;
7387 	CC[3+bs*2] += a_3 * b_0;
7388 
7389 	b_0 = B[3+0*bs];
7390 	CC[0+bs*3] += a_0 * b_0;
7391 	CC[1+bs*3] += a_1 * b_0;
7392 	CC[2+bs*3] += a_2 * b_0;
7393 	CC[3+bs*3] += a_3 * b_0;
7394 
7395 	A += bs;
7396 	B += bs;
7397 //	k += 1;
7398 
7399 	// k = 2
7400 
7401 	a_0 = A[0];
7402 	a_1 = A[1];
7403 	a_2 = A[2];
7404 	a_3 = A[3];
7405 
7406 	b_0 = B[2+0*bs];
7407 	CC[0+bs*2] += a_0 * b_0;
7408 	CC[1+bs*2] += a_1 * b_0;
7409 	CC[2+bs*2] += a_2 * b_0;
7410 	CC[3+bs*2] += a_3 * b_0;
7411 
7412 	b_0 = B[3+0*bs];
7413 	CC[0+bs*3] += a_0 * b_0;
7414 	CC[1+bs*3] += a_1 * b_0;
7415 	CC[2+bs*3] += a_2 * b_0;
7416 	CC[3+bs*3] += a_3 * b_0;
7417 
7418 	A += bs;
7419 	B += bs;
7420 //	k += 1;
7421 
7422 	// k = 3
7423 
7424 	a_0 = A[0];
7425 	a_1 = A[1];
7426 	a_2 = A[2];
7427 	a_3 = A[3];
7428 
7429 	b_0 = B[3+0*bs];
7430 	CC[0+bs*3] += a_0 * b_0;
7431 	CC[1+bs*3] += a_1 * b_0;
7432 	CC[2+bs*3] += a_2 * b_0;
7433 	CC[3+bs*3] += a_3 * b_0;
7434 
7435 	A += bs;
7436 	B += bs;
7437 //	k += 1;
7438 
7439 	store:
7440 
7441 	// scale & transpose & store
7442 	D[0+ldd*0] = alpha[0]*CC[0+bs*0] + beta[0]*C[0+bs*0];
7443 	D[0+ldd*1] = alpha[0]*CC[1+bs*0] + beta[0]*C[1+bs*0];
7444 	D[0+ldd*2] = alpha[0]*CC[2+bs*0] + beta[0]*C[2+bs*0];
7445 	D[0+ldd*3] = alpha[0]*CC[3+bs*0] + beta[0]*C[3+bs*0];
7446 
7447 	D[1+ldd*0] = alpha[0]*CC[0+bs*1] + beta[0]*C[0+bs*1];
7448 	D[1+ldd*1] = alpha[0]*CC[1+bs*1] + beta[0]*C[1+bs*1];
7449 	D[1+ldd*2] = alpha[0]*CC[2+bs*1] + beta[0]*C[2+bs*1];
7450 	D[1+ldd*3] = alpha[0]*CC[3+bs*1] + beta[0]*C[3+bs*1];
7451 
7452 	D[2+ldd*0] = alpha[0]*CC[0+bs*2] + beta[0]*C[0+bs*2];
7453 	D[2+ldd*1] = alpha[0]*CC[1+bs*2] + beta[0]*C[1+bs*2];
7454 	D[2+ldd*2] = alpha[0]*CC[2+bs*2] + beta[0]*C[2+bs*2];
7455 	D[2+ldd*3] = alpha[0]*CC[3+bs*2] + beta[0]*C[3+bs*2];
7456 
7457 	D[3+ldd*0] = alpha[0]*CC[0+bs*3] + beta[0]*C[0+bs*3];
7458 	D[3+ldd*1] = alpha[0]*CC[1+bs*3] + beta[0]*C[1+bs*3];
7459 	D[3+ldd*2] = alpha[0]*CC[2+bs*3] + beta[0]*C[2+bs*3];
7460 	D[3+ldd*3] = alpha[0]*CC[3+bs*3] + beta[0]*C[3+bs*3];
7461 
7462 	return;
7463 
7464 	}
7465 #endif
7466 
7467 
7468 
7469 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
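// descriptive note (derived from the code below): variable-size variant of
// kernel_dtrmm_nt_rl_4x4_tran_lib444c; the triangular tail is masked by m1 (see the note
// inside the function) and only the top-left m1 x n1 corner of the transposed 4x4 block
// is stored to D.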
7470 void kernel_dtrmm_nt_rl_4x4_tran_vs_lib444c(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int ldd, int m1, int n1)
7471 	{
7472 
7473 	const int bs = 4;
7474 
7475 	double
7476 		tmp,
7477 		a_0, a_1, a_2, a_3,
7478 		b_0, b_1, b_2, b_3;
7479 
7480 #if defined(TARGET_GENERIC)
7481 	double CC[16] = {0};
7482 #else
7483 	ALIGNED( double CC[16], 64 ) = {0};
7484 #endif
7485 
7486 	double alpha1 = 1.0;
7487 	double beta1  = 0.0;
7488 
7489 	kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, &beta1, CC, CC);
7490 
7491 	A += kmax*bs;
7492 	B += kmax*bs;
7493 
7494 	// XXX m1 and n1 are swapped here: the result is transposed on store, so the triangular tail below is masked by m1 (rows of D, i.e. columns of CC) rather than by n1
7495 	if(m1>=4)
7496 		{
7497 
7498 		// k = 0
7499 
7500 		a_0 = A[0];
7501 		a_1 = A[1];
7502 		a_2 = A[2];
7503 		a_3 = A[3];
7504 
7505 		b_0 = B[0+0*bs];
7506 		CC[0+bs*0] += a_0 * b_0;
7507 		CC[1+bs*0] += a_1 * b_0;
7508 		CC[2+bs*0] += a_2 * b_0;
7509 		CC[3+bs*0] += a_3 * b_0;
7510 
7511 		b_0 = B[1+0*bs];
7512 		CC[0+bs*1] += a_0 * b_0;
7513 		CC[1+bs*1] += a_1 * b_0;
7514 		CC[2+bs*1] += a_2 * b_0;
7515 		CC[3+bs*1] += a_3 * b_0;
7516 
7517 		b_0 = B[2+0*bs];
7518 		CC[0+bs*2] += a_0 * b_0;
7519 		CC[1+bs*2] += a_1 * b_0;
7520 		CC[2+bs*2] += a_2 * b_0;
7521 		CC[3+bs*2] += a_3 * b_0;
7522 
7523 		b_0 = B[3+0*bs];
7524 		CC[0+bs*3] += a_0 * b_0;
7525 		CC[1+bs*3] += a_1 * b_0;
7526 		CC[2+bs*3] += a_2 * b_0;
7527 		CC[3+bs*3] += a_3 * b_0;
7528 
7529 		A += bs;
7530 		B += bs;
7531 //		k += 1;
7532 
7533 		// k = 1
7534 
7535 		a_0 = A[0];
7536 		a_1 = A[1];
7537 		a_2 = A[2];
7538 		a_3 = A[3];
7539 
7540 		b_0 = B[1+0*bs];
7541 		CC[0+bs*1] += a_0 * b_0;
7542 		CC[1+bs*1] += a_1 * b_0;
7543 		CC[2+bs*1] += a_2 * b_0;
7544 		CC[3+bs*1] += a_3 * b_0;
7545 
7546 		b_0 = B[2+0*bs];
7547 		CC[0+bs*2] += a_0 * b_0;
7548 		CC[1+bs*2] += a_1 * b_0;
7549 		CC[2+bs*2] += a_2 * b_0;
7550 		CC[3+bs*2] += a_3 * b_0;
7551 
7552 		b_0 = B[3+0*bs];
7553 		CC[0+bs*3] += a_0 * b_0;
7554 		CC[1+bs*3] += a_1 * b_0;
7555 		CC[2+bs*3] += a_2 * b_0;
7556 		CC[3+bs*3] += a_3 * b_0;
7557 
7558 		A += bs;
7559 		B += bs;
7560 //		k += 1;
7561 
7562 		// k = 2
7563 
7564 		a_0 = A[0];
7565 		a_1 = A[1];
7566 		a_2 = A[2];
7567 		a_3 = A[3];
7568 
7569 		b_0 = B[2+0*bs];
7570 		CC[0+bs*2] += a_0 * b_0;
7571 		CC[1+bs*2] += a_1 * b_0;
7572 		CC[2+bs*2] += a_2 * b_0;
7573 		CC[3+bs*2] += a_3 * b_0;
7574 
7575 		b_0 = B[3+0*bs];
7576 		CC[0+bs*3] += a_0 * b_0;
7577 		CC[1+bs*3] += a_1 * b_0;
7578 		CC[2+bs*3] += a_2 * b_0;
7579 		CC[3+bs*3] += a_3 * b_0;
7580 
7581 		A += bs;
7582 		B += bs;
7583 //		k += 1;
7584 
7585 		// k = 3
7586 
7587 		a_0 = A[0];
7588 		a_1 = A[1];
7589 		a_2 = A[2];
7590 		a_3 = A[3];
7591 
7592 		b_0 = B[3+0*bs];
7593 		CC[0+bs*3] += a_0 * b_0;
7594 		CC[1+bs*3] += a_1 * b_0;
7595 		CC[2+bs*3] += a_2 * b_0;
7596 		CC[3+bs*3] += a_3 * b_0;
7597 
7598 		A += bs;
7599 		B += bs;
7600 //		k += 1;
7601 
7602 		}
7603 	else if(m1==3)
7604 		{
7605 
7606 		// k = 0
7607 
7608 		a_0 = A[0];
7609 		a_1 = A[1];
7610 		a_2 = A[2];
7611 		a_3 = A[3];
7612 
7613 		b_0 = B[0+0*bs];
7614 		CC[0+bs*0] += a_0 * b_0;
7615 		CC[1+bs*0] += a_1 * b_0;
7616 		CC[2+bs*0] += a_2 * b_0;
7617 		CC[3+bs*0] += a_3 * b_0;
7618 
7619 		b_0 = B[1+0*bs];
7620 		CC[0+bs*1] += a_0 * b_0;
7621 		CC[1+bs*1] += a_1 * b_0;
7622 		CC[2+bs*1] += a_2 * b_0;
7623 		CC[3+bs*1] += a_3 * b_0;
7624 
7625 		b_0 = B[2+0*bs];
7626 		CC[0+bs*2] += a_0 * b_0;
7627 		CC[1+bs*2] += a_1 * b_0;
7628 		CC[2+bs*2] += a_2 * b_0;
7629 		CC[3+bs*2] += a_3 * b_0;
7630 
7631 		A += bs;
7632 		B += bs;
7633 //		k += 1;
7634 
7635 		// k = 1
7636 
7637 		a_0 = A[0];
7638 		a_1 = A[1];
7639 		a_2 = A[2];
7640 		a_3 = A[3];
7641 
7642 		b_0 = B[1+0*bs];
7643 		CC[0+bs*1] += a_0 * b_0;
7644 		CC[1+bs*1] += a_1 * b_0;
7645 		CC[2+bs*1] += a_2 * b_0;
7646 		CC[3+bs*1] += a_3 * b_0;
7647 
7648 		b_0 = B[2+0*bs];
7649 		CC[0+bs*2] += a_0 * b_0;
7650 		CC[1+bs*2] += a_1 * b_0;
7651 		CC[2+bs*2] += a_2 * b_0;
7652 		CC[3+bs*2] += a_3 * b_0;
7653 
7654 		A += bs;
7655 		B += bs;
7656 //		k += 1;
7657 
7658 		// k = 2
7659 
7660 		a_0 = A[0];
7661 		a_1 = A[1];
7662 		a_2 = A[2];
7663 		a_3 = A[3];
7664 
7665 		b_0 = B[2+0*bs];
7666 		CC[0+bs*2] += a_0 * b_0;
7667 		CC[1+bs*2] += a_1 * b_0;
7668 		CC[2+bs*2] += a_2 * b_0;
7669 		CC[3+bs*2] += a_3 * b_0;
7670 
7671 		A += bs;
7672 		B += bs;
7673 //		k += 1;
7674 
7675 		// k = 3
7676 
7677 		A += bs;
7678 		B += bs;
7679 //		k += 1;
7680 
7681 		}
7682 	else if(m1==2)
7683 		{
7684 
7685 		// k = 0
7686 
7687 		a_0 = A[0];
7688 		a_1 = A[1];
7689 		a_2 = A[2];
7690 		a_3 = A[3];
7691 
7692 		b_0 = B[0+0*bs];
7693 		CC[0+bs*0] += a_0 * b_0;
7694 		CC[1+bs*0] += a_1 * b_0;
7695 		CC[2+bs*0] += a_2 * b_0;
7696 		CC[3+bs*0] += a_3 * b_0;
7697 
7698 		b_0 = B[1+0*bs];
7699 		CC[0+bs*1] += a_0 * b_0;
7700 		CC[1+bs*1] += a_1 * b_0;
7701 		CC[2+bs*1] += a_2 * b_0;
7702 		CC[3+bs*1] += a_3 * b_0;
7703 
7704 		A += bs;
7705 		B += bs;
7706 //		k += 1;
7707 
7708 		// k = 1
7709 
7710 		a_0 = A[0];
7711 		a_1 = A[1];
7712 		a_2 = A[2];
7713 		a_3 = A[3];
7714 
7715 		b_0 = B[1+0*bs];
7716 		CC[0+bs*1] += a_0 * b_0;
7717 		CC[1+bs*1] += a_1 * b_0;
7718 		CC[2+bs*1] += a_2 * b_0;
7719 		CC[3+bs*1] += a_3 * b_0;
7720 
7721 		A += bs;
7722 		B += bs;
7723 //		k += 1;
7724 
7725 		// k = 2
7726 
7727 		A += bs;
7728 		B += bs;
7729 //		k += 1;
7730 
7731 		// k = 3
7732 
7733 		A += bs;
7734 		B += bs;
7735 //		k += 1;
7736 
7737 		}
7738 	else if(m1==1)
7739 		{
7740 
7741 		// k = 0
7742 
7743 		a_0 = A[0];
7744 		a_1 = A[1];
7745 		a_2 = A[2];
7746 		a_3 = A[3];
7747 
7748 		b_0 = B[0+0*bs];
7749 		CC[0+bs*0] += a_0 * b_0;
7750 		CC[1+bs*0] += a_1 * b_0;
7751 		CC[2+bs*0] += a_2 * b_0;
7752 		CC[3+bs*0] += a_3 * b_0;
7753 
7754 		A += bs;
7755 		B += bs;
7756 //		k += 1;
7757 
7758 		// k = 1
7759 
7760 		A += bs;
7761 		B += bs;
7762 //		k += 1;
7763 
7764 		// k = 2
7765 
7766 		A += bs;
7767 		B += bs;
7768 //		k += 1;
7769 
7770 		// k = 3
7771 
7772 		A += bs;
7773 		B += bs;
7774 //		k += 1;
7775 
7776 		}
7777 
7778 	store:
7779 
7780 	// scale
7781 	CC[0+bs*0] = alpha[0]*CC[0+bs*0] + beta[0]*C[0+bs*0];
7782 	CC[1+bs*0] = alpha[0]*CC[1+bs*0] + beta[0]*C[1+bs*0];
7783 	CC[2+bs*0] = alpha[0]*CC[2+bs*0] + beta[0]*C[2+bs*0];
7784 	CC[3+bs*0] = alpha[0]*CC[3+bs*0] + beta[0]*C[3+bs*0];
7785 
7786 	CC[0+bs*1] = alpha[0]*CC[0+bs*1] + beta[0]*C[0+bs*1];
7787 	CC[1+bs*1] = alpha[0]*CC[1+bs*1] + beta[0]*C[1+bs*1];
7788 	CC[2+bs*1] = alpha[0]*CC[2+bs*1] + beta[0]*C[2+bs*1];
7789 	CC[3+bs*1] = alpha[0]*CC[3+bs*1] + beta[0]*C[3+bs*1];
7790 
7791 	CC[0+bs*2] = alpha[0]*CC[0+bs*2] + beta[0]*C[0+bs*2];
7792 	CC[1+bs*2] = alpha[0]*CC[1+bs*2] + beta[0]*C[1+bs*2];
7793 	CC[2+bs*2] = alpha[0]*CC[2+bs*2] + beta[0]*C[2+bs*2];
7794 	CC[3+bs*2] = alpha[0]*CC[3+bs*2] + beta[0]*C[3+bs*2];
7795 
7796 	CC[0+bs*3] = alpha[0]*CC[0+bs*3] + beta[0]*C[0+bs*3];
7797 	CC[1+bs*3] = alpha[0]*CC[1+bs*3] + beta[0]*C[1+bs*3];
7798 	CC[2+bs*3] = alpha[0]*CC[2+bs*3] + beta[0]*C[2+bs*3];
7799 	CC[3+bs*3] = alpha[0]*CC[3+bs*3] + beta[0]*C[3+bs*3];
7800 
7801 	// transpose
7802 	tmp = CC[1+bs*0]; CC[1+bs*0] = CC[0+bs*1]; CC[0+bs*1] = tmp;
7803 	tmp = CC[2+bs*0]; CC[2+bs*0] = CC[0+bs*2]; CC[0+bs*2] = tmp;
7804 	tmp = CC[3+bs*0]; CC[3+bs*0] = CC[0+bs*3]; CC[0+bs*3] = tmp;
7805 
7806 	tmp = CC[2+bs*1]; CC[2+bs*1] = CC[1+bs*2]; CC[1+bs*2] = tmp;
7807 	tmp = CC[3+bs*1]; CC[3+bs*1] = CC[1+bs*3]; CC[1+bs*3] = tmp;
7808 
7809 	tmp = CC[3+bs*2]; CC[3+bs*2] = CC[2+bs*3]; CC[2+bs*3] = tmp;
7810 
7811 	// store
7812 	if(m1>=4)
7813 		{
7814 		D[0+ldd*0] = CC[0+bs*0];
7815 		D[1+ldd*0] = CC[1+bs*0];
7816 		D[2+ldd*0] = CC[2+bs*0];
7817 		D[3+ldd*0] = CC[3+bs*0];
7818 
7819 		if(n1==1)
7820 			return;
7821 
7822 		D[0+ldd*1] = CC[0+bs*1];
7823 		D[1+ldd*1] = CC[1+bs*1];
7824 		D[2+ldd*1] = CC[2+bs*1];
7825 		D[3+ldd*1] = CC[3+bs*1];
7826 
7827 		if(n1==2)
7828 			return;
7829 
7830 		D[0+ldd*2] = CC[0+bs*2];
7831 		D[1+ldd*2] = CC[1+bs*2];
7832 		D[2+ldd*2] = CC[2+bs*2];
7833 		D[3+ldd*2] = CC[3+bs*2];
7834 
7835 		if(n1==3)
7836 			return;
7837 
7838 		D[0+ldd*3] = CC[0+bs*3];
7839 		D[1+ldd*3] = CC[1+bs*3];
7840 		D[2+ldd*3] = CC[2+bs*3];
7841 		D[3+ldd*3] = CC[3+bs*3];
7842 		}
7843 	else if(m1>=3)
7844 		{
7845 		D[0+ldd*0] = CC[0+bs*0];
7846 		D[1+ldd*0] = CC[1+bs*0];
7847 		D[2+ldd*0] = CC[2+bs*0];
7848 
7849 		if(n1==1)
7850 			return;
7851 
7852 		D[0+ldd*1] = CC[0+bs*1];
7853 		D[1+ldd*1] = CC[1+bs*1];
7854 		D[2+ldd*1] = CC[2+bs*1];
7855 
7856 		if(n1==2)
7857 			return;
7858 
7859 		D[0+ldd*2] = CC[0+bs*2];
7860 		D[1+ldd*2] = CC[1+bs*2];
7861 		D[2+ldd*2] = CC[2+bs*2];
7862 
7863 		if(n1==3)
7864 			return;
7865 
7866 		D[0+ldd*3] = CC[0+bs*3];
7867 		D[1+ldd*3] = CC[1+bs*3];
7868 		D[2+ldd*3] = CC[2+bs*3];
7869 		}
7870 	else if(m1>=2)
7871 		{
7872 		D[0+ldd*0] = CC[0+bs*0];
7873 		D[1+ldd*0] = CC[1+bs*0];
7874 
7875 		if(n1==1)
7876 			return;
7877 
7878 		D[0+ldd*1] = CC[0+bs*1];
7879 		D[1+ldd*1] = CC[1+bs*1];
7880 
7881 		if(n1==2)
7882 			return;
7883 
7884 		D[0+ldd*2] = CC[0+bs*2];
7885 		D[1+ldd*2] = CC[1+bs*2];
7886 
7887 		if(n1==3)
7888 			return;
7889 
7890 		D[0+ldd*3] = CC[0+bs*3];
7891 		D[1+ldd*3] = CC[1+bs*3];
7892 		}
7893 	else //if(m1>=1)
7894 		{
7895 		D[0+ldd*0] = CC[0+bs*0];
7896 
7897 		if(n1==1)
7898 			return;
7899 
7900 		D[0+ldd*1] = CC[0+bs*1];
7901 
7902 		if(n1==2)
7903 			return;
7904 
7905 		D[0+ldd*2] = CC[0+bs*2];
7906 
7907 		if(n1==3)
7908 			return;
7909 
7910 		D[0+ldd*3] = CC[0+bs*3];
7911 		}
7912 
7913 	return;
7914 
7915 	}
7916 #endif
7917 
7918 
7919 
7920 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
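// descriptive note (derived from the code below): this kernel computes the 4x4 block
// D = alpha * A * B^T + beta * C, where the trailing 4x4 block of B is lower triangular with
// unit diagonal (its diagonal is not read); that triangular tail is handled explicitly after
// the dense kernel_dgemm_nt_4x4_lib4 call. A and B are panel-major (bs=4), C and D are
// column-major (ldc, ldd).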
7921 void kernel_dtrmm_nt_rl_one_4x4_lib44cc(int kmax, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd)
7922 	{
7923 
7924 	const int bs = 4;
7925 
7926 	double
7927 		a_0, a_1, a_2, a_3,
7928 		b_0, b_1, b_2, b_3;
7929 
7930 #if defined(TARGET_GENERIC)
7931 	double CC[16] = {0};
7932 #else
7933 	ALIGNED( double CC[16], 64 ) = {0};
7934 #endif
7935 
7936 	double alpha1 = 1.0;
7937 	double beta1  = 0.0;
7938 
7939 	// NOTE: this kernel assumes kmax>=4 (the caller must guarantee it)
7940 
7941 	kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, &beta1, CC, CC);
7942 
7943 	A += kmax*bs;
7944 	B += kmax*bs;
7945 
7946 	// k = 0
7947 
7948 	a_0 = A[0];
7949 	a_1 = A[1];
7950 	a_2 = A[2];
7951 	a_3 = A[3];
7952 
7953 	CC[0+bs*0] += a_0;
7954 	CC[1+bs*0] += a_1;
7955 	CC[2+bs*0] += a_2;
7956 	CC[3+bs*0] += a_3;
7957 
7958 	b_0 = B[1+0*bs];
7959 	CC[0+bs*1] += a_0 * b_0;
7960 	CC[1+bs*1] += a_1 * b_0;
7961 	CC[2+bs*1] += a_2 * b_0;
7962 	CC[3+bs*1] += a_3 * b_0;
7963 
7964 	b_0 = B[2+0*bs];
7965 	CC[0+bs*2] += a_0 * b_0;
7966 	CC[1+bs*2] += a_1 * b_0;
7967 	CC[2+bs*2] += a_2 * b_0;
7968 	CC[3+bs*2] += a_3 * b_0;
7969 
7970 	b_0 = B[3+0*bs];
7971 	CC[0+bs*3] += a_0 * b_0;
7972 	CC[1+bs*3] += a_1 * b_0;
7973 	CC[2+bs*3] += a_2 * b_0;
7974 	CC[3+bs*3] += a_3 * b_0;
7975 
7976 	A += bs;
7977 	B += bs;
7978 //	k += 1;
7979 
7980 	// k = 1
7981 
7982 	a_0 = A[0];
7983 	a_1 = A[1];
7984 	a_2 = A[2];
7985 	a_3 = A[3];
7986 
7987 	CC[0+bs*1] += a_0;
7988 	CC[1+bs*1] += a_1;
7989 	CC[2+bs*1] += a_2;
7990 	CC[3+bs*1] += a_3;
7991 
7992 	b_0 = B[2+0*bs];
7993 	CC[0+bs*2] += a_0 * b_0;
7994 	CC[1+bs*2] += a_1 * b_0;
7995 	CC[2+bs*2] += a_2 * b_0;
7996 	CC[3+bs*2] += a_3 * b_0;
7997 
7998 	b_0 = B[3+0*bs];
7999 	CC[0+bs*3] += a_0 * b_0;
8000 	CC[1+bs*3] += a_1 * b_0;
8001 	CC[2+bs*3] += a_2 * b_0;
8002 	CC[3+bs*3] += a_3 * b_0;
8003 
8004 	A += bs;
8005 	B += bs;
8006 //	k += 1;
8007 
8008 	// k = 2
8009 
8010 	a_0 = A[0];
8011 	a_1 = A[1];
8012 	a_2 = A[2];
8013 	a_3 = A[3];
8014 
8015 	CC[0+bs*2] += a_0;
8016 	CC[1+bs*2] += a_1;
8017 	CC[2+bs*2] += a_2;
8018 	CC[3+bs*2] += a_3;
8019 
8020 	b_0 = B[3+0*bs];
8021 	CC[0+bs*3] += a_0 * b_0;
8022 	CC[1+bs*3] += a_1 * b_0;
8023 	CC[2+bs*3] += a_2 * b_0;
8024 	CC[3+bs*3] += a_3 * b_0;
8025 
8026 	A += bs;
8027 	B += bs;
8028 //	k += 1;
8029 
8030 	// k = 3
8031 
8032 	a_0 = A[0];
8033 	a_1 = A[1];
8034 	a_2 = A[2];
8035 	a_3 = A[3];
8036 
8037 	CC[0+bs*3] += a_0;
8038 	CC[1+bs*3] += a_1;
8039 	CC[2+bs*3] += a_2;
8040 	CC[3+bs*3] += a_3;
8041 
8042 	A += bs;
8043 	B += bs;
8044 //	k += 1;
8045 
8046 	store:
8047 
8048 	D[0+ldd*0] = alpha[0]*CC[0+bs*0] + beta[0]*C[0+ldc*0];
8049 	D[1+ldd*0] = alpha[0]*CC[1+bs*0] + beta[0]*C[1+ldc*0];
8050 	D[2+ldd*0] = alpha[0]*CC[2+bs*0] + beta[0]*C[2+ldc*0];
8051 	D[3+ldd*0] = alpha[0]*CC[3+bs*0] + beta[0]*C[3+ldc*0];
8052 
8053 	D[0+ldd*1] = alpha[0]*CC[0+bs*1] + beta[0]*C[0+ldc*1];
8054 	D[1+ldd*1] = alpha[0]*CC[1+bs*1] + beta[0]*C[1+ldc*1];
8055 	D[2+ldd*1] = alpha[0]*CC[2+bs*1] + beta[0]*C[2+ldc*1];
8056 	D[3+ldd*1] = alpha[0]*CC[3+bs*1] + beta[0]*C[3+ldc*1];
8057 
8058 	D[0+ldd*2] = alpha[0]*CC[0+bs*2] + beta[0]*C[0+ldc*2];
8059 	D[1+ldd*2] = alpha[0]*CC[1+bs*2] + beta[0]*C[1+ldc*2];
8060 	D[2+ldd*2] = alpha[0]*CC[2+bs*2] + beta[0]*C[2+ldc*2];
8061 	D[3+ldd*2] = alpha[0]*CC[3+bs*2] + beta[0]*C[3+ldc*2];
8062 
8063 	D[0+ldd*3] = alpha[0]*CC[0+bs*3] + beta[0]*C[0+ldc*3];
8064 	D[1+ldd*3] = alpha[0]*CC[1+bs*3] + beta[0]*C[1+ldc*3];
8065 	D[2+ldd*3] = alpha[0]*CC[2+bs*3] + beta[0]*C[2+ldc*3];
8066 	D[3+ldd*3] = alpha[0]*CC[3+bs*3] + beta[0]*C[3+ldc*3];
8067 
8068 	return;
8069 
8070 	}
8071 #endif
8072 
8073 
8074 
8075 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
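// descriptive note (derived from the code below): variable-size variant of
// kernel_dtrmm_nt_rl_one_4x4_lib44cc; the unit-diagonal triangular tail is masked by n1
// and only the top-left m1 x n1 corner of the 4x4 block is stored to D.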
8076 void kernel_dtrmm_nt_rl_one_4x4_vs_lib44cc(int kmax, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1)
8077 	{
8078 
8079 	const int bs = 4;
8080 
8081 	double
8082 		a_0, a_1, a_2, a_3,
8083 		b_0, b_1, b_2, b_3;
8084 
8085 #if defined(TARGET_GENERIC)
8086 	double CC[16] = {0};
8087 #else
8088 	ALIGNED( double CC[16], 64 ) = {0};
8089 #endif
8090 
8091 	double alpha1 = 1.0;
8092 	double beta1  = 0.0;
8093 
8094 	kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, &beta1, CC, CC);
8095 
8096 	A += kmax*bs;
8097 	B += kmax*bs;
8098 
8099 	if(n1>=4)
8100 		{
8101 
8102 		// k = 0
8103 
8104 		a_0 = A[0];
8105 		a_1 = A[1];
8106 		a_2 = A[2];
8107 		a_3 = A[3];
8108 
8109 		CC[0+bs*0] += a_0;
8110 		CC[1+bs*0] += a_1;
8111 		CC[2+bs*0] += a_2;
8112 		CC[3+bs*0] += a_3;
8113 
8114 		b_0 = B[1+0*bs];
8115 		CC[0+bs*1] += a_0 * b_0;
8116 		CC[1+bs*1] += a_1 * b_0;
8117 		CC[2+bs*1] += a_2 * b_0;
8118 		CC[3+bs*1] += a_3 * b_0;
8119 
8120 		b_0 = B[2+0*bs];
8121 		CC[0+bs*2] += a_0 * b_0;
8122 		CC[1+bs*2] += a_1 * b_0;
8123 		CC[2+bs*2] += a_2 * b_0;
8124 		CC[3+bs*2] += a_3 * b_0;
8125 
8126 		b_0 = B[3+0*bs];
8127 		CC[0+bs*3] += a_0 * b_0;
8128 		CC[1+bs*3] += a_1 * b_0;
8129 		CC[2+bs*3] += a_2 * b_0;
8130 		CC[3+bs*3] += a_3 * b_0;
8131 
8132 		A += bs;
8133 		B += bs;
8134 //		k += 1;
8135 
8136 		// k = 1
8137 
8138 		a_0 = A[0];
8139 		a_1 = A[1];
8140 		a_2 = A[2];
8141 		a_3 = A[3];
8142 
8143 		CC[0+bs*1] += a_0;
8144 		CC[1+bs*1] += a_1;
8145 		CC[2+bs*1] += a_2;
8146 		CC[3+bs*1] += a_3;
8147 
8148 		b_0 = B[2+0*bs];
8149 		CC[0+bs*2] += a_0 * b_0;
8150 		CC[1+bs*2] += a_1 * b_0;
8151 		CC[2+bs*2] += a_2 * b_0;
8152 		CC[3+bs*2] += a_3 * b_0;
8153 
8154 		b_0 = B[3+0*bs];
8155 		CC[0+bs*3] += a_0 * b_0;
8156 		CC[1+bs*3] += a_1 * b_0;
8157 		CC[2+bs*3] += a_2 * b_0;
8158 		CC[3+bs*3] += a_3 * b_0;
8159 
8160 		A += bs;
8161 		B += bs;
8162 //		k += 1;
8163 
8164 		// k = 2
8165 
8166 		a_0 = A[0];
8167 		a_1 = A[1];
8168 		a_2 = A[2];
8169 		a_3 = A[3];
8170 
8171 		CC[0+bs*2] += a_0;
8172 		CC[1+bs*2] += a_1;
8173 		CC[2+bs*2] += a_2;
8174 		CC[3+bs*2] += a_3;
8175 
8176 		b_0 = B[3+0*bs];
8177 		CC[0+bs*3] += a_0 * b_0;
8178 		CC[1+bs*3] += a_1 * b_0;
8179 		CC[2+bs*3] += a_2 * b_0;
8180 		CC[3+bs*3] += a_3 * b_0;
8181 
8182 		A += bs;
8183 		B += bs;
8184 //		k += 1;
8185 
8186 		// k = 3
8187 
8188 		a_0 = A[0];
8189 		a_1 = A[1];
8190 		a_2 = A[2];
8191 		a_3 = A[3];
8192 
8193 		CC[0+bs*3] += a_0;
8194 		CC[1+bs*3] += a_1;
8195 		CC[2+bs*3] += a_2;
8196 		CC[3+bs*3] += a_3;
8197 
8198 		A += bs;
8199 		B += bs;
8200 //		k += 1;
8201 
8202 		}
8203 	else if(n1==3)
8204 		{
8205 
8206 		// k = 0
8207 
8208 		a_0 = A[0];
8209 		a_1 = A[1];
8210 		a_2 = A[2];
8211 		a_3 = A[3];
8212 
8213 		CC[0+bs*0] += a_0;
8214 		CC[1+bs*0] += a_1;
8215 		CC[2+bs*0] += a_2;
8216 		CC[3+bs*0] += a_3;
8217 
8218 		b_0 = B[1+0*bs];
8219 		CC[0+bs*1] += a_0 * b_0;
8220 		CC[1+bs*1] += a_1 * b_0;
8221 		CC[2+bs*1] += a_2 * b_0;
8222 		CC[3+bs*1] += a_3 * b_0;
8223 
8224 		b_0 = B[2+0*bs];
8225 		CC[0+bs*2] += a_0 * b_0;
8226 		CC[1+bs*2] += a_1 * b_0;
8227 		CC[2+bs*2] += a_2 * b_0;
8228 		CC[3+bs*2] += a_3 * b_0;
8229 
8230 		A += bs;
8231 		B += bs;
8232 //		k += 1;
8233 
8234 		// k = 1
8235 
8236 		a_0 = A[0];
8237 		a_1 = A[1];
8238 		a_2 = A[2];
8239 		a_3 = A[3];
8240 
8241 		CC[0+bs*1] += a_0;
8242 		CC[1+bs*1] += a_1;
8243 		CC[2+bs*1] += a_2;
8244 		CC[3+bs*1] += a_3;
8245 
8246 		b_0 = B[2+0*bs];
8247 		CC[0+bs*2] += a_0 * b_0;
8248 		CC[1+bs*2] += a_1 * b_0;
8249 		CC[2+bs*2] += a_2 * b_0;
8250 		CC[3+bs*2] += a_3 * b_0;
8251 
8252 		A += bs;
8253 		B += bs;
8254 //		k += 1;
8255 
8256 		// k = 2
8257 
8258 		a_0 = A[0];
8259 		a_1 = A[1];
8260 		a_2 = A[2];
8261 		a_3 = A[3];
8262 
8263 		CC[0+bs*2] += a_0;
8264 		CC[1+bs*2] += a_1;
8265 		CC[2+bs*2] += a_2;
8266 		CC[3+bs*2] += a_3;
8267 
8268 		A += bs;
8269 		B += bs;
8270 //		k += 1;
8271 
8272 		// k = 3
8273 
8274 		A += bs;
8275 		B += bs;
8276 //		k += 1;
8277 
8278 		}
8279 	else if(n1==2)
8280 		{
8281 
8282 		// k = 0
8283 
8284 		a_0 = A[0];
8285 		a_1 = A[1];
8286 		a_2 = A[2];
8287 		a_3 = A[3];
8288 
8289 		CC[0+bs*0] += a_0;
8290 		CC[1+bs*0] += a_1;
8291 		CC[2+bs*0] += a_2;
8292 		CC[3+bs*0] += a_3;
8293 
8294 		b_0 = B[1+0*bs];
8295 		CC[0+bs*1] += a_0 * b_0;
8296 		CC[1+bs*1] += a_1 * b_0;
8297 		CC[2+bs*1] += a_2 * b_0;
8298 		CC[3+bs*1] += a_3 * b_0;
8299 
8300 		A += bs;
8301 		B += bs;
8302 //		k += 1;
8303 
8304 		// k = 1
8305 
8306 		a_0 = A[0];
8307 		a_1 = A[1];
8308 		a_2 = A[2];
8309 		a_3 = A[3];
8310 
8311 		CC[0+bs*1] += a_0;
8312 		CC[1+bs*1] += a_1;
8313 		CC[2+bs*1] += a_2;
8314 		CC[3+bs*1] += a_3;
8315 
8316 		A += bs;
8317 		B += bs;
8318 //		k += 1;
8319 
8320 		// k = 2
8321 
8322 		A += bs;
8323 		B += bs;
8324 //		k += 1;
8325 
8326 		// k = 3
8327 
8328 		A += bs;
8329 		B += bs;
8330 //		k += 1;
8331 
8332 		}
8333 	else if(n1==1)
8334 		{
8335 
8336 		// k = 0
8337 
8338 		a_0 = A[0];
8339 		a_1 = A[1];
8340 		a_2 = A[2];
8341 		a_3 = A[3];
8342 
8343 		CC[0+bs*0] += a_0;
8344 		CC[1+bs*0] += a_1;
8345 		CC[2+bs*0] += a_2;
8346 		CC[3+bs*0] += a_3;
8347 
8348 		A += bs;
8349 		B += bs;
8350 //		k += 1;
8351 
8352 		// k = 1
8353 
8354 		A += bs;
8355 		B += bs;
8356 //		k += 1;
8357 
8358 		// k = 2
8359 
8360 		A += bs;
8361 		B += bs;
8362 //		k += 1;
8363 
8364 		// k = 3
8365 
8366 		A += bs;
8367 		B += bs;
8368 //		k += 1;
8369 
8370 		}
8371 
8372 	store:
8373 
8374 	CC[0+bs*0] = alpha[0]*CC[0+bs*0] + beta[0]*C[0+ldc*0];
8375 	CC[1+bs*0] = alpha[0]*CC[1+bs*0] + beta[0]*C[1+ldc*0];
8376 	CC[2+bs*0] = alpha[0]*CC[2+bs*0] + beta[0]*C[2+ldc*0];
8377 	CC[3+bs*0] = alpha[0]*CC[3+bs*0] + beta[0]*C[3+ldc*0];
8378 
8379 	CC[0+bs*1] = alpha[0]*CC[0+bs*1] + beta[0]*C[0+ldc*1];
8380 	CC[1+bs*1] = alpha[0]*CC[1+bs*1] + beta[0]*C[1+ldc*1];
8381 	CC[2+bs*1] = alpha[0]*CC[2+bs*1] + beta[0]*C[2+ldc*1];
8382 	CC[3+bs*1] = alpha[0]*CC[3+bs*1] + beta[0]*C[3+ldc*1];
8383 
8384 	CC[0+bs*2] = alpha[0]*CC[0+bs*2] + beta[0]*C[0+ldc*2];
8385 	CC[1+bs*2] = alpha[0]*CC[1+bs*2] + beta[0]*C[1+ldc*2];
8386 	CC[2+bs*2] = alpha[0]*CC[2+bs*2] + beta[0]*C[2+ldc*2];
8387 	CC[3+bs*2] = alpha[0]*CC[3+bs*2] + beta[0]*C[3+ldc*2];
8388 
8389 	CC[0+bs*3] = alpha[0]*CC[0+bs*3] + beta[0]*C[0+ldc*3];
8390 	CC[1+bs*3] = alpha[0]*CC[1+bs*3] + beta[0]*C[1+ldc*3];
8391 	CC[2+bs*3] = alpha[0]*CC[2+bs*3] + beta[0]*C[2+ldc*3];
8392 	CC[3+bs*3] = alpha[0]*CC[3+bs*3] + beta[0]*C[3+ldc*3];
8393 
8394 	if(m1>=4)
8395 		{
8396 		D[0+ldd*0] = CC[0+bs*0];
8397 		D[1+ldd*0] = CC[1+bs*0];
8398 		D[2+ldd*0] = CC[2+bs*0];
8399 		D[3+ldd*0] = CC[3+bs*0];
8400 
8401 		if(n1==1)
8402 			return;
8403 
8404 		D[0+ldd*1] = CC[0+bs*1];
8405 		D[1+ldd*1] = CC[1+bs*1];
8406 		D[2+ldd*1] = CC[2+bs*1];
8407 		D[3+ldd*1] = CC[3+bs*1];
8408 
8409 		if(n1==2)
8410 			return;
8411 
8412 		D[0+ldd*2] = CC[0+bs*2];
8413 		D[1+ldd*2] = CC[1+bs*2];
8414 		D[2+ldd*2] = CC[2+bs*2];
8415 		D[3+ldd*2] = CC[3+bs*2];
8416 
8417 		if(n1==3)
8418 			return;
8419 
8420 		D[0+ldd*3] = CC[0+bs*3];
8421 		D[1+ldd*3] = CC[1+bs*3];
8422 		D[2+ldd*3] = CC[2+bs*3];
8423 		D[3+ldd*3] = CC[3+bs*3];
8424 		}
8425 	else if(m1>=3)
8426 		{
8427 		D[0+ldd*0] = CC[0+bs*0];
8428 		D[1+ldd*0] = CC[1+bs*0];
8429 		D[2+ldd*0] = CC[2+bs*0];
8430 
8431 		if(n1==1)
8432 			return;
8433 
8434 		D[0+ldd*1] = CC[0+bs*1];
8435 		D[1+ldd*1] = CC[1+bs*1];
8436 		D[2+ldd*1] = CC[2+bs*1];
8437 
8438 		if(n1==2)
8439 			return;
8440 
8441 		D[0+ldd*2] = CC[0+bs*2];
8442 		D[1+ldd*2] = CC[1+bs*2];
8443 		D[2+ldd*2] = CC[2+bs*2];
8444 
8445 		if(n1==3)
8446 			return;
8447 
8448 		D[0+ldd*3] = CC[0+bs*3];
8449 		D[1+ldd*3] = CC[1+bs*3];
8450 		D[2+ldd*3] = CC[2+bs*3];
8451 		}
8452 	else if(m1>=2)
8453 		{
8454 		D[0+ldd*0] = CC[0+bs*0];
8455 		D[1+ldd*0] = CC[1+bs*0];
8456 
8457 		if(n1==1)
8458 			return;
8459 
8460 		D[0+ldd*1] = CC[0+bs*1];
8461 		D[1+ldd*1] = CC[1+bs*1];
8462 
8463 		if(n1==2)
8464 			return;
8465 
8466 		D[0+ldd*2] = CC[0+bs*2];
8467 		D[1+ldd*2] = CC[1+bs*2];
8468 
8469 		if(n1==3)
8470 			return;
8471 
8472 		D[0+ldd*3] = CC[0+bs*3];
8473 		D[1+ldd*3] = CC[1+bs*3];
8474 		}
8475 	else //if(m1>=1)
8476 		{
8477 		D[0+ldd*0] = CC[0+bs*0];
8478 
8479 		if(n1==1)
8480 			return;
8481 
8482 		D[0+ldd*1] = CC[0+bs*1];
8483 
8484 		if(n1==2)
8485 			return;
8486 
8487 		D[0+ldd*2] = CC[0+bs*2];
8488 
8489 		if(n1==3)
8490 			return;
8491 
8492 		D[0+ldd*3] = CC[0+bs*3];
8493 		}
8494 
8495 	return;
8496 
8497 	}
8498 #endif
8499 
8500 
8501 
8502 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
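// descriptive note (derived from the code below): transposed-store variant; the 4x4 block
// alpha * A * B^T + beta * C (unit-diagonal lower-triangular tail of B handled explicitly)
// is computed with A, B and C in panel-major format (bs=4) and stored transposed into the
// column-major D (leading dimension ldd).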
8503 void kernel_dtrmm_nt_rl_one_4x4_tran_lib444c(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int ldd)
8504 	{
8505 
8506 	const int bs = 4;
8507 
8508 	double
8509 		tmp,
8510 		a_0, a_1, a_2, a_3,
8511 		b_0, b_1, b_2, b_3;
8512 
8513 #if defined(TARGET_GENERIC)
8514 	double CC[16] = {0};
8515 #else
8516 	ALIGNED( double CC[16], 64 ) = {0};
8517 #endif
8518 
8519 	double alpha1 = 1.0;
8520 	double beta1  = 0.0;
8521 
8522 	// NOTE: this kernel assumes kmax>=4 (the caller must guarantee it)
8523 
8524 	kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, &beta1, CC, CC);
8525 
8526 	A += kmax*bs;
8527 	B += kmax*bs;
8528 
8529 	// k = 0
8530 
8531 	a_0 = A[0];
8532 	a_1 = A[1];
8533 	a_2 = A[2];
8534 	a_3 = A[3];
8535 
8536 	CC[0+bs*0] += a_0;
8537 	CC[1+bs*0] += a_1;
8538 	CC[2+bs*0] += a_2;
8539 	CC[3+bs*0] += a_3;
8540 
8541 	b_0 = B[1+0*bs];
8542 	CC[0+bs*1] += a_0 * b_0;
8543 	CC[1+bs*1] += a_1 * b_0;
8544 	CC[2+bs*1] += a_2 * b_0;
8545 	CC[3+bs*1] += a_3 * b_0;
8546 
8547 	b_0 = B[2+0*bs];
8548 	CC[0+bs*2] += a_0 * b_0;
8549 	CC[1+bs*2] += a_1 * b_0;
8550 	CC[2+bs*2] += a_2 * b_0;
8551 	CC[3+bs*2] += a_3 * b_0;
8552 
8553 	b_0 = B[3+0*bs];
8554 	CC[0+bs*3] += a_0 * b_0;
8555 	CC[1+bs*3] += a_1 * b_0;
8556 	CC[2+bs*3] += a_2 * b_0;
8557 	CC[3+bs*3] += a_3 * b_0;
8558 
8559 	A += bs;
8560 	B += bs;
8561 //	k += 1;
8562 
8563 	// k = 1
8564 
8565 	a_0 = A[0];
8566 	a_1 = A[1];
8567 	a_2 = A[2];
8568 	a_3 = A[3];
8569 
8570 	CC[0+bs*1] += a_0;
8571 	CC[1+bs*1] += a_1;
8572 	CC[2+bs*1] += a_2;
8573 	CC[3+bs*1] += a_3;
8574 
8575 	b_0 = B[2+0*bs];
8576 	CC[0+bs*2] += a_0 * b_0;
8577 	CC[1+bs*2] += a_1 * b_0;
8578 	CC[2+bs*2] += a_2 * b_0;
8579 	CC[3+bs*2] += a_3 * b_0;
8580 
8581 	b_0 = B[3+0*bs];
8582 	CC[0+bs*3] += a_0 * b_0;
8583 	CC[1+bs*3] += a_1 * b_0;
8584 	CC[2+bs*3] += a_2 * b_0;
8585 	CC[3+bs*3] += a_3 * b_0;
8586 
8587 	A += bs;
8588 	B += bs;
8589 //	k += 1;
8590 
8591 	// k = 2
8592 
8593 	a_0 = A[0];
8594 	a_1 = A[1];
8595 	a_2 = A[2];
8596 	a_3 = A[3];
8597 
8598 	CC[0+bs*2] += a_0;
8599 	CC[1+bs*2] += a_1;
8600 	CC[2+bs*2] += a_2;
8601 	CC[3+bs*2] += a_3;
8602 
8603 	b_0 = B[3+0*bs];
8604 	CC[0+bs*3] += a_0 * b_0;
8605 	CC[1+bs*3] += a_1 * b_0;
8606 	CC[2+bs*3] += a_2 * b_0;
8607 	CC[3+bs*3] += a_3 * b_0;
8608 
8609 	A += bs;
8610 	B += bs;
8611 //	k += 1;
8612 
8613 	// k = 3
8614 
8615 	a_0 = A[0];
8616 	a_1 = A[1];
8617 	a_2 = A[2];
8618 	a_3 = A[3];
8619 
8620 	CC[0+bs*3] += a_0;
8621 	CC[1+bs*3] += a_1;
8622 	CC[2+bs*3] += a_2;
8623 	CC[3+bs*3] += a_3;
8624 
8625 	A += bs;
8626 	B += bs;
8627 //	k += 1;
8628 
8629 	store:
8630 
8631 	// scale & transpose & store
8632 	D[0+ldd*0] = alpha[0]*CC[0+bs*0] + beta[0]*C[0+bs*0];
8633 	D[0+ldd*1] = alpha[0]*CC[1+bs*0] + beta[0]*C[1+bs*0];
8634 	D[0+ldd*2] = alpha[0]*CC[2+bs*0] + beta[0]*C[2+bs*0];
8635 	D[0+ldd*3] = alpha[0]*CC[3+bs*0] + beta[0]*C[3+bs*0];
8636 
8637 	D[1+ldd*0] = alpha[0]*CC[0+bs*1] + beta[0]*C[0+bs*1];
8638 	D[1+ldd*1] = alpha[0]*CC[1+bs*1] + beta[0]*C[1+bs*1];
8639 	D[1+ldd*2] = alpha[0]*CC[2+bs*1] + beta[0]*C[2+bs*1];
8640 	D[1+ldd*3] = alpha[0]*CC[3+bs*1] + beta[0]*C[3+bs*1];
8641 
8642 	D[2+ldd*0] = alpha[0]*CC[0+bs*2] + beta[0]*C[0+bs*2];
8643 	D[2+ldd*1] = alpha[0]*CC[1+bs*2] + beta[0]*C[1+bs*2];
8644 	D[2+ldd*2] = alpha[0]*CC[2+bs*2] + beta[0]*C[2+bs*2];
8645 	D[2+ldd*3] = alpha[0]*CC[3+bs*2] + beta[0]*C[3+bs*2];
8646 
8647 	D[3+ldd*0] = alpha[0]*CC[0+bs*3] + beta[0]*C[0+bs*3];
8648 	D[3+ldd*1] = alpha[0]*CC[1+bs*3] + beta[0]*C[1+bs*3];
8649 	D[3+ldd*2] = alpha[0]*CC[2+bs*3] + beta[0]*C[2+bs*3];
8650 	D[3+ldd*3] = alpha[0]*CC[3+bs*3] + beta[0]*C[3+bs*3];
8651 
8652 	return;
8653 
8654 	}
8655 #endif
8656 
8657 
8658 
8659 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
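// descriptive note (derived from the code below): variable-size variant of
// kernel_dtrmm_nt_rl_one_4x4_tran_lib444c; the unit-diagonal triangular tail is masked by m1
// (see the note inside the function) and only the top-left m1 x n1 corner of the transposed
// 4x4 block is stored to D.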
8660 void kernel_dtrmm_nt_rl_one_4x4_tran_vs_lib444c(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int ldd, int m1, int n1)
8661 	{
8662 
8663 	const int bs = 4;
8664 
8665 	double
8666 		tmp,
8667 		a_0, a_1, a_2, a_3,
8668 		b_0, b_1, b_2, b_3;
8669 
8670 #if defined(TARGET_GENERIC)
8671 	double CC[16] = {0};
8672 #else
8673 	ALIGNED( double CC[16], 64 ) = {0};
8674 #endif
8675 
8676 	double alpha1 = 1.0;
8677 	double beta1  = 0.0;
8678 
8679 	kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, &beta1, CC, CC);
8680 
8681 	A += kmax*bs;
8682 	B += kmax*bs;
8683 
8684 	// XXX m1 and n1 are swapped here: the result is transposed on store, so the triangular tail below is masked by m1 (rows of D, i.e. columns of CC) rather than by n1
8685 	if(m1>=4)
8686 		{
8687 
8688 		// k = 0
8689 
8690 		a_0 = A[0];
8691 		a_1 = A[1];
8692 		a_2 = A[2];
8693 		a_3 = A[3];
8694 
8695 		CC[0+bs*0] += a_0;
8696 		CC[1+bs*0] += a_1;
8697 		CC[2+bs*0] += a_2;
8698 		CC[3+bs*0] += a_3;
8699 
8700 		b_0 = B[1+0*bs];
8701 		CC[0+bs*1] += a_0 * b_0;
8702 		CC[1+bs*1] += a_1 * b_0;
8703 		CC[2+bs*1] += a_2 * b_0;
8704 		CC[3+bs*1] += a_3 * b_0;
8705 
8706 		b_0 = B[2+0*bs];
8707 		CC[0+bs*2] += a_0 * b_0;
8708 		CC[1+bs*2] += a_1 * b_0;
8709 		CC[2+bs*2] += a_2 * b_0;
8710 		CC[3+bs*2] += a_3 * b_0;
8711 
8712 		b_0 = B[3+0*bs];
8713 		CC[0+bs*3] += a_0 * b_0;
8714 		CC[1+bs*3] += a_1 * b_0;
8715 		CC[2+bs*3] += a_2 * b_0;
8716 		CC[3+bs*3] += a_3 * b_0;
8717 
8718 		A += bs;
8719 		B += bs;
8720 //		k += 1;
8721 
8722 		// k = 1
8723 
8724 		a_0 = A[0];
8725 		a_1 = A[1];
8726 		a_2 = A[2];
8727 		a_3 = A[3];
8728 
8729 		CC[0+bs*1] += a_0;
8730 		CC[1+bs*1] += a_1;
8731 		CC[2+bs*1] += a_2;
8732 		CC[3+bs*1] += a_3;
8733 
8734 		b_0 = B[2+0*bs];
8735 		CC[0+bs*2] += a_0 * b_0;
8736 		CC[1+bs*2] += a_1 * b_0;
8737 		CC[2+bs*2] += a_2 * b_0;
8738 		CC[3+bs*2] += a_3 * b_0;
8739 
8740 		b_0 = B[3+0*bs];
8741 		CC[0+bs*3] += a_0 * b_0;
8742 		CC[1+bs*3] += a_1 * b_0;
8743 		CC[2+bs*3] += a_2 * b_0;
8744 		CC[3+bs*3] += a_3 * b_0;
8745 
8746 		A += bs;
8747 		B += bs;
8748 //		k += 1;
8749 
8750 		// k = 2
8751 
8752 		a_0 = A[0];
8753 		a_1 = A[1];
8754 		a_2 = A[2];
8755 		a_3 = A[3];
8756 
8757 		CC[0+bs*2] += a_0;
8758 		CC[1+bs*2] += a_1;
8759 		CC[2+bs*2] += a_2;
8760 		CC[3+bs*2] += a_3;
8761 
8762 		b_0 = B[3+0*bs];
8763 		CC[0+bs*3] += a_0 * b_0;
8764 		CC[1+bs*3] += a_1 * b_0;
8765 		CC[2+bs*3] += a_2 * b_0;
8766 		CC[3+bs*3] += a_3 * b_0;
8767 
8768 		A += bs;
8769 		B += bs;
8770 //		k += 1;
8771 
8772 		// k = 3
8773 
8774 		a_0 = A[0];
8775 		a_1 = A[1];
8776 		a_2 = A[2];
8777 		a_3 = A[3];
8778 
8779 		CC[0+bs*3] += a_0;
8780 		CC[1+bs*3] += a_1;
8781 		CC[2+bs*3] += a_2;
8782 		CC[3+bs*3] += a_3;
8783 
8784 		A += bs;
8785 		B += bs;
8786 //		k += 1;
8787 
8788 		}
8789 	else if(m1==3)
8790 		{
8791 
8792 		// k = 0
8793 
8794 		a_0 = A[0];
8795 		a_1 = A[1];
8796 		a_2 = A[2];
8797 		a_3 = A[3];
8798 
8799 		CC[0+bs*0] += a_0;
8800 		CC[1+bs*0] += a_1;
8801 		CC[2+bs*0] += a_2;
8802 		CC[3+bs*0] += a_3;
8803 
8804 		b_0 = B[1+0*bs];
8805 		CC[0+bs*1] += a_0 * b_0;
8806 		CC[1+bs*1] += a_1 * b_0;
8807 		CC[2+bs*1] += a_2 * b_0;
8808 		CC[3+bs*1] += a_3 * b_0;
8809 
8810 		b_0 = B[2+0*bs];
8811 		CC[0+bs*2] += a_0 * b_0;
8812 		CC[1+bs*2] += a_1 * b_0;
8813 		CC[2+bs*2] += a_2 * b_0;
8814 		CC[3+bs*2] += a_3 * b_0;
8815 
8816 		A += bs;
8817 		B += bs;
8818 //		k += 1;
8819 
8820 		// k = 1
8821 
8822 		a_0 = A[0];
8823 		a_1 = A[1];
8824 		a_2 = A[2];
8825 		a_3 = A[3];
8826 
8827 		CC[0+bs*1] += a_0;
8828 		CC[1+bs*1] += a_1;
8829 		CC[2+bs*1] += a_2;
8830 		CC[3+bs*1] += a_3;
8831 
8832 		b_0 = B[2+0*bs];
8833 		CC[0+bs*2] += a_0 * b_0;
8834 		CC[1+bs*2] += a_1 * b_0;
8835 		CC[2+bs*2] += a_2 * b_0;
8836 		CC[3+bs*2] += a_3 * b_0;
8837 
8838 		A += bs;
8839 		B += bs;
8840 //		k += 1;
8841 
8842 		// k = 2
8843 
8844 		a_0 = A[0];
8845 		a_1 = A[1];
8846 		a_2 = A[2];
8847 		a_3 = A[3];
8848 
8849 		CC[0+bs*2] += a_0;
8850 		CC[1+bs*2] += a_1;
8851 		CC[2+bs*2] += a_2;
8852 		CC[3+bs*2] += a_3;
8853 
8854 		A += bs;
8855 		B += bs;
8856 //		k += 1;
8857 
8858 		// k = 3
8859 
8860 		A += bs;
8861 		B += bs;
8862 //		k += 1;
8863 
8864 		}
8865 	else if(m1==2)
8866 		{
8867 
8868 		// k = 0
8869 
8870 		a_0 = A[0];
8871 		a_1 = A[1];
8872 		a_2 = A[2];
8873 		a_3 = A[3];
8874 
8875 		CC[0+bs*0] += a_0;
8876 		CC[1+bs*0] += a_1;
8877 		CC[2+bs*0] += a_2;
8878 		CC[3+bs*0] += a_3;
8879 
8880 		b_0 = B[1+0*bs];
8881 		CC[0+bs*1] += a_0 * b_0;
8882 		CC[1+bs*1] += a_1 * b_0;
8883 		CC[2+bs*1] += a_2 * b_0;
8884 		CC[3+bs*1] += a_3 * b_0;
8885 
8886 		A += bs;
8887 		B += bs;
8888 //		k += 1;
8889 
8890 		// k = 1
8891 
8892 		a_0 = A[0];
8893 		a_1 = A[1];
8894 		a_2 = A[2];
8895 		a_3 = A[3];
8896 
8897 		CC[0+bs*1] += a_0;
8898 		CC[1+bs*1] += a_1;
8899 		CC[2+bs*1] += a_2;
8900 		CC[3+bs*1] += a_3;
8901 
8902 		A += bs;
8903 		B += bs;
8904 //		k += 1;
8905 
8906 		// k = 2
8907 
8908 		A += bs;
8909 		B += bs;
8910 //		k += 1;
8911 
8912 		// k = 3
8913 
8914 		A += bs;
8915 		B += bs;
8916 //		k += 1;
8917 
8918 		}
8919 	else if(m1==1)
8920 		{
8921 
8922 		// k = 0
8923 
8924 		a_0 = A[0];
8925 		a_1 = A[1];
8926 		a_2 = A[2];
8927 		a_3 = A[3];
8928 
8929 		CC[0+bs*0] += a_0;
8930 		CC[1+bs*0] += a_1;
8931 		CC[2+bs*0] += a_2;
8932 		CC[3+bs*0] += a_3;
8933 
8934 		A += bs;
8935 		B += bs;
8936 //		k += 1;
8937 
8938 		// k = 1
8939 
8940 		A += bs;
8941 		B += bs;
8942 //		k += 1;
8943 
8944 		// k = 2
8945 
8946 		A += bs;
8947 		B += bs;
8948 //		k += 1;
8949 
8950 		// k = 3
8951 
8952 		A += bs;
8953 		B += bs;
8954 //		k += 1;
8955 
8956 		}
8957 
8958 	store:
8959 
8960 	// scale
8961 	CC[0+bs*0] = alpha[0]*CC[0+bs*0] + beta[0]*C[0+bs*0];
8962 	CC[1+bs*0] = alpha[0]*CC[1+bs*0] + beta[0]*C[1+bs*0];
8963 	CC[2+bs*0] = alpha[0]*CC[2+bs*0] + beta[0]*C[2+bs*0];
8964 	CC[3+bs*0] = alpha[0]*CC[3+bs*0] + beta[0]*C[3+bs*0];
8965 
8966 	CC[0+bs*1] = alpha[0]*CC[0+bs*1] + beta[0]*C[0+bs*1];
8967 	CC[1+bs*1] = alpha[0]*CC[1+bs*1] + beta[0]*C[1+bs*1];
8968 	CC[2+bs*1] = alpha[0]*CC[2+bs*1] + beta[0]*C[2+bs*1];
8969 	CC[3+bs*1] = alpha[0]*CC[3+bs*1] + beta[0]*C[3+bs*1];
8970 
8971 	CC[0+bs*2] = alpha[0]*CC[0+bs*2] + beta[0]*C[0+bs*2];
8972 	CC[1+bs*2] = alpha[0]*CC[1+bs*2] + beta[0]*C[1+bs*2];
8973 	CC[2+bs*2] = alpha[0]*CC[2+bs*2] + beta[0]*C[2+bs*2];
8974 	CC[3+bs*2] = alpha[0]*CC[3+bs*2] + beta[0]*C[3+bs*2];
8975 
8976 	CC[0+bs*3] = alpha[0]*CC[0+bs*3] + beta[0]*C[0+bs*3];
8977 	CC[1+bs*3] = alpha[0]*CC[1+bs*3] + beta[0]*C[1+bs*3];
8978 	CC[2+bs*3] = alpha[0]*CC[2+bs*3] + beta[0]*C[2+bs*3];
8979 	CC[3+bs*3] = alpha[0]*CC[3+bs*3] + beta[0]*C[3+bs*3];
8980 
8981 	// transpose
8982 	tmp = CC[1+bs*0]; CC[1+bs*0] = CC[0+bs*1]; CC[0+bs*1] = tmp;
8983 	tmp = CC[2+bs*0]; CC[2+bs*0] = CC[0+bs*2]; CC[0+bs*2] = tmp;
8984 	tmp = CC[3+bs*0]; CC[3+bs*0] = CC[0+bs*3]; CC[0+bs*3] = tmp;
8985 
8986 	tmp = CC[2+bs*1]; CC[2+bs*1] = CC[1+bs*2]; CC[1+bs*2] = tmp;
8987 	tmp = CC[3+bs*1]; CC[3+bs*1] = CC[1+bs*3]; CC[1+bs*3] = tmp;
8988 
8989 	tmp = CC[3+bs*2]; CC[3+bs*2] = CC[2+bs*3]; CC[2+bs*3] = tmp;
8990 
8991 	// store
8992 	if(m1>=4)
8993 		{
8994 		D[0+ldd*0] = CC[0+bs*0];
8995 		D[1+ldd*0] = CC[1+bs*0];
8996 		D[2+ldd*0] = CC[2+bs*0];
8997 		D[3+ldd*0] = CC[3+bs*0];
8998 
8999 		if(n1==1)
9000 			return;
9001 
9002 		D[0+ldd*1] = CC[0+bs*1];
9003 		D[1+ldd*1] = CC[1+bs*1];
9004 		D[2+ldd*1] = CC[2+bs*1];
9005 		D[3+ldd*1] = CC[3+bs*1];
9006 
9007 		if(n1==2)
9008 			return;
9009 
9010 		D[0+ldd*2] = CC[0+bs*2];
9011 		D[1+ldd*2] = CC[1+bs*2];
9012 		D[2+ldd*2] = CC[2+bs*2];
9013 		D[3+ldd*2] = CC[3+bs*2];
9014 
9015 		if(n1==3)
9016 			return;
9017 
9018 		D[0+ldd*3] = CC[0+bs*3];
9019 		D[1+ldd*3] = CC[1+bs*3];
9020 		D[2+ldd*3] = CC[2+bs*3];
9021 		D[3+ldd*3] = CC[3+bs*3];
9022 		}
9023 	else if(m1>=3)
9024 		{
9025 		D[0+ldd*0] = CC[0+bs*0];
9026 		D[1+ldd*0] = CC[1+bs*0];
9027 		D[2+ldd*0] = CC[2+bs*0];
9028 
9029 		if(n1==1)
9030 			return;
9031 
9032 		D[0+ldd*1] = CC[0+bs*1];
9033 		D[1+ldd*1] = CC[1+bs*1];
9034 		D[2+ldd*1] = CC[2+bs*1];
9035 
9036 		if(n1==2)
9037 			return;
9038 
9039 		D[0+ldd*2] = CC[0+bs*2];
9040 		D[1+ldd*2] = CC[1+bs*2];
9041 		D[2+ldd*2] = CC[2+bs*2];
9042 
9043 		if(n1==3)
9044 			return;
9045 
9046 		D[0+ldd*3] = CC[0+bs*3];
9047 		D[1+ldd*3] = CC[1+bs*3];
9048 		D[2+ldd*3] = CC[2+bs*3];
9049 		}
9050 	else if(m1>=2)
9051 		{
9052 		D[0+ldd*0] = CC[0+bs*0];
9053 		D[1+ldd*0] = CC[1+bs*0];
9054 
9055 		if(n1==1)
9056 			return;
9057 
9058 		D[0+ldd*1] = CC[0+bs*1];
9059 		D[1+ldd*1] = CC[1+bs*1];
9060 
9061 		if(n1==2)
9062 			return;
9063 
9064 		D[0+ldd*2] = CC[0+bs*2];
9065 		D[1+ldd*2] = CC[1+bs*2];
9066 
9067 		if(n1==3)
9068 			return;
9069 
9070 		D[0+ldd*3] = CC[0+bs*3];
9071 		D[1+ldd*3] = CC[1+bs*3];
9072 		}
9073 	else //if(m1>=1)
9074 		{
9075 		D[0+ldd*0] = CC[0+bs*0];
9076 
9077 		if(n1==1)
9078 			return;
9079 
9080 		D[0+ldd*1] = CC[0+bs*1];
9081 
9082 		if(n1==2)
9083 			return;
9084 
9085 		D[0+ldd*2] = CC[0+bs*2];
9086 
9087 		if(n1==3)
9088 			return;
9089 
9090 		D[0+ldd*3] = CC[0+bs*3];
9091 		}
9092 
9093 	return;
9094 
9095 	}
9096 #endif
9097 
9098 
9099 
9100 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
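// dtrmm nt rl (right-lower, non-unit diagonal), 4x4 tile: A panel-major (lib4), B, C and D column-major.
// the dense part of the product (kmax columns) is delegated to kernel_dgemm_nt_4x4_lib4ccc, the trailing
// 4x4 triangular corner of B is applied explicitly, and the accumulator is scaled by alpha on store.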
void kernel_dtrmm_nt_rl_4x4_lib4ccc(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd)
9102 	{
9103 
9104 	const int bs = 4;
9105 
9106 	double
9107 		a_0, a_1, a_2, a_3,
9108 		b_0, b_1, b_2, b_3;
9109 
9110 #if defined(TARGET_GENERIC)
9111 	double CC[16] = {0};
9112 #else
9113 	ALIGNED( double CC[16], 64 ) = {0};
9114 #endif
9115 
9116 	double alpha1 = 1.0;
9117 
9118 	// assume always kmax>=4 !!!
9119 
9120 	kernel_dgemm_nt_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, ldc, CC, bs);
9121 
9122 	A += kmax*bs;
9123 	B += kmax*ldb;
9124 
9125 	// k = 0
9126 
9127 	a_0 = A[0];
9128 	a_1 = A[1];
9129 	a_2 = A[2];
9130 	a_3 = A[3];
9131 
9132 	b_0 = B[0+0*ldb];
9133 	CC[0+bs*0] += a_0 * b_0;
9134 	CC[1+bs*0] += a_1 * b_0;
9135 	CC[2+bs*0] += a_2 * b_0;
9136 	CC[3+bs*0] += a_3 * b_0;
9137 
9138 	b_0 = B[1+0*ldb];
9139 	CC[0+bs*1] += a_0 * b_0;
9140 	CC[1+bs*1] += a_1 * b_0;
9141 	CC[2+bs*1] += a_2 * b_0;
9142 	CC[3+bs*1] += a_3 * b_0;
9143 
9144 	b_0 = B[2+0*ldb];
9145 	CC[0+bs*2] += a_0 * b_0;
9146 	CC[1+bs*2] += a_1 * b_0;
9147 	CC[2+bs*2] += a_2 * b_0;
9148 	CC[3+bs*2] += a_3 * b_0;
9149 
9150 	b_0 = B[3+0*ldb];
9151 	CC[0+bs*3] += a_0 * b_0;
9152 	CC[1+bs*3] += a_1 * b_0;
9153 	CC[2+bs*3] += a_2 * b_0;
9154 	CC[3+bs*3] += a_3 * b_0;
9155 
9156 	A += bs;
9157 	B += ldb;
9158 //	k += 1;
9159 
9160 	// k = 1
9161 
9162 	a_0 = A[0];
9163 	a_1 = A[1];
9164 	a_2 = A[2];
9165 	a_3 = A[3];
9166 
9167 	b_0 = B[1+0*ldb];
9168 	CC[0+bs*1] += a_0 * b_0;
9169 	CC[1+bs*1] += a_1 * b_0;
9170 	CC[2+bs*1] += a_2 * b_0;
9171 	CC[3+bs*1] += a_3 * b_0;
9172 
9173 	b_0 = B[2+0*ldb];
9174 	CC[0+bs*2] += a_0 * b_0;
9175 	CC[1+bs*2] += a_1 * b_0;
9176 	CC[2+bs*2] += a_2 * b_0;
9177 	CC[3+bs*2] += a_3 * b_0;
9178 
9179 	b_0 = B[3+0*ldb];
9180 	CC[0+bs*3] += a_0 * b_0;
9181 	CC[1+bs*3] += a_1 * b_0;
9182 	CC[2+bs*3] += a_2 * b_0;
9183 	CC[3+bs*3] += a_3 * b_0;
9184 
9185 	A += bs;
9186 	B += ldb;
9187 //	k += 1;
9188 
9189 	// k = 2
9190 
9191 	a_0 = A[0];
9192 	a_1 = A[1];
9193 	a_2 = A[2];
9194 	a_3 = A[3];
9195 
9196 	b_0 = B[2+0*ldb];
9197 	CC[0+bs*2] += a_0 * b_0;
9198 	CC[1+bs*2] += a_1 * b_0;
9199 	CC[2+bs*2] += a_2 * b_0;
9200 	CC[3+bs*2] += a_3 * b_0;
9201 
9202 	b_0 = B[3+0*ldb];
9203 	CC[0+bs*3] += a_0 * b_0;
9204 	CC[1+bs*3] += a_1 * b_0;
9205 	CC[2+bs*3] += a_2 * b_0;
9206 	CC[3+bs*3] += a_3 * b_0;
9207 
9208 	A += bs;
9209 	B += ldb;
9210 //	k += 1;
9211 
9212 	// k = 3
9213 
9214 	a_0 = A[0];
9215 	a_1 = A[1];
9216 	a_2 = A[2];
9217 	a_3 = A[3];
9218 
9219 	b_0 = B[3+0*ldb];
9220 	CC[0+bs*3] += a_0 * b_0;
9221 	CC[1+bs*3] += a_1 * b_0;
9222 	CC[2+bs*3] += a_2 * b_0;
9223 	CC[3+bs*3] += a_3 * b_0;
9224 
9225 	A += bs;
9226 	B += ldb;
9227 //	k += 1;
9228 
9229 	store:
9230 
9231 	D[0+ldd*0] = alpha[0]*CC[0+bs*0];
9232 	D[1+ldd*0] = alpha[0]*CC[1+bs*0];
9233 	D[2+ldd*0] = alpha[0]*CC[2+bs*0];
9234 	D[3+ldd*0] = alpha[0]*CC[3+bs*0];
9235 
9236 	D[0+ldd*1] = alpha[0]*CC[0+bs*1];
9237 	D[1+ldd*1] = alpha[0]*CC[1+bs*1];
9238 	D[2+ldd*1] = alpha[0]*CC[2+bs*1];
9239 	D[3+ldd*1] = alpha[0]*CC[3+bs*1];
9240 
9241 	D[0+ldd*2] = alpha[0]*CC[0+bs*2];
9242 	D[1+ldd*2] = alpha[0]*CC[1+bs*2];
9243 	D[2+ldd*2] = alpha[0]*CC[2+bs*2];
9244 	D[3+ldd*2] = alpha[0]*CC[3+bs*2];
9245 
9246 	D[0+ldd*3] = alpha[0]*CC[0+bs*3];
9247 	D[1+ldd*3] = alpha[0]*CC[1+bs*3];
9248 	D[2+ldd*3] = alpha[0]*CC[2+bs*3];
9249 	D[3+ldd*3] = alpha[0]*CC[3+bs*3];
9250 
9251 	return;
9252 
9253 	}
9254 #endif
9255 
9256 
9257 
9258 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
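// variable-size variant of kernel_dtrmm_nt_rl_4x4_lib4ccc: the explicit triangular corner is restricted
// to the first n1 columns and the store is clipped to the m1 x n1 active part of the tile.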
void kernel_dtrmm_nt_rl_4x4_vs_lib4ccc(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1)
9260 	{
9261 
9262 	const int bs = 4;
9263 
9264 	double
9265 		a_0, a_1, a_2, a_3,
9266 		b_0, b_1, b_2, b_3;
9267 
9268 #if defined(TARGET_GENERIC)
9269 	double CC[16] = {0};
9270 #else
9271 	ALIGNED( double CC[16], 64 ) = {0};
9272 #endif
9273 
9274 	double alpha1 = 1.0;
9275 
9276 	kernel_dgemm_nt_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, ldc, CC, bs);
9277 
9278 	A += kmax*bs;
9279 	B += kmax*ldb;
9280 
9281 	if(n1>=4)
9282 		{
9283 
9284 		// k = 0
9285 
9286 		a_0 = A[0];
9287 		a_1 = A[1];
9288 		a_2 = A[2];
9289 		a_3 = A[3];
9290 
9291 		b_0 = B[0+0*ldb];
9292 		CC[0+bs*0] += a_0 * b_0;
9293 		CC[1+bs*0] += a_1 * b_0;
9294 		CC[2+bs*0] += a_2 * b_0;
9295 		CC[3+bs*0] += a_3 * b_0;
9296 
9297 		b_0 = B[1+0*ldb];
9298 		CC[0+bs*1] += a_0 * b_0;
9299 		CC[1+bs*1] += a_1 * b_0;
9300 		CC[2+bs*1] += a_2 * b_0;
9301 		CC[3+bs*1] += a_3 * b_0;
9302 
9303 		b_0 = B[2+0*ldb];
9304 		CC[0+bs*2] += a_0 * b_0;
9305 		CC[1+bs*2] += a_1 * b_0;
9306 		CC[2+bs*2] += a_2 * b_0;
9307 		CC[3+bs*2] += a_3 * b_0;
9308 
9309 		b_0 = B[3+0*ldb];
9310 		CC[0+bs*3] += a_0 * b_0;
9311 		CC[1+bs*3] += a_1 * b_0;
9312 		CC[2+bs*3] += a_2 * b_0;
9313 		CC[3+bs*3] += a_3 * b_0;
9314 
9315 		A += bs;
9316 		B += ldb;
9317 //		k += 1;
9318 
9319 		// k = 1
9320 
9321 		a_0 = A[0];
9322 		a_1 = A[1];
9323 		a_2 = A[2];
9324 		a_3 = A[3];
9325 
9326 		b_0 = B[1+0*ldb];
9327 		CC[0+bs*1] += a_0 * b_0;
9328 		CC[1+bs*1] += a_1 * b_0;
9329 		CC[2+bs*1] += a_2 * b_0;
9330 		CC[3+bs*1] += a_3 * b_0;
9331 
9332 		b_0 = B[2+0*ldb];
9333 		CC[0+bs*2] += a_0 * b_0;
9334 		CC[1+bs*2] += a_1 * b_0;
9335 		CC[2+bs*2] += a_2 * b_0;
9336 		CC[3+bs*2] += a_3 * b_0;
9337 
9338 		b_0 = B[3+0*ldb];
9339 		CC[0+bs*3] += a_0 * b_0;
9340 		CC[1+bs*3] += a_1 * b_0;
9341 		CC[2+bs*3] += a_2 * b_0;
9342 		CC[3+bs*3] += a_3 * b_0;
9343 
9344 		A += bs;
9345 		B += ldb;
9346 //		k += 1;
9347 
9348 		// k = 2
9349 
9350 		a_0 = A[0];
9351 		a_1 = A[1];
9352 		a_2 = A[2];
9353 		a_3 = A[3];
9354 
9355 		b_0 = B[2+0*ldb];
9356 		CC[0+bs*2] += a_0 * b_0;
9357 		CC[1+bs*2] += a_1 * b_0;
9358 		CC[2+bs*2] += a_2 * b_0;
9359 		CC[3+bs*2] += a_3 * b_0;
9360 
9361 		b_0 = B[3+0*ldb];
9362 		CC[0+bs*3] += a_0 * b_0;
9363 		CC[1+bs*3] += a_1 * b_0;
9364 		CC[2+bs*3] += a_2 * b_0;
9365 		CC[3+bs*3] += a_3 * b_0;
9366 
9367 		A += bs;
9368 		B += ldb;
9369 //		k += 1;
9370 
9371 		// k = 3
9372 
9373 		a_0 = A[0];
9374 		a_1 = A[1];
9375 		a_2 = A[2];
9376 		a_3 = A[3];
9377 
9378 		b_0 = B[3+0*ldb];
9379 		CC[0+bs*3] += a_0 * b_0;
9380 		CC[1+bs*3] += a_1 * b_0;
9381 		CC[2+bs*3] += a_2 * b_0;
9382 		CC[3+bs*3] += a_3 * b_0;
9383 
9384 		A += bs;
9385 		B += ldb;
9386 //		k += 1;
9387 
9388 		}
9389 	else if(n1==3)
9390 		{
9391 
9392 		// k = 0
9393 
9394 		a_0 = A[0];
9395 		a_1 = A[1];
9396 		a_2 = A[2];
9397 		a_3 = A[3];
9398 
9399 		b_0 = B[0+0*ldb];
9400 		CC[0+bs*0] += a_0 * b_0;
9401 		CC[1+bs*0] += a_1 * b_0;
9402 		CC[2+bs*0] += a_2 * b_0;
9403 		CC[3+bs*0] += a_3 * b_0;
9404 
9405 		b_0 = B[1+0*ldb];
9406 		CC[0+bs*1] += a_0 * b_0;
9407 		CC[1+bs*1] += a_1 * b_0;
9408 		CC[2+bs*1] += a_2 * b_0;
9409 		CC[3+bs*1] += a_3 * b_0;
9410 
9411 		b_0 = B[2+0*ldb];
9412 		CC[0+bs*2] += a_0 * b_0;
9413 		CC[1+bs*2] += a_1 * b_0;
9414 		CC[2+bs*2] += a_2 * b_0;
9415 		CC[3+bs*2] += a_3 * b_0;
9416 
9417 		A += bs;
9418 		B += ldb;
9419 //		k += 1;
9420 
9421 		// k = 1
9422 
9423 		a_0 = A[0];
9424 		a_1 = A[1];
9425 		a_2 = A[2];
9426 		a_3 = A[3];
9427 
9428 		b_0 = B[1+0*ldb];
9429 		CC[0+bs*1] += a_0 * b_0;
9430 		CC[1+bs*1] += a_1 * b_0;
9431 		CC[2+bs*1] += a_2 * b_0;
9432 		CC[3+bs*1] += a_3 * b_0;
9433 
9434 		b_0 = B[2+0*ldb];
9435 		CC[0+bs*2] += a_0 * b_0;
9436 		CC[1+bs*2] += a_1 * b_0;
9437 		CC[2+bs*2] += a_2 * b_0;
9438 		CC[3+bs*2] += a_3 * b_0;
9439 
9440 		A += bs;
9441 		B += ldb;
9442 //		k += 1;
9443 
9444 		// k = 2
9445 
9446 		a_0 = A[0];
9447 		a_1 = A[1];
9448 		a_2 = A[2];
9449 		a_3 = A[3];
9450 
9451 		b_0 = B[2+0*ldb];
9452 		CC[0+bs*2] += a_0 * b_0;
9453 		CC[1+bs*2] += a_1 * b_0;
9454 		CC[2+bs*2] += a_2 * b_0;
9455 		CC[3+bs*2] += a_3 * b_0;
9456 
9457 		A += bs;
9458 		B += ldb;
9459 //		k += 1;
9460 
9461 		// k = 3
9462 
9463 		A += bs;
9464 		B += ldb;
9465 //		k += 1;
9466 
9467 		}
9468 	else if(n1==2)
9469 		{
9470 
9471 		// k = 0
9472 
9473 		a_0 = A[0];
9474 		a_1 = A[1];
9475 		a_2 = A[2];
9476 		a_3 = A[3];
9477 
9478 		b_0 = B[0+0*ldb];
9479 		CC[0+bs*0] += a_0 * b_0;
9480 		CC[1+bs*0] += a_1 * b_0;
9481 		CC[2+bs*0] += a_2 * b_0;
9482 		CC[3+bs*0] += a_3 * b_0;
9483 
9484 		b_0 = B[1+0*ldb];
9485 		CC[0+bs*1] += a_0 * b_0;
9486 		CC[1+bs*1] += a_1 * b_0;
9487 		CC[2+bs*1] += a_2 * b_0;
9488 		CC[3+bs*1] += a_3 * b_0;
9489 
9490 		A += bs;
9491 		B += ldb;
9492 //		k += 1;
9493 
9494 		// k = 1
9495 
9496 		a_0 = A[0];
9497 		a_1 = A[1];
9498 		a_2 = A[2];
9499 		a_3 = A[3];
9500 
9501 		b_0 = B[1+0*ldb];
9502 		CC[0+bs*1] += a_0 * b_0;
9503 		CC[1+bs*1] += a_1 * b_0;
9504 		CC[2+bs*1] += a_2 * b_0;
9505 		CC[3+bs*1] += a_3 * b_0;
9506 
9507 		A += bs;
9508 		B += ldb;
9509 //		k += 1;
9510 
9511 		// k = 2
9512 
9513 		A += bs;
9514 		B += ldb;
9515 //		k += 1;
9516 
9517 		// k = 3
9518 
9519 		A += bs;
9520 		B += ldb;
9521 //		k += 1;
9522 
9523 		}
9524 	else if(n1==1)
9525 		{
9526 
9527 		// k = 0
9528 
9529 		a_0 = A[0];
9530 		a_1 = A[1];
9531 		a_2 = A[2];
9532 		a_3 = A[3];
9533 
9534 		b_0 = B[0+0*ldb];
9535 		CC[0+bs*0] += a_0 * b_0;
9536 		CC[1+bs*0] += a_1 * b_0;
9537 		CC[2+bs*0] += a_2 * b_0;
9538 		CC[3+bs*0] += a_3 * b_0;
9539 
9540 		A += bs;
9541 		B += ldb;
9542 //		k += 1;
9543 
9544 		// k = 1
9545 
9546 		A += bs;
9547 		B += ldb;
9548 //		k += 1;
9549 
9550 		// k = 2
9551 
9552 		A += bs;
9553 		B += ldb;
9554 //		k += 1;
9555 
9556 		// k = 3
9557 
9558 		A += bs;
9559 		B += ldb;
9560 //		k += 1;
9561 
9562 		}
9563 
9564 	store:
9565 
9566 	CC[0+bs*0] = alpha[0]*CC[0+bs*0];
9567 	CC[1+bs*0] = alpha[0]*CC[1+bs*0];
9568 	CC[2+bs*0] = alpha[0]*CC[2+bs*0];
9569 	CC[3+bs*0] = alpha[0]*CC[3+bs*0];
9570 
9571 	CC[0+bs*1] = alpha[0]*CC[0+bs*1];
9572 	CC[1+bs*1] = alpha[0]*CC[1+bs*1];
9573 	CC[2+bs*1] = alpha[0]*CC[2+bs*1];
9574 	CC[3+bs*1] = alpha[0]*CC[3+bs*1];
9575 
9576 	CC[0+bs*2] = alpha[0]*CC[0+bs*2];
9577 	CC[1+bs*2] = alpha[0]*CC[1+bs*2];
9578 	CC[2+bs*2] = alpha[0]*CC[2+bs*2];
9579 	CC[3+bs*2] = alpha[0]*CC[3+bs*2];
9580 
9581 	CC[0+bs*3] = alpha[0]*CC[0+bs*3];
9582 	CC[1+bs*3] = alpha[0]*CC[1+bs*3];
9583 	CC[2+bs*3] = alpha[0]*CC[2+bs*3];
9584 	CC[3+bs*3] = alpha[0]*CC[3+bs*3];
9585 
9586 	if(m1>=4)
9587 		{
9588 		D[0+ldd*0] = CC[0+bs*0];
9589 		D[1+ldd*0] = CC[1+bs*0];
9590 		D[2+ldd*0] = CC[2+bs*0];
9591 		D[3+ldd*0] = CC[3+bs*0];
9592 
9593 		if(n1==1)
9594 			return;
9595 
9596 		D[0+ldd*1] = CC[0+bs*1];
9597 		D[1+ldd*1] = CC[1+bs*1];
9598 		D[2+ldd*1] = CC[2+bs*1];
9599 		D[3+ldd*1] = CC[3+bs*1];
9600 
9601 		if(n1==2)
9602 			return;
9603 
9604 		D[0+ldd*2] = CC[0+bs*2];
9605 		D[1+ldd*2] = CC[1+bs*2];
9606 		D[2+ldd*2] = CC[2+bs*2];
9607 		D[3+ldd*2] = CC[3+bs*2];
9608 
9609 		if(n1==3)
9610 			return;
9611 
9612 		D[0+ldd*3] = CC[0+bs*3];
9613 		D[1+ldd*3] = CC[1+bs*3];
9614 		D[2+ldd*3] = CC[2+bs*3];
9615 		D[3+ldd*3] = CC[3+bs*3];
9616 		}
9617 	else if(m1>=3)
9618 		{
9619 		D[0+ldd*0] = CC[0+bs*0];
9620 		D[1+ldd*0] = CC[1+bs*0];
9621 		D[2+ldd*0] = CC[2+bs*0];
9622 
9623 		if(n1==1)
9624 			return;
9625 
9626 		D[0+ldd*1] = CC[0+bs*1];
9627 		D[1+ldd*1] = CC[1+bs*1];
9628 		D[2+ldd*1] = CC[2+bs*1];
9629 
9630 		if(n1==2)
9631 			return;
9632 
9633 		D[0+ldd*2] = CC[0+bs*2];
9634 		D[1+ldd*2] = CC[1+bs*2];
9635 		D[2+ldd*2] = CC[2+bs*2];
9636 
9637 		if(n1==3)
9638 			return;
9639 
9640 		D[0+ldd*3] = CC[0+bs*3];
9641 		D[1+ldd*3] = CC[1+bs*3];
9642 		D[2+ldd*3] = CC[2+bs*3];
9643 		}
9644 	else if(m1>=2)
9645 		{
9646 		D[0+ldd*0] = CC[0+bs*0];
9647 		D[1+ldd*0] = CC[1+bs*0];
9648 
9649 		if(n1==1)
9650 			return;
9651 
9652 		D[0+ldd*1] = CC[0+bs*1];
9653 		D[1+ldd*1] = CC[1+bs*1];
9654 
9655 		if(n1==2)
9656 			return;
9657 
9658 		D[0+ldd*2] = CC[0+bs*2];
9659 		D[1+ldd*2] = CC[1+bs*2];
9660 
9661 		if(n1==3)
9662 			return;
9663 
9664 		D[0+ldd*3] = CC[0+bs*3];
9665 		D[1+ldd*3] = CC[1+bs*3];
9666 		}
9667 	else //if(m1>=1)
9668 		{
9669 		D[0+ldd*0] = CC[0+bs*0];
9670 
9671 		if(n1==1)
9672 			return;
9673 
9674 		D[0+ldd*1] = CC[0+bs*1];
9675 
9676 		if(n1==2)
9677 			return;
9678 
9679 		D[0+ldd*2] = CC[0+bs*2];
9680 
9681 		if(n1==3)
9682 			return;
9683 
9684 		D[0+ldd*3] = CC[0+bs*3];
9685 		}
9686 
9687 	return;
9688 
9689 	}
9690 #endif
9691 
9692 
9693 
9694 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
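// as kernel_dtrmm_nt_rl_4x4_lib4ccc, but with C panel-major (lib4) and the alpha-scaled result stored
// transposed into the column-major D.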
void kernel_dtrmm_nt_rl_4x4_tran_lib4c4c(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd)
9696 	{
9697 
9698 	const int bs = 4;
9699 
9700 	double
9701 		a_0, a_1, a_2, a_3,
9702 		b_0, b_1, b_2, b_3;
9703 
9704 #if defined(TARGET_GENERIC)
9705 	double CC[16] = {0};
9706 #else
9707 	ALIGNED( double CC[16], 64 ) = {0};
9708 #endif
9709 
9710 	double alpha1 = 1.0;
9711 
9712 	// assume always kmax>=4 !!!
9713 
9714 	kernel_dgemm_nt_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, bs, CC, bs);
9715 
9716 	A += kmax*bs;
9717 	B += kmax*ldb;
9718 
9719 	// k = 0
9720 
9721 	a_0 = A[0];
9722 	a_1 = A[1];
9723 	a_2 = A[2];
9724 	a_3 = A[3];
9725 
9726 	b_0 = B[0+0*ldb];
9727 	CC[0+bs*0] += a_0 * b_0;
9728 	CC[1+bs*0] += a_1 * b_0;
9729 	CC[2+bs*0] += a_2 * b_0;
9730 	CC[3+bs*0] += a_3 * b_0;
9731 
9732 	b_0 = B[1+0*ldb];
9733 	CC[0+bs*1] += a_0 * b_0;
9734 	CC[1+bs*1] += a_1 * b_0;
9735 	CC[2+bs*1] += a_2 * b_0;
9736 	CC[3+bs*1] += a_3 * b_0;
9737 
9738 	b_0 = B[2+0*ldb];
9739 	CC[0+bs*2] += a_0 * b_0;
9740 	CC[1+bs*2] += a_1 * b_0;
9741 	CC[2+bs*2] += a_2 * b_0;
9742 	CC[3+bs*2] += a_3 * b_0;
9743 
9744 	b_0 = B[3+0*ldb];
9745 	CC[0+bs*3] += a_0 * b_0;
9746 	CC[1+bs*3] += a_1 * b_0;
9747 	CC[2+bs*3] += a_2 * b_0;
9748 	CC[3+bs*3] += a_3 * b_0;
9749 
9750 	A += bs;
9751 	B += ldb;
9752 //	k += 1;
9753 
9754 	// k = 1
9755 
9756 	a_0 = A[0];
9757 	a_1 = A[1];
9758 	a_2 = A[2];
9759 	a_3 = A[3];
9760 
9761 	b_0 = B[1+0*ldb];
9762 	CC[0+bs*1] += a_0 * b_0;
9763 	CC[1+bs*1] += a_1 * b_0;
9764 	CC[2+bs*1] += a_2 * b_0;
9765 	CC[3+bs*1] += a_3 * b_0;
9766 
9767 	b_0 = B[2+0*ldb];
9768 	CC[0+bs*2] += a_0 * b_0;
9769 	CC[1+bs*2] += a_1 * b_0;
9770 	CC[2+bs*2] += a_2 * b_0;
9771 	CC[3+bs*2] += a_3 * b_0;
9772 
9773 	b_0 = B[3+0*ldb];
9774 	CC[0+bs*3] += a_0 * b_0;
9775 	CC[1+bs*3] += a_1 * b_0;
9776 	CC[2+bs*3] += a_2 * b_0;
9777 	CC[3+bs*3] += a_3 * b_0;
9778 
9779 	A += bs;
9780 	B += ldb;
9781 //	k += 1;
9782 
9783 	// k = 2
9784 
9785 	a_0 = A[0];
9786 	a_1 = A[1];
9787 	a_2 = A[2];
9788 	a_3 = A[3];
9789 
9790 	b_0 = B[2+0*ldb];
9791 	CC[0+bs*2] += a_0 * b_0;
9792 	CC[1+bs*2] += a_1 * b_0;
9793 	CC[2+bs*2] += a_2 * b_0;
9794 	CC[3+bs*2] += a_3 * b_0;
9795 
9796 	b_0 = B[3+0*ldb];
9797 	CC[0+bs*3] += a_0 * b_0;
9798 	CC[1+bs*3] += a_1 * b_0;
9799 	CC[2+bs*3] += a_2 * b_0;
9800 	CC[3+bs*3] += a_3 * b_0;
9801 
9802 	A += bs;
9803 	B += ldb;
9804 //	k += 1;
9805 
9806 	// k = 3
9807 
9808 	a_0 = A[0];
9809 	a_1 = A[1];
9810 	a_2 = A[2];
9811 	a_3 = A[3];
9812 
9813 	b_0 = B[3+0*ldb];
9814 	CC[0+bs*3] += a_0 * b_0;
9815 	CC[1+bs*3] += a_1 * b_0;
9816 	CC[2+bs*3] += a_2 * b_0;
9817 	CC[3+bs*3] += a_3 * b_0;
9818 
9819 	A += bs;
9820 	B += ldb;
9821 //	k += 1;
9822 
9823 	store:
9824 
	// scale & transpose & store
9826 	D[0+ldd*0] = alpha[0]*CC[0+bs*0];
9827 	D[0+ldd*1] = alpha[0]*CC[1+bs*0];
9828 	D[0+ldd*2] = alpha[0]*CC[2+bs*0];
9829 	D[0+ldd*3] = alpha[0]*CC[3+bs*0];
9830 
9831 	D[1+ldd*0] = alpha[0]*CC[0+bs*1];
9832 	D[1+ldd*1] = alpha[0]*CC[1+bs*1];
9833 	D[1+ldd*2] = alpha[0]*CC[2+bs*1];
9834 	D[1+ldd*3] = alpha[0]*CC[3+bs*1];
9835 
9836 	D[2+ldd*0] = alpha[0]*CC[0+bs*2];
9837 	D[2+ldd*1] = alpha[0]*CC[1+bs*2];
9838 	D[2+ldd*2] = alpha[0]*CC[2+bs*2];
9839 	D[2+ldd*3] = alpha[0]*CC[3+bs*2];
9840 
9841 	D[3+ldd*0] = alpha[0]*CC[0+bs*3];
9842 	D[3+ldd*1] = alpha[0]*CC[1+bs*3];
9843 	D[3+ldd*2] = alpha[0]*CC[2+bs*3];
9844 	D[3+ldd*3] = alpha[0]*CC[3+bs*3];
9845 
9846 	return;
9847 
9848 	}
9849 #endif
9850 
9851 
9852 
9853 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
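// variable-size variant of kernel_dtrmm_nt_rl_4x4_tran_lib4c4c: the triangular corner is restricted via m1
// and the store is clipped to m1 rows and n1 columns of D (see the XXX note below on the swapped m1/n1 roles).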
void kernel_dtrmm_nt_rl_4x4_tran_vs_lib4c4c(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd, int m1, int n1)
9855 	{
9856 
9857 	const int bs = 4;
9858 
9859 	double
9860 		tmp,
9861 		a_0, a_1, a_2, a_3,
9862 		b_0, b_1, b_2, b_3;
9863 
9864 #if defined(TARGET_GENERIC)
9865 	double CC[16] = {0};
9866 #else
9867 	ALIGNED( double CC[16], 64 ) = {0};
9868 #endif
9869 
9870 	double alpha1 = 1.0;
9871 
9872 	kernel_dgemm_nt_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, bs, CC, bs);
9873 
9874 	A += kmax*bs;
9875 	B += kmax*ldb;
9876 
9877 	// XXX m1 and n1 are swapped !!!!!
9878 	if(m1>=4)
9879 		{
9880 
9881 		// k = 0
9882 
9883 		a_0 = A[0];
9884 		a_1 = A[1];
9885 		a_2 = A[2];
9886 		a_3 = A[3];
9887 
9888 		b_0 = B[0+0*ldb];
9889 		CC[0+bs*0] += a_0 * b_0;
9890 		CC[1+bs*0] += a_1 * b_0;
9891 		CC[2+bs*0] += a_2 * b_0;
9892 		CC[3+bs*0] += a_3 * b_0;
9893 
9894 		b_0 = B[1+0*ldb];
9895 		CC[0+bs*1] += a_0 * b_0;
9896 		CC[1+bs*1] += a_1 * b_0;
9897 		CC[2+bs*1] += a_2 * b_0;
9898 		CC[3+bs*1] += a_3 * b_0;
9899 
9900 		b_0 = B[2+0*ldb];
9901 		CC[0+bs*2] += a_0 * b_0;
9902 		CC[1+bs*2] += a_1 * b_0;
9903 		CC[2+bs*2] += a_2 * b_0;
9904 		CC[3+bs*2] += a_3 * b_0;
9905 
9906 		b_0 = B[3+0*ldb];
9907 		CC[0+bs*3] += a_0 * b_0;
9908 		CC[1+bs*3] += a_1 * b_0;
9909 		CC[2+bs*3] += a_2 * b_0;
9910 		CC[3+bs*3] += a_3 * b_0;
9911 
9912 		A += bs;
9913 		B += ldb;
9914 //		k += 1;
9915 
9916 		// k = 1
9917 
9918 		a_0 = A[0];
9919 		a_1 = A[1];
9920 		a_2 = A[2];
9921 		a_3 = A[3];
9922 
9923 		b_0 = B[1+0*ldb];
9924 		CC[0+bs*1] += a_0 * b_0;
9925 		CC[1+bs*1] += a_1 * b_0;
9926 		CC[2+bs*1] += a_2 * b_0;
9927 		CC[3+bs*1] += a_3 * b_0;
9928 
9929 		b_0 = B[2+0*ldb];
9930 		CC[0+bs*2] += a_0 * b_0;
9931 		CC[1+bs*2] += a_1 * b_0;
9932 		CC[2+bs*2] += a_2 * b_0;
9933 		CC[3+bs*2] += a_3 * b_0;
9934 
9935 		b_0 = B[3+0*ldb];
9936 		CC[0+bs*3] += a_0 * b_0;
9937 		CC[1+bs*3] += a_1 * b_0;
9938 		CC[2+bs*3] += a_2 * b_0;
9939 		CC[3+bs*3] += a_3 * b_0;
9940 
9941 		A += bs;
9942 		B += ldb;
9943 //		k += 1;
9944 
9945 		// k = 2
9946 
9947 		a_0 = A[0];
9948 		a_1 = A[1];
9949 		a_2 = A[2];
9950 		a_3 = A[3];
9951 
9952 		b_0 = B[2+0*ldb];
9953 		CC[0+bs*2] += a_0 * b_0;
9954 		CC[1+bs*2] += a_1 * b_0;
9955 		CC[2+bs*2] += a_2 * b_0;
9956 		CC[3+bs*2] += a_3 * b_0;
9957 
9958 		b_0 = B[3+0*ldb];
9959 		CC[0+bs*3] += a_0 * b_0;
9960 		CC[1+bs*3] += a_1 * b_0;
9961 		CC[2+bs*3] += a_2 * b_0;
9962 		CC[3+bs*3] += a_3 * b_0;
9963 
9964 		A += bs;
9965 		B += ldb;
9966 //		k += 1;
9967 
9968 		// k = 3
9969 
9970 		a_0 = A[0];
9971 		a_1 = A[1];
9972 		a_2 = A[2];
9973 		a_3 = A[3];
9974 
9975 		b_0 = B[3+0*ldb];
9976 		CC[0+bs*3] += a_0 * b_0;
9977 		CC[1+bs*3] += a_1 * b_0;
9978 		CC[2+bs*3] += a_2 * b_0;
9979 		CC[3+bs*3] += a_3 * b_0;
9980 
9981 		A += bs;
9982 		B += ldb;
9983 //		k += 1;
9984 
9985 		}
9986 	else if(m1==3)
9987 		{
9988 
9989 		// k = 0
9990 
9991 		a_0 = A[0];
9992 		a_1 = A[1];
9993 		a_2 = A[2];
9994 		a_3 = A[3];
9995 
9996 		b_0 = B[0+0*ldb];
9997 		CC[0+bs*0] += a_0 * b_0;
9998 		CC[1+bs*0] += a_1 * b_0;
9999 		CC[2+bs*0] += a_2 * b_0;
10000 		CC[3+bs*0] += a_3 * b_0;
10001 
10002 		b_0 = B[1+0*ldb];
10003 		CC[0+bs*1] += a_0 * b_0;
10004 		CC[1+bs*1] += a_1 * b_0;
10005 		CC[2+bs*1] += a_2 * b_0;
10006 		CC[3+bs*1] += a_3 * b_0;
10007 
10008 		b_0 = B[2+0*ldb];
10009 		CC[0+bs*2] += a_0 * b_0;
10010 		CC[1+bs*2] += a_1 * b_0;
10011 		CC[2+bs*2] += a_2 * b_0;
10012 		CC[3+bs*2] += a_3 * b_0;
10013 
10014 		A += bs;
10015 		B += ldb;
10016 //		k += 1;
10017 
10018 		// k = 1
10019 
10020 		a_0 = A[0];
10021 		a_1 = A[1];
10022 		a_2 = A[2];
10023 		a_3 = A[3];
10024 
10025 		b_0 = B[1+0*ldb];
10026 		CC[0+bs*1] += a_0 * b_0;
10027 		CC[1+bs*1] += a_1 * b_0;
10028 		CC[2+bs*1] += a_2 * b_0;
10029 		CC[3+bs*1] += a_3 * b_0;
10030 
10031 		b_0 = B[2+0*ldb];
10032 		CC[0+bs*2] += a_0 * b_0;
10033 		CC[1+bs*2] += a_1 * b_0;
10034 		CC[2+bs*2] += a_2 * b_0;
10035 		CC[3+bs*2] += a_3 * b_0;
10036 
10037 		A += bs;
10038 		B += ldb;
10039 //		k += 1;
10040 
10041 		// k = 2
10042 
10043 		a_0 = A[0];
10044 		a_1 = A[1];
10045 		a_2 = A[2];
10046 		a_3 = A[3];
10047 
10048 		b_0 = B[2+0*ldb];
10049 		CC[0+bs*2] += a_0 * b_0;
10050 		CC[1+bs*2] += a_1 * b_0;
10051 		CC[2+bs*2] += a_2 * b_0;
10052 		CC[3+bs*2] += a_3 * b_0;
10053 
10054 		A += bs;
10055 		B += ldb;
10056 //		k += 1;
10057 
10058 		// k = 3
10059 
10060 		A += bs;
10061 		B += ldb;
10062 //		k += 1;
10063 
10064 		}
10065 	else if(m1==2)
10066 		{
10067 
10068 		// k = 0
10069 
10070 		a_0 = A[0];
10071 		a_1 = A[1];
10072 		a_2 = A[2];
10073 		a_3 = A[3];
10074 
10075 		b_0 = B[0+0*ldb];
10076 		CC[0+bs*0] += a_0 * b_0;
10077 		CC[1+bs*0] += a_1 * b_0;
10078 		CC[2+bs*0] += a_2 * b_0;
10079 		CC[3+bs*0] += a_3 * b_0;
10080 
10081 		b_0 = B[1+0*ldb];
10082 		CC[0+bs*1] += a_0 * b_0;
10083 		CC[1+bs*1] += a_1 * b_0;
10084 		CC[2+bs*1] += a_2 * b_0;
10085 		CC[3+bs*1] += a_3 * b_0;
10086 
10087 		A += bs;
10088 		B += ldb;
10089 //		k += 1;
10090 
10091 		// k = 1
10092 
10093 		a_0 = A[0];
10094 		a_1 = A[1];
10095 		a_2 = A[2];
10096 		a_3 = A[3];
10097 
10098 		b_0 = B[1+0*ldb];
10099 		CC[0+bs*1] += a_0 * b_0;
10100 		CC[1+bs*1] += a_1 * b_0;
10101 		CC[2+bs*1] += a_2 * b_0;
10102 		CC[3+bs*1] += a_3 * b_0;
10103 
10104 		A += bs;
10105 		B += ldb;
10106 //		k += 1;
10107 
10108 		// k = 2
10109 
10110 		A += bs;
10111 		B += ldb;
10112 //		k += 1;
10113 
10114 		// k = 3
10115 
10116 		A += bs;
10117 		B += ldb;
10118 //		k += 1;
10119 
10120 		}
10121 	else if(m1==1)
10122 		{
10123 
10124 		// k = 0
10125 
10126 		a_0 = A[0];
10127 		a_1 = A[1];
10128 		a_2 = A[2];
10129 		a_3 = A[3];
10130 
10131 		b_0 = B[0+0*ldb];
10132 		CC[0+bs*0] += a_0 * b_0;
10133 		CC[1+bs*0] += a_1 * b_0;
10134 		CC[2+bs*0] += a_2 * b_0;
10135 		CC[3+bs*0] += a_3 * b_0;
10136 
10137 		A += bs;
10138 		B += ldb;
10139 //		k += 1;
10140 
10141 		// k = 1
10142 
10143 		A += bs;
10144 		B += ldb;
10145 //		k += 1;
10146 
10147 		// k = 2
10148 
10149 		A += bs;
10150 		B += ldb;
10151 //		k += 1;
10152 
10153 		// k = 3
10154 
10155 		A += bs;
10156 		B += ldb;
10157 //		k += 1;
10158 
10159 		}
10160 
10161 	store:
10162 
10163 	// scale
10164 	CC[0+bs*0] = alpha[0]*CC[0+bs*0];
10165 	CC[1+bs*0] = alpha[0]*CC[1+bs*0];
10166 	CC[2+bs*0] = alpha[0]*CC[2+bs*0];
10167 	CC[3+bs*0] = alpha[0]*CC[3+bs*0];
10168 
10169 	CC[0+bs*1] = alpha[0]*CC[0+bs*1];
10170 	CC[1+bs*1] = alpha[0]*CC[1+bs*1];
10171 	CC[2+bs*1] = alpha[0]*CC[2+bs*1];
10172 	CC[3+bs*1] = alpha[0]*CC[3+bs*1];
10173 
10174 	CC[0+bs*2] = alpha[0]*CC[0+bs*2];
10175 	CC[1+bs*2] = alpha[0]*CC[1+bs*2];
10176 	CC[2+bs*2] = alpha[0]*CC[2+bs*2];
10177 	CC[3+bs*2] = alpha[0]*CC[3+bs*2];
10178 
10179 	CC[0+bs*3] = alpha[0]*CC[0+bs*3];
10180 	CC[1+bs*3] = alpha[0]*CC[1+bs*3];
10181 	CC[2+bs*3] = alpha[0]*CC[2+bs*3];
10182 	CC[3+bs*3] = alpha[0]*CC[3+bs*3];
10183 
10184 	// transpose
10185 	tmp = CC[1+bs*0]; CC[1+bs*0] = CC[0+bs*1]; CC[0+bs*1] = tmp;
10186 	tmp = CC[2+bs*0]; CC[2+bs*0] = CC[0+bs*2]; CC[0+bs*2] = tmp;
10187 	tmp = CC[3+bs*0]; CC[3+bs*0] = CC[0+bs*3]; CC[0+bs*3] = tmp;
10188 
10189 	tmp = CC[2+bs*1]; CC[2+bs*1] = CC[1+bs*2]; CC[1+bs*2] = tmp;
10190 	tmp = CC[3+bs*1]; CC[3+bs*1] = CC[1+bs*3]; CC[1+bs*3] = tmp;
10191 
10192 	tmp = CC[3+bs*2]; CC[3+bs*2] = CC[2+bs*3]; CC[2+bs*3] = tmp;
10193 
10194 	// store
10195 	if(m1>=4)
10196 		{
10197 		D[0+ldd*0] = CC[0+bs*0];
10198 		D[1+ldd*0] = CC[1+bs*0];
10199 		D[2+ldd*0] = CC[2+bs*0];
10200 		D[3+ldd*0] = CC[3+bs*0];
10201 
10202 		if(n1==1)
10203 			return;
10204 
10205 		D[0+ldd*1] = CC[0+bs*1];
10206 		D[1+ldd*1] = CC[1+bs*1];
10207 		D[2+ldd*1] = CC[2+bs*1];
10208 		D[3+ldd*1] = CC[3+bs*1];
10209 
10210 		if(n1==2)
10211 			return;
10212 
10213 		D[0+ldd*2] = CC[0+bs*2];
10214 		D[1+ldd*2] = CC[1+bs*2];
10215 		D[2+ldd*2] = CC[2+bs*2];
10216 		D[3+ldd*2] = CC[3+bs*2];
10217 
10218 		if(n1==3)
10219 			return;
10220 
10221 		D[0+ldd*3] = CC[0+bs*3];
10222 		D[1+ldd*3] = CC[1+bs*3];
10223 		D[2+ldd*3] = CC[2+bs*3];
10224 		D[3+ldd*3] = CC[3+bs*3];
10225 		}
10226 	else if(m1>=3)
10227 		{
10228 		D[0+ldd*0] = CC[0+bs*0];
10229 		D[1+ldd*0] = CC[1+bs*0];
10230 		D[2+ldd*0] = CC[2+bs*0];
10231 
10232 		if(n1==1)
10233 			return;
10234 
10235 		D[0+ldd*1] = CC[0+bs*1];
10236 		D[1+ldd*1] = CC[1+bs*1];
10237 		D[2+ldd*1] = CC[2+bs*1];
10238 
10239 		if(n1==2)
10240 			return;
10241 
10242 		D[0+ldd*2] = CC[0+bs*2];
10243 		D[1+ldd*2] = CC[1+bs*2];
10244 		D[2+ldd*2] = CC[2+bs*2];
10245 
10246 		if(n1==3)
10247 			return;
10248 
10249 		D[0+ldd*3] = CC[0+bs*3];
10250 		D[1+ldd*3] = CC[1+bs*3];
10251 		D[2+ldd*3] = CC[2+bs*3];
10252 		}
10253 	else if(m1>=2)
10254 		{
10255 		D[0+ldd*0] = CC[0+bs*0];
10256 		D[1+ldd*0] = CC[1+bs*0];
10257 
10258 		if(n1==1)
10259 			return;
10260 
10261 		D[0+ldd*1] = CC[0+bs*1];
10262 		D[1+ldd*1] = CC[1+bs*1];
10263 
10264 		if(n1==2)
10265 			return;
10266 
10267 		D[0+ldd*2] = CC[0+bs*2];
10268 		D[1+ldd*2] = CC[1+bs*2];
10269 
10270 		if(n1==3)
10271 			return;
10272 
10273 		D[0+ldd*3] = CC[0+bs*3];
10274 		D[1+ldd*3] = CC[1+bs*3];
10275 		}
10276 	else //if(m1>=1)
10277 		{
10278 		D[0+ldd*0] = CC[0+bs*0];
10279 
10280 		if(n1==1)
10281 			return;
10282 
10283 		D[0+ldd*1] = CC[0+bs*1];
10284 
10285 		if(n1==2)
10286 			return;
10287 
10288 		D[0+ldd*2] = CC[0+bs*2];
10289 
10290 		if(n1==3)
10291 			return;
10292 
10293 		D[0+ldd*3] = CC[0+bs*3];
10294 		}
10295 
10296 	return;
10297 
10298 	}
10299 #endif
10300 
10301 
10302 
10303 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
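// dtrmm nt rl with unit diagonal ('one'): as kernel_dtrmm_nt_rl_4x4_lib4ccc, but the diagonal entries of
// the triangular factor are taken as 1, so each diagonal contribution adds the A column directly.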
void kernel_dtrmm_nt_rl_one_4x4_lib4ccc(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd)
10305 	{
10306 
10307 	const int bs = 4;
10308 
10309 	double
10310 		a_0, a_1, a_2, a_3,
10311 		b_0, b_1, b_2, b_3;
10312 
10313 #if defined(TARGET_GENERIC)
10314 	double CC[16] = {0};
10315 #else
10316 	ALIGNED( double CC[16], 64 ) = {0};
10317 #endif
10318 
10319 	double alpha1 = 1.0;
10320 
10321 	// assume always kmax>=4 !!!
10322 
10323 	kernel_dgemm_nt_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, ldc, CC, bs);
10324 
10325 	A += kmax*bs;
10326 	B += kmax*ldb;
10327 
10328 	// k = 0
10329 
10330 	a_0 = A[0];
10331 	a_1 = A[1];
10332 	a_2 = A[2];
10333 	a_3 = A[3];
10334 
10335 	CC[0+bs*0] += a_0;
10336 	CC[1+bs*0] += a_1;
10337 	CC[2+bs*0] += a_2;
10338 	CC[3+bs*0] += a_3;
10339 
10340 	b_0 = B[1+0*ldb];
10341 	CC[0+bs*1] += a_0 * b_0;
10342 	CC[1+bs*1] += a_1 * b_0;
10343 	CC[2+bs*1] += a_2 * b_0;
10344 	CC[3+bs*1] += a_3 * b_0;
10345 
10346 	b_0 = B[2+0*ldb];
10347 	CC[0+bs*2] += a_0 * b_0;
10348 	CC[1+bs*2] += a_1 * b_0;
10349 	CC[2+bs*2] += a_2 * b_0;
10350 	CC[3+bs*2] += a_3 * b_0;
10351 
10352 	b_0 = B[3+0*ldb];
10353 	CC[0+bs*3] += a_0 * b_0;
10354 	CC[1+bs*3] += a_1 * b_0;
10355 	CC[2+bs*3] += a_2 * b_0;
10356 	CC[3+bs*3] += a_3 * b_0;
10357 
10358 	A += bs;
10359 	B += ldb;
10360 //	k += 1;
10361 
10362 	// k = 1
10363 
10364 	a_0 = A[0];
10365 	a_1 = A[1];
10366 	a_2 = A[2];
10367 	a_3 = A[3];
10368 
10369 	CC[0+bs*1] += a_0;
10370 	CC[1+bs*1] += a_1;
10371 	CC[2+bs*1] += a_2;
10372 	CC[3+bs*1] += a_3;
10373 
10374 	b_0 = B[2+0*ldb];
10375 	CC[0+bs*2] += a_0 * b_0;
10376 	CC[1+bs*2] += a_1 * b_0;
10377 	CC[2+bs*2] += a_2 * b_0;
10378 	CC[3+bs*2] += a_3 * b_0;
10379 
10380 	b_0 = B[3+0*ldb];
10381 	CC[0+bs*3] += a_0 * b_0;
10382 	CC[1+bs*3] += a_1 * b_0;
10383 	CC[2+bs*3] += a_2 * b_0;
10384 	CC[3+bs*3] += a_3 * b_0;
10385 
10386 	A += bs;
10387 	B += ldb;
10388 //	k += 1;
10389 
10390 	// k = 2
10391 
10392 	a_0 = A[0];
10393 	a_1 = A[1];
10394 	a_2 = A[2];
10395 	a_3 = A[3];
10396 
10397 	CC[0+bs*2] += a_0;
10398 	CC[1+bs*2] += a_1;
10399 	CC[2+bs*2] += a_2;
10400 	CC[3+bs*2] += a_3;
10401 
10402 	b_0 = B[3+0*ldb];
10403 	CC[0+bs*3] += a_0 * b_0;
10404 	CC[1+bs*3] += a_1 * b_0;
10405 	CC[2+bs*3] += a_2 * b_0;
10406 	CC[3+bs*3] += a_3 * b_0;
10407 
10408 	A += bs;
10409 	B += ldb;
10410 //	k += 1;
10411 
10412 	// k = 3
10413 
10414 	a_0 = A[0];
10415 	a_1 = A[1];
10416 	a_2 = A[2];
10417 	a_3 = A[3];
10418 
10419 	CC[0+bs*3] += a_0;
10420 	CC[1+bs*3] += a_1;
10421 	CC[2+bs*3] += a_2;
10422 	CC[3+bs*3] += a_3;
10423 
10424 	A += bs;
10425 	B += ldb;
10426 //	k += 1;
10427 
10428 	store:
10429 
10430 	D[0+ldd*0] = alpha[0]*CC[0+bs*0];
10431 	D[1+ldd*0] = alpha[0]*CC[1+bs*0];
10432 	D[2+ldd*0] = alpha[0]*CC[2+bs*0];
10433 	D[3+ldd*0] = alpha[0]*CC[3+bs*0];
10434 
10435 	D[0+ldd*1] = alpha[0]*CC[0+bs*1];
10436 	D[1+ldd*1] = alpha[0]*CC[1+bs*1];
10437 	D[2+ldd*1] = alpha[0]*CC[2+bs*1];
10438 	D[3+ldd*1] = alpha[0]*CC[3+bs*1];
10439 
10440 	D[0+ldd*2] = alpha[0]*CC[0+bs*2];
10441 	D[1+ldd*2] = alpha[0]*CC[1+bs*2];
10442 	D[2+ldd*2] = alpha[0]*CC[2+bs*2];
10443 	D[3+ldd*2] = alpha[0]*CC[3+bs*2];
10444 
10445 	D[0+ldd*3] = alpha[0]*CC[0+bs*3];
10446 	D[1+ldd*3] = alpha[0]*CC[1+bs*3];
10447 	D[2+ldd*3] = alpha[0]*CC[2+bs*3];
10448 	D[3+ldd*3] = alpha[0]*CC[3+bs*3];
10449 
10450 	return;
10451 
10452 	}
10453 #endif
10454 
10455 
10456 
10457 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
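// variable-size variant of kernel_dtrmm_nt_rl_one_4x4_lib4ccc: triangular corner restricted to the first
// n1 columns, store clipped to the m1 x n1 active part of the tile.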
void kernel_dtrmm_nt_rl_one_4x4_vs_lib4ccc(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1)
10459 	{
10460 
10461 	const int bs = 4;
10462 
10463 	double
10464 		a_0, a_1, a_2, a_3,
10465 		b_0, b_1, b_2, b_3;
10466 
10467 #if defined(TARGET_GENERIC)
10468 	double CC[16] = {0};
10469 #else
10470 	ALIGNED( double CC[16], 64 ) = {0};
10471 #endif
10472 
10473 	double alpha1 = 1.0;
10474 
10475 	kernel_dgemm_nt_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, ldc, CC, bs);
10476 
10477 	A += kmax*bs;
10478 	B += kmax*ldb;
10479 
10480 	if(n1>=4)
10481 		{
10482 
10483 		// k = 0
10484 
10485 		a_0 = A[0];
10486 		a_1 = A[1];
10487 		a_2 = A[2];
10488 		a_3 = A[3];
10489 
10490 		CC[0+bs*0] += a_0;
10491 		CC[1+bs*0] += a_1;
10492 		CC[2+bs*0] += a_2;
10493 		CC[3+bs*0] += a_3;
10494 
10495 		b_0 = B[1+0*ldb];
10496 		CC[0+bs*1] += a_0 * b_0;
10497 		CC[1+bs*1] += a_1 * b_0;
10498 		CC[2+bs*1] += a_2 * b_0;
10499 		CC[3+bs*1] += a_3 * b_0;
10500 
10501 		b_0 = B[2+0*ldb];
10502 		CC[0+bs*2] += a_0 * b_0;
10503 		CC[1+bs*2] += a_1 * b_0;
10504 		CC[2+bs*2] += a_2 * b_0;
10505 		CC[3+bs*2] += a_3 * b_0;
10506 
10507 		b_0 = B[3+0*ldb];
10508 		CC[0+bs*3] += a_0 * b_0;
10509 		CC[1+bs*3] += a_1 * b_0;
10510 		CC[2+bs*3] += a_2 * b_0;
10511 		CC[3+bs*3] += a_3 * b_0;
10512 
10513 		A += bs;
10514 		B += ldb;
10515 //		k += 1;
10516 
10517 		// k = 1
10518 
10519 		a_0 = A[0];
10520 		a_1 = A[1];
10521 		a_2 = A[2];
10522 		a_3 = A[3];
10523 
10524 		CC[0+bs*1] += a_0;
10525 		CC[1+bs*1] += a_1;
10526 		CC[2+bs*1] += a_2;
10527 		CC[3+bs*1] += a_3;
10528 
10529 		b_0 = B[2+0*ldb];
10530 		CC[0+bs*2] += a_0 * b_0;
10531 		CC[1+bs*2] += a_1 * b_0;
10532 		CC[2+bs*2] += a_2 * b_0;
10533 		CC[3+bs*2] += a_3 * b_0;
10534 
10535 		b_0 = B[3+0*ldb];
10536 		CC[0+bs*3] += a_0 * b_0;
10537 		CC[1+bs*3] += a_1 * b_0;
10538 		CC[2+bs*3] += a_2 * b_0;
10539 		CC[3+bs*3] += a_3 * b_0;
10540 
10541 		A += bs;
10542 		B += ldb;
10543 //		k += 1;
10544 
10545 		// k = 2
10546 
10547 		a_0 = A[0];
10548 		a_1 = A[1];
10549 		a_2 = A[2];
10550 		a_3 = A[3];
10551 
10552 		CC[0+bs*2] += a_0;
10553 		CC[1+bs*2] += a_1;
10554 		CC[2+bs*2] += a_2;
10555 		CC[3+bs*2] += a_3;
10556 
10557 		b_0 = B[3+0*ldb];
10558 		CC[0+bs*3] += a_0 * b_0;
10559 		CC[1+bs*3] += a_1 * b_0;
10560 		CC[2+bs*3] += a_2 * b_0;
10561 		CC[3+bs*3] += a_3 * b_0;
10562 
10563 		A += bs;
10564 		B += ldb;
10565 //		k += 1;
10566 
10567 		// k = 3
10568 
10569 		a_0 = A[0];
10570 		a_1 = A[1];
10571 		a_2 = A[2];
10572 		a_3 = A[3];
10573 
10574 		CC[0+bs*3] += a_0;
10575 		CC[1+bs*3] += a_1;
10576 		CC[2+bs*3] += a_2;
10577 		CC[3+bs*3] += a_3;
10578 
10579 		A += bs;
10580 		B += ldb;
10581 //		k += 1;
10582 
10583 		}
10584 	else if(n1==3)
10585 		{
10586 
10587 		// k = 0
10588 
10589 		a_0 = A[0];
10590 		a_1 = A[1];
10591 		a_2 = A[2];
10592 		a_3 = A[3];
10593 
10594 		CC[0+bs*0] += a_0;
10595 		CC[1+bs*0] += a_1;
10596 		CC[2+bs*0] += a_2;
10597 		CC[3+bs*0] += a_3;
10598 
10599 		b_0 = B[1+0*ldb];
10600 		CC[0+bs*1] += a_0 * b_0;
10601 		CC[1+bs*1] += a_1 * b_0;
10602 		CC[2+bs*1] += a_2 * b_0;
10603 		CC[3+bs*1] += a_3 * b_0;
10604 
10605 		b_0 = B[2+0*ldb];
10606 		CC[0+bs*2] += a_0 * b_0;
10607 		CC[1+bs*2] += a_1 * b_0;
10608 		CC[2+bs*2] += a_2 * b_0;
10609 		CC[3+bs*2] += a_3 * b_0;
10610 
10611 		A += bs;
10612 		B += ldb;
10613 //		k += 1;
10614 
10615 		// k = 1
10616 
10617 		a_0 = A[0];
10618 		a_1 = A[1];
10619 		a_2 = A[2];
10620 		a_3 = A[3];
10621 
10622 		CC[0+bs*1] += a_0;
10623 		CC[1+bs*1] += a_1;
10624 		CC[2+bs*1] += a_2;
10625 		CC[3+bs*1] += a_3;
10626 
10627 		b_0 = B[2+0*ldb];
10628 		CC[0+bs*2] += a_0 * b_0;
10629 		CC[1+bs*2] += a_1 * b_0;
10630 		CC[2+bs*2] += a_2 * b_0;
10631 		CC[3+bs*2] += a_3 * b_0;
10632 
10633 		A += bs;
10634 		B += ldb;
10635 //		k += 1;
10636 
10637 		// k = 2
10638 
10639 		a_0 = A[0];
10640 		a_1 = A[1];
10641 		a_2 = A[2];
10642 		a_3 = A[3];
10643 
10644 		CC[0+bs*2] += a_0;
10645 		CC[1+bs*2] += a_1;
10646 		CC[2+bs*2] += a_2;
10647 		CC[3+bs*2] += a_3;
10648 
10649 		A += bs;
10650 		B += ldb;
10651 //		k += 1;
10652 
10653 		// k = 3
10654 
10655 		A += bs;
10656 		B += ldb;
10657 //		k += 1;
10658 
10659 		}
10660 	else if(n1==2)
10661 		{
10662 
10663 		// k = 0
10664 
10665 		a_0 = A[0];
10666 		a_1 = A[1];
10667 		a_2 = A[2];
10668 		a_3 = A[3];
10669 
10670 		CC[0+bs*0] += a_0;
10671 		CC[1+bs*0] += a_1;
10672 		CC[2+bs*0] += a_2;
10673 		CC[3+bs*0] += a_3;
10674 
10675 		b_0 = B[1+0*ldb];
10676 		CC[0+bs*1] += a_0 * b_0;
10677 		CC[1+bs*1] += a_1 * b_0;
10678 		CC[2+bs*1] += a_2 * b_0;
10679 		CC[3+bs*1] += a_3 * b_0;
10680 
10681 		A += bs;
10682 		B += ldb;
10683 //		k += 1;
10684 
10685 		// k = 1
10686 
10687 		a_0 = A[0];
10688 		a_1 = A[1];
10689 		a_2 = A[2];
10690 		a_3 = A[3];
10691 
10692 		CC[0+bs*1] += a_0;
10693 		CC[1+bs*1] += a_1;
10694 		CC[2+bs*1] += a_2;
10695 		CC[3+bs*1] += a_3;
10696 
10697 		A += bs;
10698 		B += ldb;
10699 //		k += 1;
10700 
10701 		// k = 2
10702 
10703 		A += bs;
10704 		B += ldb;
10705 //		k += 1;
10706 
10707 		// k = 3
10708 
10709 		A += bs;
10710 		B += ldb;
10711 //		k += 1;
10712 
10713 		}
10714 	else if(n1==1)
10715 		{
10716 
10717 		// k = 0
10718 
10719 		a_0 = A[0];
10720 		a_1 = A[1];
10721 		a_2 = A[2];
10722 		a_3 = A[3];
10723 
10724 		CC[0+bs*0] += a_0;
10725 		CC[1+bs*0] += a_1;
10726 		CC[2+bs*0] += a_2;
10727 		CC[3+bs*0] += a_3;
10728 
10729 		A += bs;
10730 		B += ldb;
10731 //		k += 1;
10732 
10733 		// k = 1
10734 
10735 		A += bs;
10736 		B += ldb;
10737 //		k += 1;
10738 
10739 		// k = 2
10740 
10741 		A += bs;
10742 		B += ldb;
10743 //		k += 1;
10744 
10745 		// k = 3
10746 
10747 		A += bs;
10748 		B += ldb;
10749 //		k += 1;
10750 
10751 		}
10752 
10753 	store:
10754 
10755 	CC[0+bs*0] = alpha[0]*CC[0+bs*0];
10756 	CC[1+bs*0] = alpha[0]*CC[1+bs*0];
10757 	CC[2+bs*0] = alpha[0]*CC[2+bs*0];
10758 	CC[3+bs*0] = alpha[0]*CC[3+bs*0];
10759 
10760 	CC[0+bs*1] = alpha[0]*CC[0+bs*1];
10761 	CC[1+bs*1] = alpha[0]*CC[1+bs*1];
10762 	CC[2+bs*1] = alpha[0]*CC[2+bs*1];
10763 	CC[3+bs*1] = alpha[0]*CC[3+bs*1];
10764 
10765 	CC[0+bs*2] = alpha[0]*CC[0+bs*2];
10766 	CC[1+bs*2] = alpha[0]*CC[1+bs*2];
10767 	CC[2+bs*2] = alpha[0]*CC[2+bs*2];
10768 	CC[3+bs*2] = alpha[0]*CC[3+bs*2];
10769 
10770 	CC[0+bs*3] = alpha[0]*CC[0+bs*3];
10771 	CC[1+bs*3] = alpha[0]*CC[1+bs*3];
10772 	CC[2+bs*3] = alpha[0]*CC[2+bs*3];
10773 	CC[3+bs*3] = alpha[0]*CC[3+bs*3];
10774 
10775 	if(m1>=4)
10776 		{
10777 		D[0+ldd*0] = CC[0+bs*0];
10778 		D[1+ldd*0] = CC[1+bs*0];
10779 		D[2+ldd*0] = CC[2+bs*0];
10780 		D[3+ldd*0] = CC[3+bs*0];
10781 
10782 		if(n1==1)
10783 			return;
10784 
10785 		D[0+ldd*1] = CC[0+bs*1];
10786 		D[1+ldd*1] = CC[1+bs*1];
10787 		D[2+ldd*1] = CC[2+bs*1];
10788 		D[3+ldd*1] = CC[3+bs*1];
10789 
10790 		if(n1==2)
10791 			return;
10792 
10793 		D[0+ldd*2] = CC[0+bs*2];
10794 		D[1+ldd*2] = CC[1+bs*2];
10795 		D[2+ldd*2] = CC[2+bs*2];
10796 		D[3+ldd*2] = CC[3+bs*2];
10797 
10798 		if(n1==3)
10799 			return;
10800 
10801 		D[0+ldd*3] = CC[0+bs*3];
10802 		D[1+ldd*3] = CC[1+bs*3];
10803 		D[2+ldd*3] = CC[2+bs*3];
10804 		D[3+ldd*3] = CC[3+bs*3];
10805 		}
10806 	else if(m1>=3)
10807 		{
10808 		D[0+ldd*0] = CC[0+bs*0];
10809 		D[1+ldd*0] = CC[1+bs*0];
10810 		D[2+ldd*0] = CC[2+bs*0];
10811 
10812 		if(n1==1)
10813 			return;
10814 
10815 		D[0+ldd*1] = CC[0+bs*1];
10816 		D[1+ldd*1] = CC[1+bs*1];
10817 		D[2+ldd*1] = CC[2+bs*1];
10818 
10819 		if(n1==2)
10820 			return;
10821 
10822 		D[0+ldd*2] = CC[0+bs*2];
10823 		D[1+ldd*2] = CC[1+bs*2];
10824 		D[2+ldd*2] = CC[2+bs*2];
10825 
10826 		if(n1==3)
10827 			return;
10828 
10829 		D[0+ldd*3] = CC[0+bs*3];
10830 		D[1+ldd*3] = CC[1+bs*3];
10831 		D[2+ldd*3] = CC[2+bs*3];
10832 		}
10833 	else if(m1>=2)
10834 		{
10835 		D[0+ldd*0] = CC[0+bs*0];
10836 		D[1+ldd*0] = CC[1+bs*0];
10837 
10838 		if(n1==1)
10839 			return;
10840 
10841 		D[0+ldd*1] = CC[0+bs*1];
10842 		D[1+ldd*1] = CC[1+bs*1];
10843 
10844 		if(n1==2)
10845 			return;
10846 
10847 		D[0+ldd*2] = CC[0+bs*2];
10848 		D[1+ldd*2] = CC[1+bs*2];
10849 
10850 		if(n1==3)
10851 			return;
10852 
10853 		D[0+ldd*3] = CC[0+bs*3];
10854 		D[1+ldd*3] = CC[1+bs*3];
10855 		}
10856 	else //if(m1>=1)
10857 		{
10858 		D[0+ldd*0] = CC[0+bs*0];
10859 
10860 		if(n1==1)
10861 			return;
10862 
10863 		D[0+ldd*1] = CC[0+bs*1];
10864 
10865 		if(n1==2)
10866 			return;
10867 
10868 		D[0+ldd*2] = CC[0+bs*2];
10869 
10870 		if(n1==3)
10871 			return;
10872 
10873 		D[0+ldd*3] = CC[0+bs*3];
10874 		}
10875 
10876 	return;
10877 
10878 	}
10879 #endif
10880 
10881 
10882 
10883 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
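// as kernel_dtrmm_nt_rl_one_4x4_lib4ccc (unit diagonal), but with C panel-major (lib4) and the alpha-scaled
// result stored transposed into the column-major D.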
void kernel_dtrmm_nt_rl_one_4x4_tran_lib4c4c(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd)
10885 	{
10886 
10887 	const int bs = 4;
10888 
10889 	double
10890 		tmp,
10891 		a_0, a_1, a_2, a_3,
10892 		b_0, b_1, b_2, b_3;
10893 
10894 #if defined(TARGET_GENERIC)
10895 	double CC[16] = {0};
10896 #else
10897 	ALIGNED( double CC[16], 64 ) = {0};
10898 #endif
10899 
10900 	double alpha1 = 1.0;
10901 
10902 	// assume always kmax>=4 !!!
10903 
10904 	kernel_dgemm_nt_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, bs, CC, bs);
10905 
10906 	A += kmax*bs;
10907 	B += kmax*ldb;
10908 
10909 	// k = 0
10910 
10911 	a_0 = A[0];
10912 	a_1 = A[1];
10913 	a_2 = A[2];
10914 	a_3 = A[3];
10915 
10916 	CC[0+bs*0] += a_0;
10917 	CC[1+bs*0] += a_1;
10918 	CC[2+bs*0] += a_2;
10919 	CC[3+bs*0] += a_3;
10920 
10921 	b_0 = B[1+0*ldb];
10922 	CC[0+bs*1] += a_0 * b_0;
10923 	CC[1+bs*1] += a_1 * b_0;
10924 	CC[2+bs*1] += a_2 * b_0;
10925 	CC[3+bs*1] += a_3 * b_0;
10926 
10927 	b_0 = B[2+0*ldb];
10928 	CC[0+bs*2] += a_0 * b_0;
10929 	CC[1+bs*2] += a_1 * b_0;
10930 	CC[2+bs*2] += a_2 * b_0;
10931 	CC[3+bs*2] += a_3 * b_0;
10932 
10933 	b_0 = B[3+0*ldb];
10934 	CC[0+bs*3] += a_0 * b_0;
10935 	CC[1+bs*3] += a_1 * b_0;
10936 	CC[2+bs*3] += a_2 * b_0;
10937 	CC[3+bs*3] += a_3 * b_0;
10938 
10939 	A += bs;
10940 	B += ldb;
10941 //	k += 1;
10942 
10943 	// k = 1
10944 
10945 	a_0 = A[0];
10946 	a_1 = A[1];
10947 	a_2 = A[2];
10948 	a_3 = A[3];
10949 
10950 	CC[0+bs*1] += a_0;
10951 	CC[1+bs*1] += a_1;
10952 	CC[2+bs*1] += a_2;
10953 	CC[3+bs*1] += a_3;
10954 
10955 	b_0 = B[2+0*ldb];
10956 	CC[0+bs*2] += a_0 * b_0;
10957 	CC[1+bs*2] += a_1 * b_0;
10958 	CC[2+bs*2] += a_2 * b_0;
10959 	CC[3+bs*2] += a_3 * b_0;
10960 
10961 	b_0 = B[3+0*ldb];
10962 	CC[0+bs*3] += a_0 * b_0;
10963 	CC[1+bs*3] += a_1 * b_0;
10964 	CC[2+bs*3] += a_2 * b_0;
10965 	CC[3+bs*3] += a_3 * b_0;
10966 
10967 	A += bs;
10968 	B += ldb;
10969 //	k += 1;
10970 
10971 	// k = 2
10972 
10973 	a_0 = A[0];
10974 	a_1 = A[1];
10975 	a_2 = A[2];
10976 	a_3 = A[3];
10977 
10978 	CC[0+bs*2] += a_0;
10979 	CC[1+bs*2] += a_1;
10980 	CC[2+bs*2] += a_2;
10981 	CC[3+bs*2] += a_3;
10982 
10983 	b_0 = B[3+0*ldb];
10984 	CC[0+bs*3] += a_0 * b_0;
10985 	CC[1+bs*3] += a_1 * b_0;
10986 	CC[2+bs*3] += a_2 * b_0;
10987 	CC[3+bs*3] += a_3 * b_0;
10988 
10989 	A += bs;
10990 	B += ldb;
10991 //	k += 1;
10992 
10993 	// k = 3
10994 
10995 	a_0 = A[0];
10996 	a_1 = A[1];
10997 	a_2 = A[2];
10998 	a_3 = A[3];
10999 
11000 	CC[0+bs*3] += a_0;
11001 	CC[1+bs*3] += a_1;
11002 	CC[2+bs*3] += a_2;
11003 	CC[3+bs*3] += a_3;
11004 
11005 	A += bs;
11006 	B += ldb;
11007 //	k += 1;
11008 
11009 	store:
11010 
	// scale & transpose & store
11012 	D[0+ldd*0] = alpha[0]*CC[0+bs*0];
11013 	D[0+ldd*1] = alpha[0]*CC[1+bs*0];
11014 	D[0+ldd*2] = alpha[0]*CC[2+bs*0];
11015 	D[0+ldd*3] = alpha[0]*CC[3+bs*0];
11016 
11017 	D[1+ldd*0] = alpha[0]*CC[0+bs*1];
11018 	D[1+ldd*1] = alpha[0]*CC[1+bs*1];
11019 	D[1+ldd*2] = alpha[0]*CC[2+bs*1];
11020 	D[1+ldd*3] = alpha[0]*CC[3+bs*1];
11021 
11022 	D[2+ldd*0] = alpha[0]*CC[0+bs*2];
11023 	D[2+ldd*1] = alpha[0]*CC[1+bs*2];
11024 	D[2+ldd*2] = alpha[0]*CC[2+bs*2];
11025 	D[2+ldd*3] = alpha[0]*CC[3+bs*2];
11026 
11027 	D[3+ldd*0] = alpha[0]*CC[0+bs*3];
11028 	D[3+ldd*1] = alpha[0]*CC[1+bs*3];
11029 	D[3+ldd*2] = alpha[0]*CC[2+bs*3];
11030 	D[3+ldd*3] = alpha[0]*CC[3+bs*3];
11031 
11032 	return;
11033 
11034 	}
11035 #endif
11036 
11037 
11038 
11039 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
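// variable-size variant of kernel_dtrmm_nt_rl_one_4x4_tran_lib4c4c: triangular corner restricted via m1,
// store clipped to m1 rows and n1 columns of D (see the XXX note below on the swapped m1/n1 roles).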
void kernel_dtrmm_nt_rl_one_4x4_tran_vs_lib4c4c(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd, int m1, int n1)
11041 	{
11042 
11043 	const int bs = 4;
11044 
11045 	double
11046 		tmp,
11047 		a_0, a_1, a_2, a_3,
11048 		b_0, b_1, b_2, b_3;
11049 
11050 #if defined(TARGET_GENERIC)
11051 	double CC[16] = {0};
11052 #else
11053 	ALIGNED( double CC[16], 64 ) = {0};
11054 #endif
11055 
11056 	double alpha1 = 1.0;
11057 
11058 	kernel_dgemm_nt_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, bs, CC, bs);
11059 
11060 	A += kmax*bs;
11061 	B += kmax*ldb;
11062 
11063 	// XXX m1 and n1 are swapped !!!!!
11064 	if(m1>=4)
11065 		{
11066 
11067 		// k = 0
11068 
11069 		a_0 = A[0];
11070 		a_1 = A[1];
11071 		a_2 = A[2];
11072 		a_3 = A[3];
11073 
11074 		CC[0+bs*0] += a_0;
11075 		CC[1+bs*0] += a_1;
11076 		CC[2+bs*0] += a_2;
11077 		CC[3+bs*0] += a_3;
11078 
11079 		b_0 = B[1+0*ldb];
11080 		CC[0+bs*1] += a_0 * b_0;
11081 		CC[1+bs*1] += a_1 * b_0;
11082 		CC[2+bs*1] += a_2 * b_0;
11083 		CC[3+bs*1] += a_3 * b_0;
11084 
11085 		b_0 = B[2+0*ldb];
11086 		CC[0+bs*2] += a_0 * b_0;
11087 		CC[1+bs*2] += a_1 * b_0;
11088 		CC[2+bs*2] += a_2 * b_0;
11089 		CC[3+bs*2] += a_3 * b_0;
11090 
11091 		b_0 = B[3+0*ldb];
11092 		CC[0+bs*3] += a_0 * b_0;
11093 		CC[1+bs*3] += a_1 * b_0;
11094 		CC[2+bs*3] += a_2 * b_0;
11095 		CC[3+bs*3] += a_3 * b_0;
11096 
11097 		A += bs;
11098 		B += ldb;
11099 //		k += 1;
11100 
11101 		// k = 1
11102 
11103 		a_0 = A[0];
11104 		a_1 = A[1];
11105 		a_2 = A[2];
11106 		a_3 = A[3];
11107 
11108 		CC[0+bs*1] += a_0;
11109 		CC[1+bs*1] += a_1;
11110 		CC[2+bs*1] += a_2;
11111 		CC[3+bs*1] += a_3;
11112 
11113 		b_0 = B[2+0*ldb];
11114 		CC[0+bs*2] += a_0 * b_0;
11115 		CC[1+bs*2] += a_1 * b_0;
11116 		CC[2+bs*2] += a_2 * b_0;
11117 		CC[3+bs*2] += a_3 * b_0;
11118 
11119 		b_0 = B[3+0*ldb];
11120 		CC[0+bs*3] += a_0 * b_0;
11121 		CC[1+bs*3] += a_1 * b_0;
11122 		CC[2+bs*3] += a_2 * b_0;
11123 		CC[3+bs*3] += a_3 * b_0;
11124 
11125 		A += bs;
11126 		B += ldb;
11127 //		k += 1;
11128 
11129 		// k = 2
11130 
11131 		a_0 = A[0];
11132 		a_1 = A[1];
11133 		a_2 = A[2];
11134 		a_3 = A[3];
11135 
11136 		CC[0+bs*2] += a_0;
11137 		CC[1+bs*2] += a_1;
11138 		CC[2+bs*2] += a_2;
11139 		CC[3+bs*2] += a_3;
11140 
11141 		b_0 = B[3+0*ldb];
11142 		CC[0+bs*3] += a_0 * b_0;
11143 		CC[1+bs*3] += a_1 * b_0;
11144 		CC[2+bs*3] += a_2 * b_0;
11145 		CC[3+bs*3] += a_3 * b_0;
11146 
11147 		A += bs;
11148 		B += ldb;
11149 //		k += 1;
11150 
11151 		// k = 3
11152 
11153 		a_0 = A[0];
11154 		a_1 = A[1];
11155 		a_2 = A[2];
11156 		a_3 = A[3];
11157 
11158 		CC[0+bs*3] += a_0;
11159 		CC[1+bs*3] += a_1;
11160 		CC[2+bs*3] += a_2;
11161 		CC[3+bs*3] += a_3;
11162 
11163 		A += bs;
11164 		B += ldb;
11165 //		k += 1;
11166 
11167 		}
11168 	else if(m1==3)
11169 		{
11170 
11171 		// k = 0
11172 
11173 		a_0 = A[0];
11174 		a_1 = A[1];
11175 		a_2 = A[2];
11176 		a_3 = A[3];
11177 
11178 		CC[0+bs*0] += a_0;
11179 		CC[1+bs*0] += a_1;
11180 		CC[2+bs*0] += a_2;
11181 		CC[3+bs*0] += a_3;
11182 
11183 		b_0 = B[1+0*ldb];
11184 		CC[0+bs*1] += a_0 * b_0;
11185 		CC[1+bs*1] += a_1 * b_0;
11186 		CC[2+bs*1] += a_2 * b_0;
11187 		CC[3+bs*1] += a_3 * b_0;
11188 
11189 		b_0 = B[2+0*ldb];
11190 		CC[0+bs*2] += a_0 * b_0;
11191 		CC[1+bs*2] += a_1 * b_0;
11192 		CC[2+bs*2] += a_2 * b_0;
11193 		CC[3+bs*2] += a_3 * b_0;
11194 
11195 		A += bs;
11196 		B += ldb;
11197 //		k += 1;
11198 
11199 		// k = 1
11200 
11201 		a_0 = A[0];
11202 		a_1 = A[1];
11203 		a_2 = A[2];
11204 		a_3 = A[3];
11205 
11206 		CC[0+bs*1] += a_0;
11207 		CC[1+bs*1] += a_1;
11208 		CC[2+bs*1] += a_2;
11209 		CC[3+bs*1] += a_3;
11210 
11211 		b_0 = B[2+0*ldb];
11212 		CC[0+bs*2] += a_0 * b_0;
11213 		CC[1+bs*2] += a_1 * b_0;
11214 		CC[2+bs*2] += a_2 * b_0;
11215 		CC[3+bs*2] += a_3 * b_0;
11216 
11217 		A += bs;
11218 		B += ldb;
11219 //		k += 1;
11220 
11221 		// k = 2
11222 
11223 		a_0 = A[0];
11224 		a_1 = A[1];
11225 		a_2 = A[2];
11226 		a_3 = A[3];
11227 
11228 		CC[0+bs*2] += a_0;
11229 		CC[1+bs*2] += a_1;
11230 		CC[2+bs*2] += a_2;
11231 		CC[3+bs*2] += a_3;
11232 
11233 		A += bs;
11234 		B += ldb;
11235 //		k += 1;
11236 
11237 		// k = 3
11238 
11239 		A += bs;
11240 		B += ldb;
11241 //		k += 1;
11242 
11243 		}
11244 	else if(m1==2)
11245 		{
11246 
11247 		// k = 0
11248 
11249 		a_0 = A[0];
11250 		a_1 = A[1];
11251 		a_2 = A[2];
11252 		a_3 = A[3];
11253 
11254 		CC[0+bs*0] += a_0;
11255 		CC[1+bs*0] += a_1;
11256 		CC[2+bs*0] += a_2;
11257 		CC[3+bs*0] += a_3;
11258 
11259 		b_0 = B[1+0*ldb];
11260 		CC[0+bs*1] += a_0 * b_0;
11261 		CC[1+bs*1] += a_1 * b_0;
11262 		CC[2+bs*1] += a_2 * b_0;
11263 		CC[3+bs*1] += a_3 * b_0;
11264 
11265 		A += bs;
11266 		B += ldb;
11267 //		k += 1;
11268 
11269 		// k = 1
11270 
11271 		a_0 = A[0];
11272 		a_1 = A[1];
11273 		a_2 = A[2];
11274 		a_3 = A[3];
11275 
11276 		CC[0+bs*1] += a_0;
11277 		CC[1+bs*1] += a_1;
11278 		CC[2+bs*1] += a_2;
11279 		CC[3+bs*1] += a_3;
11280 
11281 		A += bs;
11282 		B += ldb;
11283 //		k += 1;
11284 
11285 		// k = 2
11286 
11287 		A += bs;
11288 		B += ldb;
11289 //		k += 1;
11290 
11291 		// k = 3
11292 
11293 		A += bs;
11294 		B += ldb;
11295 //		k += 1;
11296 
11297 		}
11298 	else if(m1==1)
11299 		{
11300 
11301 		// k = 0
11302 
11303 		a_0 = A[0];
11304 		a_1 = A[1];
11305 		a_2 = A[2];
11306 		a_3 = A[3];
11307 
11308 		CC[0+bs*0] += a_0;
11309 		CC[1+bs*0] += a_1;
11310 		CC[2+bs*0] += a_2;
11311 		CC[3+bs*0] += a_3;
11312 
11313 		A += bs;
11314 		B += ldb;
11315 //		k += 1;
11316 
11317 		// k = 1
11318 
11319 		A += bs;
11320 		B += ldb;
11321 //		k += 1;
11322 
11323 		// k = 2
11324 
11325 		A += bs;
11326 		B += ldb;
11327 //		k += 1;
11328 
11329 		// k = 3
11330 
11331 		A += bs;
11332 		B += ldb;
11333 //		k += 1;
11334 
11335 		}
11336 
11337 	store:
11338 
11339 	// scale
11340 	CC[0+bs*0] = alpha[0]*CC[0+bs*0];
11341 	CC[1+bs*0] = alpha[0]*CC[1+bs*0];
11342 	CC[2+bs*0] = alpha[0]*CC[2+bs*0];
11343 	CC[3+bs*0] = alpha[0]*CC[3+bs*0];
11344 
11345 	CC[0+bs*1] = alpha[0]*CC[0+bs*1];
11346 	CC[1+bs*1] = alpha[0]*CC[1+bs*1];
11347 	CC[2+bs*1] = alpha[0]*CC[2+bs*1];
11348 	CC[3+bs*1] = alpha[0]*CC[3+bs*1];
11349 
11350 	CC[0+bs*2] = alpha[0]*CC[0+bs*2];
11351 	CC[1+bs*2] = alpha[0]*CC[1+bs*2];
11352 	CC[2+bs*2] = alpha[0]*CC[2+bs*2];
11353 	CC[3+bs*2] = alpha[0]*CC[3+bs*2];
11354 
11355 	CC[0+bs*3] = alpha[0]*CC[0+bs*3];
11356 	CC[1+bs*3] = alpha[0]*CC[1+bs*3];
11357 	CC[2+bs*3] = alpha[0]*CC[2+bs*3];
11358 	CC[3+bs*3] = alpha[0]*CC[3+bs*3];
11359 
11360 	// transpose
11361 	tmp = CC[1+bs*0]; CC[1+bs*0] = CC[0+bs*1]; CC[0+bs*1] = tmp;
11362 	tmp = CC[2+bs*0]; CC[2+bs*0] = CC[0+bs*2]; CC[0+bs*2] = tmp;
11363 	tmp = CC[3+bs*0]; CC[3+bs*0] = CC[0+bs*3]; CC[0+bs*3] = tmp;
11364 
11365 	tmp = CC[2+bs*1]; CC[2+bs*1] = CC[1+bs*2]; CC[1+bs*2] = tmp;
11366 	tmp = CC[3+bs*1]; CC[3+bs*1] = CC[1+bs*3]; CC[1+bs*3] = tmp;
11367 
11368 	tmp = CC[3+bs*2]; CC[3+bs*2] = CC[2+bs*3]; CC[2+bs*3] = tmp;
11369 
11370 	// store
11371 	if(m1>=4)
11372 		{
11373 		D[0+ldd*0] = CC[0+bs*0];
11374 		D[1+ldd*0] = CC[1+bs*0];
11375 		D[2+ldd*0] = CC[2+bs*0];
11376 		D[3+ldd*0] = CC[3+bs*0];
11377 
11378 		if(n1==1)
11379 			return;
11380 
11381 		D[0+ldd*1] = CC[0+bs*1];
11382 		D[1+ldd*1] = CC[1+bs*1];
11383 		D[2+ldd*1] = CC[2+bs*1];
11384 		D[3+ldd*1] = CC[3+bs*1];
11385 
11386 		if(n1==2)
11387 			return;
11388 
11389 		D[0+ldd*2] = CC[0+bs*2];
11390 		D[1+ldd*2] = CC[1+bs*2];
11391 		D[2+ldd*2] = CC[2+bs*2];
11392 		D[3+ldd*2] = CC[3+bs*2];
11393 
11394 		if(n1==3)
11395 			return;
11396 
11397 		D[0+ldd*3] = CC[0+bs*3];
11398 		D[1+ldd*3] = CC[1+bs*3];
11399 		D[2+ldd*3] = CC[2+bs*3];
11400 		D[3+ldd*3] = CC[3+bs*3];
11401 		}
11402 	else if(m1>=3)
11403 		{
11404 		D[0+ldd*0] = CC[0+bs*0];
11405 		D[1+ldd*0] = CC[1+bs*0];
11406 		D[2+ldd*0] = CC[2+bs*0];
11407 
11408 		if(n1==1)
11409 			return;
11410 
11411 		D[0+ldd*1] = CC[0+bs*1];
11412 		D[1+ldd*1] = CC[1+bs*1];
11413 		D[2+ldd*1] = CC[2+bs*1];
11414 
11415 		if(n1==2)
11416 			return;
11417 
11418 		D[0+ldd*2] = CC[0+bs*2];
11419 		D[1+ldd*2] = CC[1+bs*2];
11420 		D[2+ldd*2] = CC[2+bs*2];
11421 
11422 		if(n1==3)
11423 			return;
11424 
11425 		D[0+ldd*3] = CC[0+bs*3];
11426 		D[1+ldd*3] = CC[1+bs*3];
11427 		D[2+ldd*3] = CC[2+bs*3];
11428 		}
11429 	else if(m1>=2)
11430 		{
11431 		D[0+ldd*0] = CC[0+bs*0];
11432 		D[1+ldd*0] = CC[1+bs*0];
11433 
11434 		if(n1==1)
11435 			return;
11436 
11437 		D[0+ldd*1] = CC[0+bs*1];
11438 		D[1+ldd*1] = CC[1+bs*1];
11439 
11440 		if(n1==2)
11441 			return;
11442 
11443 		D[0+ldd*2] = CC[0+bs*2];
11444 		D[1+ldd*2] = CC[1+bs*2];
11445 
11446 		if(n1==3)
11447 			return;
11448 
11449 		D[0+ldd*3] = CC[0+bs*3];
11450 		D[1+ldd*3] = CC[1+bs*3];
11451 		}
11452 	else //if(m1>=1)
11453 		{
11454 		D[0+ldd*0] = CC[0+bs*0];
11455 
11456 		if(n1==1)
11457 			return;
11458 
11459 		D[0+ldd*1] = CC[0+bs*1];
11460 
11461 		if(n1==2)
11462 			return;
11463 
11464 		D[0+ldd*2] = CC[0+bs*2];
11465 
11466 		if(n1==3)
11467 			return;
11468 
11469 		D[0+ldd*3] = CC[0+bs*3];
11470 		}
11471 
11472 	return;
11473 
11474 	}
11475 #endif
11476 
11477 
11478 
11479 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
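// Descriptive note (inferred from the loop structure below and the BLASFEO naming convention):
// 4x4 dtrmm_nt_ru kernel, lib44cc layout. It computes D = alpha * A * B^T + beta * C, with A and B
// panel-major (block size 4) and C, D column-major (leading dimensions ldc, ldd). In the first four
// iterations only B[0..k] is read, i.e. the leading 4x4 block of B is treated as upper triangular;
// the remaining kmax-4 iterations are delegated to kernel_dgemm_nt_4x4_lib4.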
11480 void kernel_dtrmm_nt_ru_4x4_lib44cc(int kmax, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd)
11481 	{
11482 
11483 	const int bs = 4;
11484 
11485 	double
11486 		a_0, a_1, a_2, a_3,
11487 		b_0, b_1, b_2, b_3;
11488 
11489 #if defined(TARGET_GENERIC)
11490 	double CC[16] = {0};
11491 #else
11492 	ALIGNED( double CC[16], 64 ) = {0};
11493 #endif
11494 
11495 	int k;
11496 
11497 	k = 0;
11498 
11499 	// k = 0
11500 
11501 	a_0 = A[0];
11502 	a_1 = A[1];
11503 	a_2 = A[2];
11504 	a_3 = A[3];
11505 
11506 	b_0 = B[0+0*bs];
11507 	CC[0+bs*0] += a_0 * b_0;
11508 	CC[1+bs*0] += a_1 * b_0;
11509 	CC[2+bs*0] += a_2 * b_0;
11510 	CC[3+bs*0] += a_3 * b_0;
11511 
11512 	A += bs;
11513 	B += bs;
11514 	k += 1;
11515 
11516 	if(k>=kmax)
11517 		goto store;
11518 
11519 	// k = 1
11520 
11521 	a_0 = A[0];
11522 	a_1 = A[1];
11523 	a_2 = A[2];
11524 	a_3 = A[3];
11525 
11526 	b_0 = B[0+0*bs];
11527 	CC[0+bs*0] += a_0 * b_0;
11528 	CC[1+bs*0] += a_1 * b_0;
11529 	CC[2+bs*0] += a_2 * b_0;
11530 	CC[3+bs*0] += a_3 * b_0;
11531 
11532 	b_1 = B[1+0*bs];
11533 	CC[0+bs*1] += a_0 * b_1;
11534 	CC[1+bs*1] += a_1 * b_1;
11535 	CC[2+bs*1] += a_2 * b_1;
11536 	CC[3+bs*1] += a_3 * b_1;
11537 
11538 	A += bs;
11539 	B += bs;
11540 	k += 1;
11541 
11542 	if(k>=kmax)
11543 		goto store;
11544 
11545 	// k = 2
11546 
11547 	a_0 = A[0];
11548 	a_1 = A[1];
11549 	a_2 = A[2];
11550 	a_3 = A[3];
11551 
11552 	b_0 = B[0+0*bs];
11553 	CC[0+bs*0] += a_0 * b_0;
11554 	CC[1+bs*0] += a_1 * b_0;
11555 	CC[2+bs*0] += a_2 * b_0;
11556 	CC[3+bs*0] += a_3 * b_0;
11557 
11558 	b_1 = B[1+0*bs];
11559 	CC[0+bs*1] += a_0 * b_1;
11560 	CC[1+bs*1] += a_1 * b_1;
11561 	CC[2+bs*1] += a_2 * b_1;
11562 	CC[3+bs*1] += a_3 * b_1;
11563 
11564 	b_2 = B[2+0*bs];
11565 	CC[0+bs*2] += a_0 * b_2;
11566 	CC[1+bs*2] += a_1 * b_2;
11567 	CC[2+bs*2] += a_2 * b_2;
11568 	CC[3+bs*2] += a_3 * b_2;
11569 
11570 	A += bs;
11571 	B += bs;
11572 	k += 1;
11573 
11574 	if(k>=kmax)
11575 		goto store;
11576 
11577 	// k = 3
11578 
11579 	a_0 = A[0];
11580 	a_1 = A[1];
11581 	a_2 = A[2];
11582 	a_3 = A[3];
11583 
11584 	b_0 = B[0+0*bs];
11585 	CC[0+bs*0] += a_0 * b_0;
11586 	CC[1+bs*0] += a_1 * b_0;
11587 	CC[2+bs*0] += a_2 * b_0;
11588 	CC[3+bs*0] += a_3 * b_0;
11589 
11590 	b_1 = B[1+0*bs];
11591 	CC[0+bs*1] += a_0 * b_1;
11592 	CC[1+bs*1] += a_1 * b_1;
11593 	CC[2+bs*1] += a_2 * b_1;
11594 	CC[3+bs*1] += a_3 * b_1;
11595 
11596 	b_2 = B[2+0*bs];
11597 	CC[0+bs*2] += a_0 * b_2;
11598 	CC[1+bs*2] += a_1 * b_2;
11599 	CC[2+bs*2] += a_2 * b_2;
11600 	CC[3+bs*2] += a_3 * b_2;
11601 
11602 	b_3 = B[3+0*bs];
11603 	CC[0+bs*3] += a_0 * b_3;
11604 	CC[1+bs*3] += a_1 * b_3;
11605 	CC[2+bs*3] += a_2 * b_3;
11606 	CC[3+bs*3] += a_3 * b_3;
11607 
11608 	A += bs;
11609 	B += bs;
11610 	k += 1;
11611 
11612 	store:
11613 
11614 	CC[0+bs*0] = alpha[0]*CC[0+bs*0] + beta[0]*C[0+ldc*0];
11615 	CC[1+bs*0] = alpha[0]*CC[1+bs*0] + beta[0]*C[1+ldc*0];
11616 	CC[2+bs*0] = alpha[0]*CC[2+bs*0] + beta[0]*C[2+ldc*0];
11617 	CC[3+bs*0] = alpha[0]*CC[3+bs*0] + beta[0]*C[3+ldc*0];
11618 
11619 	CC[0+bs*1] = alpha[0]*CC[0+bs*1] + beta[0]*C[0+ldc*1];
11620 	CC[1+bs*1] = alpha[0]*CC[1+bs*1] + beta[0]*C[1+ldc*1];
11621 	CC[2+bs*1] = alpha[0]*CC[2+bs*1] + beta[0]*C[2+ldc*1];
11622 	CC[3+bs*1] = alpha[0]*CC[3+bs*1] + beta[0]*C[3+ldc*1];
11623 
11624 	CC[0+bs*2] = alpha[0]*CC[0+bs*2] + beta[0]*C[0+ldc*2];
11625 	CC[1+bs*2] = alpha[0]*CC[1+bs*2] + beta[0]*C[1+ldc*2];
11626 	CC[2+bs*2] = alpha[0]*CC[2+bs*2] + beta[0]*C[2+ldc*2];
11627 	CC[3+bs*2] = alpha[0]*CC[3+bs*2] + beta[0]*C[3+ldc*2];
11628 
11629 	CC[0+bs*3] = alpha[0]*CC[0+bs*3] + beta[0]*C[0+ldc*3];
11630 	CC[1+bs*3] = alpha[0]*CC[1+bs*3] + beta[0]*C[1+ldc*3];
11631 	CC[2+bs*3] = alpha[0]*CC[2+bs*3] + beta[0]*C[2+ldc*3];
11632 	CC[3+bs*3] = alpha[0]*CC[3+bs*3] + beta[0]*C[3+ldc*3];
11633 
11634 	double beta1 = 1.0;
11635 
11636 	kernel_dgemm_nt_4x4_lib4(kmax-k, alpha, A, B, &beta1, CC, CC);
11637 
11638 	D[0+ldd*0] = CC[0+bs*0];
11639 	D[1+ldd*0] = CC[1+bs*0];
11640 	D[2+ldd*0] = CC[2+bs*0];
11641 	D[3+ldd*0] = CC[3+bs*0];
11642 
11643 	D[0+ldd*1] = CC[0+bs*1];
11644 	D[1+ldd*1] = CC[1+bs*1];
11645 	D[2+ldd*1] = CC[2+bs*1];
11646 	D[3+ldd*1] = CC[3+bs*1];
11647 
11648 	D[0+ldd*2] = CC[0+bs*2];
11649 	D[1+ldd*2] = CC[1+bs*2];
11650 	D[2+ldd*2] = CC[2+bs*2];
11651 	D[3+ldd*2] = CC[3+bs*2];
11652 
11653 	D[0+ldd*3] = CC[0+bs*3];
11654 	D[1+ldd*3] = CC[1+bs*3];
11655 	D[2+ldd*3] = CC[2+bs*3];
11656 	D[3+ldd*3] = CC[3+bs*3];
11657 
11658 	return;
11659 
11660 	}
11661 #endif
11662 
11663 
11664 
11665 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
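// Variable-size ("vs") variant of the kernel above: same accumulation and scaling, but only the
// top-left m1 x n1 part of the 4x4 result block is stored into D, for edge blocks smaller than 4x4.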
11666 void kernel_dtrmm_nt_ru_4x4_vs_lib44cc(int kmax, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1)
11667 	{
11668 
11669 	const int bs = 4;
11670 
11671 	double
11672 		a_0, a_1, a_2, a_3,
11673 		b_0, b_1, b_2, b_3;
11674 
11675 #if defined(TARGET_GENERIC)
11676 	double CC[16] = {0};
11677 #else
11678 	ALIGNED( double CC[16], 64 ) = {0};
11679 #endif
11680 
11681 	int k;
11682 
11683 	k = 0;
11684 
11685 	// k = 0
11686 
11687 	a_0 = A[0];
11688 	a_1 = A[1];
11689 	a_2 = A[2];
11690 	a_3 = A[3];
11691 
11692 	b_0 = B[0+0*bs];
11693 	CC[0+bs*0] += a_0 * b_0;
11694 	CC[1+bs*0] += a_1 * b_0;
11695 	CC[2+bs*0] += a_2 * b_0;
11696 	CC[3+bs*0] += a_3 * b_0;
11697 
11698 	A += bs;
11699 	B += bs;
11700 	k += 1;
11701 
11702 	if(k>=kmax)
11703 		goto store;
11704 
11705 	// k = 1
11706 
11707 	a_0 = A[0];
11708 	a_1 = A[1];
11709 	a_2 = A[2];
11710 	a_3 = A[3];
11711 
11712 	b_0 = B[0+0*bs];
11713 	CC[0+bs*0] += a_0 * b_0;
11714 	CC[1+bs*0] += a_1 * b_0;
11715 	CC[2+bs*0] += a_2 * b_0;
11716 	CC[3+bs*0] += a_3 * b_0;
11717 
11718 	b_1 = B[1+0*bs];
11719 	CC[0+bs*1] += a_0 * b_1;
11720 	CC[1+bs*1] += a_1 * b_1;
11721 	CC[2+bs*1] += a_2 * b_1;
11722 	CC[3+bs*1] += a_3 * b_1;
11723 
11724 	A += bs;
11725 	B += bs;
11726 	k += 1;
11727 
11728 	if(k>=kmax)
11729 		goto store;
11730 
11731 	// k = 2
11732 
11733 	a_0 = A[0];
11734 	a_1 = A[1];
11735 	a_2 = A[2];
11736 	a_3 = A[3];
11737 
11738 	b_0 = B[0+0*bs];
11739 	CC[0+bs*0] += a_0 * b_0;
11740 	CC[1+bs*0] += a_1 * b_0;
11741 	CC[2+bs*0] += a_2 * b_0;
11742 	CC[3+bs*0] += a_3 * b_0;
11743 
11744 	b_1 = B[1+0*bs];
11745 	CC[0+bs*1] += a_0 * b_1;
11746 	CC[1+bs*1] += a_1 * b_1;
11747 	CC[2+bs*1] += a_2 * b_1;
11748 	CC[3+bs*1] += a_3 * b_1;
11749 
11750 	b_2 = B[2+0*bs];
11751 	CC[0+bs*2] += a_0 * b_2;
11752 	CC[1+bs*2] += a_1 * b_2;
11753 	CC[2+bs*2] += a_2 * b_2;
11754 	CC[3+bs*2] += a_3 * b_2;
11755 
11756 	A += bs;
11757 	B += bs;
11758 	k += 1;
11759 
11760 	if(k>=kmax)
11761 		goto store;
11762 
11763 	// k = 3
11764 
11765 	a_0 = A[0];
11766 	a_1 = A[1];
11767 	a_2 = A[2];
11768 	a_3 = A[3];
11769 
11770 	b_0 = B[0+0*bs];
11771 	CC[0+bs*0] += a_0 * b_0;
11772 	CC[1+bs*0] += a_1 * b_0;
11773 	CC[2+bs*0] += a_2 * b_0;
11774 	CC[3+bs*0] += a_3 * b_0;
11775 
11776 	b_1 = B[1+0*bs];
11777 	CC[0+bs*1] += a_0 * b_1;
11778 	CC[1+bs*1] += a_1 * b_1;
11779 	CC[2+bs*1] += a_2 * b_1;
11780 	CC[3+bs*1] += a_3 * b_1;
11781 
11782 	b_2 = B[2+0*bs];
11783 	CC[0+bs*2] += a_0 * b_2;
11784 	CC[1+bs*2] += a_1 * b_2;
11785 	CC[2+bs*2] += a_2 * b_2;
11786 	CC[3+bs*2] += a_3 * b_2;
11787 
11788 	b_3 = B[3+0*bs];
11789 	CC[0+bs*3] += a_0 * b_3;
11790 	CC[1+bs*3] += a_1 * b_3;
11791 	CC[2+bs*3] += a_2 * b_3;
11792 	CC[3+bs*3] += a_3 * b_3;
11793 
11794 	A += bs;
11795 	B += bs;
11796 	k += 1;
11797 
11798 	store:
11799 
11800 	CC[0+bs*0] = alpha[0]*CC[0+bs*0] + beta[0]*C[0+ldc*0];
11801 	CC[1+bs*0] = alpha[0]*CC[1+bs*0] + beta[0]*C[1+ldc*0];
11802 	CC[2+bs*0] = alpha[0]*CC[2+bs*0] + beta[0]*C[2+ldc*0];
11803 	CC[3+bs*0] = alpha[0]*CC[3+bs*0] + beta[0]*C[3+ldc*0];
11804 
11805 	CC[0+bs*1] = alpha[0]*CC[0+bs*1] + beta[0]*C[0+ldc*1];
11806 	CC[1+bs*1] = alpha[0]*CC[1+bs*1] + beta[0]*C[1+ldc*1];
11807 	CC[2+bs*1] = alpha[0]*CC[2+bs*1] + beta[0]*C[2+ldc*1];
11808 	CC[3+bs*1] = alpha[0]*CC[3+bs*1] + beta[0]*C[3+ldc*1];
11809 
11810 	CC[0+bs*2] = alpha[0]*CC[0+bs*2] + beta[0]*C[0+ldc*2];
11811 	CC[1+bs*2] = alpha[0]*CC[1+bs*2] + beta[0]*C[1+ldc*2];
11812 	CC[2+bs*2] = alpha[0]*CC[2+bs*2] + beta[0]*C[2+ldc*2];
11813 	CC[3+bs*2] = alpha[0]*CC[3+bs*2] + beta[0]*C[3+ldc*2];
11814 
11815 	CC[0+bs*3] = alpha[0]*CC[0+bs*3] + beta[0]*C[0+ldc*3];
11816 	CC[1+bs*3] = alpha[0]*CC[1+bs*3] + beta[0]*C[1+ldc*3];
11817 	CC[2+bs*3] = alpha[0]*CC[2+bs*3] + beta[0]*C[2+ldc*3];
11818 	CC[3+bs*3] = alpha[0]*CC[3+bs*3] + beta[0]*C[3+ldc*3];
11819 
11820 	double beta1 = 1.0;
11821 
11822 	kernel_dgemm_nt_4x4_lib4(kmax-k, alpha, A, B, &beta1, CC, CC);
11823 
11824 	if(m1>=4)
11825 		{
11826 		D[0+ldd*0] = CC[0+bs*0];
11827 		D[1+ldd*0] = CC[1+bs*0];
11828 		D[2+ldd*0] = CC[2+bs*0];
11829 		D[3+ldd*0] = CC[3+bs*0];
11830 
11831 		if(n1==1)
11832 			return;
11833 
11834 		D[0+ldd*1] = CC[0+bs*1];
11835 		D[1+ldd*1] = CC[1+bs*1];
11836 		D[2+ldd*1] = CC[2+bs*1];
11837 		D[3+ldd*1] = CC[3+bs*1];
11838 
11839 		if(n1==2)
11840 			return;
11841 
11842 		D[0+ldd*2] = CC[0+bs*2];
11843 		D[1+ldd*2] = CC[1+bs*2];
11844 		D[2+ldd*2] = CC[2+bs*2];
11845 		D[3+ldd*2] = CC[3+bs*2];
11846 
11847 		if(n1==3)
11848 			return;
11849 
11850 		D[0+ldd*3] = CC[0+bs*3];
11851 		D[1+ldd*3] = CC[1+bs*3];
11852 		D[2+ldd*3] = CC[2+bs*3];
11853 		D[3+ldd*3] = CC[3+bs*3];
11854 		}
11855 	else if(m1>=3)
11856 		{
11857 		D[0+ldd*0] = CC[0+bs*0];
11858 		D[1+ldd*0] = CC[1+bs*0];
11859 		D[2+ldd*0] = CC[2+bs*0];
11860 
11861 		if(n1==1)
11862 			return;
11863 
11864 		D[0+ldd*1] = CC[0+bs*1];
11865 		D[1+ldd*1] = CC[1+bs*1];
11866 		D[2+ldd*1] = CC[2+bs*1];
11867 
11868 		if(n1==2)
11869 			return;
11870 
11871 		D[0+ldd*2] = CC[0+bs*2];
11872 		D[1+ldd*2] = CC[1+bs*2];
11873 		D[2+ldd*2] = CC[2+bs*2];
11874 
11875 		if(n1==3)
11876 			return;
11877 
11878 		D[0+ldd*3] = CC[0+bs*3];
11879 		D[1+ldd*3] = CC[1+bs*3];
11880 		D[2+ldd*3] = CC[2+bs*3];
11881 		}
11882 	else if(m1>=2)
11883 		{
11884 		D[0+ldd*0] = CC[0+bs*0];
11885 		D[1+ldd*0] = CC[1+bs*0];
11886 
11887 		if(n1==1)
11888 			return;
11889 
11890 		D[0+ldd*1] = CC[0+bs*1];
11891 		D[1+ldd*1] = CC[1+bs*1];
11892 
11893 		if(n1==2)
11894 			return;
11895 
11896 		D[0+ldd*2] = CC[0+bs*2];
11897 		D[1+ldd*2] = CC[1+bs*2];
11898 
11899 		if(n1==3)
11900 			return;
11901 
11902 		D[0+ldd*3] = CC[0+bs*3];
11903 		D[1+ldd*3] = CC[1+bs*3];
11904 		}
11905 	else //if(m1>=1)
11906 		{
11907 		D[0+ldd*0] = CC[0+bs*0];
11908 
11909 		if(n1==1)
11910 			return;
11911 
11912 		D[0+ldd*1] = CC[0+bs*1];
11913 
11914 		if(n1==2)
11915 			return;
11916 
11917 		D[0+ldd*2] = CC[0+bs*2];
11918 
11919 		if(n1==3)
11920 			return;
11921 
11922 		D[0+ldd*3] = CC[0+bs*3];
11923 		}
11924 
11925 	return;
11926 
11927 	}
11928 #endif
11929 
11930 
11931 
11932 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
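// Transposed-store variant (lib444c layout), judging from the store code below: C is panel-major
// here, and the scaled 4x4 block alpha * A * B^T + beta * C is written transposed into column-major
// D, i.e. D[i + ldd*j] receives accumulator entry (j,i).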
11933 void kernel_dtrmm_nt_ru_4x4_tran_lib444c(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int ldd)
11934 	{
11935 
11936 	const int bs = 4;
11937 
11938 	double
11939 		a_0, a_1, a_2, a_3,
11940 		b_0, b_1, b_2, b_3;
11941 
11942 #if defined(TARGET_GENERIC)
11943 	double CC[16] = {0};
11944 #else
11945 	ALIGNED( double CC[16], 64 ) = {0};
11946 #endif
11947 
11948 	int k;
11949 
11950 	k = 0;
11951 
11952 	// k = 0
11953 
11954 	a_0 = A[0];
11955 	a_1 = A[1];
11956 	a_2 = A[2];
11957 	a_3 = A[3];
11958 
11959 	b_0 = B[0+0*bs];
11960 	CC[0+bs*0] += a_0 * b_0;
11961 	CC[1+bs*0] += a_1 * b_0;
11962 	CC[2+bs*0] += a_2 * b_0;
11963 	CC[3+bs*0] += a_3 * b_0;
11964 
11965 	A += bs;
11966 	B += bs;
11967 	k += 1;
11968 
11969 	if(k>=kmax)
11970 		goto store;
11971 
11972 	// k = 1
11973 
11974 	a_0 = A[0];
11975 	a_1 = A[1];
11976 	a_2 = A[2];
11977 	a_3 = A[3];
11978 
11979 	b_0 = B[0+0*bs];
11980 	CC[0+bs*0] += a_0 * b_0;
11981 	CC[1+bs*0] += a_1 * b_0;
11982 	CC[2+bs*0] += a_2 * b_0;
11983 	CC[3+bs*0] += a_3 * b_0;
11984 
11985 	b_1 = B[1+0*bs];
11986 	CC[0+bs*1] += a_0 * b_1;
11987 	CC[1+bs*1] += a_1 * b_1;
11988 	CC[2+bs*1] += a_2 * b_1;
11989 	CC[3+bs*1] += a_3 * b_1;
11990 
11991 	A += bs;
11992 	B += bs;
11993 	k += 1;
11994 
11995 	if(k>=kmax)
11996 		goto store;
11997 
11998 	// k = 2
11999 
12000 	a_0 = A[0];
12001 	a_1 = A[1];
12002 	a_2 = A[2];
12003 	a_3 = A[3];
12004 
12005 	b_0 = B[0+0*bs];
12006 	CC[0+bs*0] += a_0 * b_0;
12007 	CC[1+bs*0] += a_1 * b_0;
12008 	CC[2+bs*0] += a_2 * b_0;
12009 	CC[3+bs*0] += a_3 * b_0;
12010 
12011 	b_1 = B[1+0*bs];
12012 	CC[0+bs*1] += a_0 * b_1;
12013 	CC[1+bs*1] += a_1 * b_1;
12014 	CC[2+bs*1] += a_2 * b_1;
12015 	CC[3+bs*1] += a_3 * b_1;
12016 
12017 	b_2 = B[2+0*bs];
12018 	CC[0+bs*2] += a_0 * b_2;
12019 	CC[1+bs*2] += a_1 * b_2;
12020 	CC[2+bs*2] += a_2 * b_2;
12021 	CC[3+bs*2] += a_3 * b_2;
12022 
12023 	A += bs;
12024 	B += bs;
12025 	k += 1;
12026 
12027 	if(k>=kmax)
12028 		goto store;
12029 
12030 	// k = 3
12031 
12032 	a_0 = A[0];
12033 	a_1 = A[1];
12034 	a_2 = A[2];
12035 	a_3 = A[3];
12036 
12037 	b_0 = B[0+0*bs];
12038 	CC[0+bs*0] += a_0 * b_0;
12039 	CC[1+bs*0] += a_1 * b_0;
12040 	CC[2+bs*0] += a_2 * b_0;
12041 	CC[3+bs*0] += a_3 * b_0;
12042 
12043 	b_1 = B[1+0*bs];
12044 	CC[0+bs*1] += a_0 * b_1;
12045 	CC[1+bs*1] += a_1 * b_1;
12046 	CC[2+bs*1] += a_2 * b_1;
12047 	CC[3+bs*1] += a_3 * b_1;
12048 
12049 	b_2 = B[2+0*bs];
12050 	CC[0+bs*2] += a_0 * b_2;
12051 	CC[1+bs*2] += a_1 * b_2;
12052 	CC[2+bs*2] += a_2 * b_2;
12053 	CC[3+bs*2] += a_3 * b_2;
12054 
12055 	b_3 = B[3+0*bs];
12056 	CC[0+bs*3] += a_0 * b_3;
12057 	CC[1+bs*3] += a_1 * b_3;
12058 	CC[2+bs*3] += a_2 * b_3;
12059 	CC[3+bs*3] += a_3 * b_3;
12060 
12061 	A += bs;
12062 	B += bs;
12063 	k += 1;
12064 
12065 	double alpha1 = 1.0;
12066 	double beta1 = 1.0;
12067 
12068 	kernel_dgemm_nt_4x4_lib4(kmax-k, &alpha1, A, B, &beta1, CC, CC);
12069 
12070 	store:
12071 
12072 	// scale & transpose & store
12073 	D[0+ldd*0] = alpha[0]*CC[0+bs*0] + beta[0]*C[0+bs*0];
12074 	D[0+ldd*1] = alpha[0]*CC[1+bs*0] + beta[0]*C[1+bs*0];
12075 	D[0+ldd*2] = alpha[0]*CC[2+bs*0] + beta[0]*C[2+bs*0];
12076 	D[0+ldd*3] = alpha[0]*CC[3+bs*0] + beta[0]*C[3+bs*0];
12077 
12078 	D[1+ldd*0] = alpha[0]*CC[0+bs*1] + beta[0]*C[0+bs*1];
12079 	D[1+ldd*1] = alpha[0]*CC[1+bs*1] + beta[0]*C[1+bs*1];
12080 	D[1+ldd*2] = alpha[0]*CC[2+bs*1] + beta[0]*C[2+bs*1];
12081 	D[1+ldd*3] = alpha[0]*CC[3+bs*1] + beta[0]*C[3+bs*1];
12082 
12083 	D[2+ldd*0] = alpha[0]*CC[0+bs*2] + beta[0]*C[0+bs*2];
12084 	D[2+ldd*1] = alpha[0]*CC[1+bs*2] + beta[0]*C[1+bs*2];
12085 	D[2+ldd*2] = alpha[0]*CC[2+bs*2] + beta[0]*C[2+bs*2];
12086 	D[2+ldd*3] = alpha[0]*CC[3+bs*2] + beta[0]*C[3+bs*2];
12087 
12088 	D[3+ldd*0] = alpha[0]*CC[0+bs*3] + beta[0]*C[0+bs*3];
12089 	D[3+ldd*1] = alpha[0]*CC[1+bs*3] + beta[0]*C[1+bs*3];
12090 	D[3+ldd*2] = alpha[0]*CC[2+bs*3] + beta[0]*C[2+bs*3];
12091 	D[3+ldd*3] = alpha[0]*CC[3+bs*3] + beta[0]*C[3+bs*3];
12092 
12093 	return;
12094 
12095 	}
12096 #endif
12097 
12098 
12099 
12100 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
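// Transposed-store, variable-size variant: as the kernel above, but the accumulator is transposed
// in place after scaling and only the top-left m1 x n1 part is stored into D.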
12101 void kernel_dtrmm_nt_ru_4x4_tran_vs_lib444c(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int ldd, int m1, int n1)
12102 	{
12103 
12104 	const int bs = 4;
12105 
12106 	double
12107 		tmp,
12108 		a_0, a_1, a_2, a_3,
12109 		b_0, b_1, b_2, b_3;
12110 
12111 #if defined(TARGET_GENERIC)
12112 	double CC[16] = {0};
12113 #else
12114 	ALIGNED( double CC[16], 64 ) = {0};
12115 #endif
12116 
12117 	int k;
12118 
12119 	k = 0;
12120 
12121 	// k = 0
12122 
12123 	a_0 = A[0];
12124 	a_1 = A[1];
12125 	a_2 = A[2];
12126 	a_3 = A[3];
12127 
12128 	b_0 = B[0+0*bs];
12129 	CC[0+bs*0] += a_0 * b_0;
12130 	CC[1+bs*0] += a_1 * b_0;
12131 	CC[2+bs*0] += a_2 * b_0;
12132 	CC[3+bs*0] += a_3 * b_0;
12133 
12134 	A += bs;
12135 	B += bs;
12136 	k += 1;
12137 
12138 	if(k>=kmax)
12139 		goto store;
12140 
12141 	// k = 1
12142 
12143 	a_0 = A[0];
12144 	a_1 = A[1];
12145 	a_2 = A[2];
12146 	a_3 = A[3];
12147 
12148 	b_0 = B[0+0*bs];
12149 	CC[0+bs*0] += a_0 * b_0;
12150 	CC[1+bs*0] += a_1 * b_0;
12151 	CC[2+bs*0] += a_2 * b_0;
12152 	CC[3+bs*0] += a_3 * b_0;
12153 
12154 	b_1 = B[1+0*bs];
12155 	CC[0+bs*1] += a_0 * b_1;
12156 	CC[1+bs*1] += a_1 * b_1;
12157 	CC[2+bs*1] += a_2 * b_1;
12158 	CC[3+bs*1] += a_3 * b_1;
12159 
12160 	A += bs;
12161 	B += bs;
12162 	k += 1;
12163 
12164 	if(k>=kmax)
12165 		goto store;
12166 
12167 	// k = 2
12168 
12169 	a_0 = A[0];
12170 	a_1 = A[1];
12171 	a_2 = A[2];
12172 	a_3 = A[3];
12173 
12174 	b_0 = B[0+0*bs];
12175 	CC[0+bs*0] += a_0 * b_0;
12176 	CC[1+bs*0] += a_1 * b_0;
12177 	CC[2+bs*0] += a_2 * b_0;
12178 	CC[3+bs*0] += a_3 * b_0;
12179 
12180 	b_1 = B[1+0*bs];
12181 	CC[0+bs*1] += a_0 * b_1;
12182 	CC[1+bs*1] += a_1 * b_1;
12183 	CC[2+bs*1] += a_2 * b_1;
12184 	CC[3+bs*1] += a_3 * b_1;
12185 
12186 	b_2 = B[2+0*bs];
12187 	CC[0+bs*2] += a_0 * b_2;
12188 	CC[1+bs*2] += a_1 * b_2;
12189 	CC[2+bs*2] += a_2 * b_2;
12190 	CC[3+bs*2] += a_3 * b_2;
12191 
12192 	A += bs;
12193 	B += bs;
12194 	k += 1;
12195 
12196 	if(k>=kmax)
12197 		goto store;
12198 
12199 	// k = 3
12200 
12201 	a_0 = A[0];
12202 	a_1 = A[1];
12203 	a_2 = A[2];
12204 	a_3 = A[3];
12205 
12206 	b_0 = B[0+0*bs];
12207 	CC[0+bs*0] += a_0 * b_0;
12208 	CC[1+bs*0] += a_1 * b_0;
12209 	CC[2+bs*0] += a_2 * b_0;
12210 	CC[3+bs*0] += a_3 * b_0;
12211 
12212 	b_1 = B[1+0*bs];
12213 	CC[0+bs*1] += a_0 * b_1;
12214 	CC[1+bs*1] += a_1 * b_1;
12215 	CC[2+bs*1] += a_2 * b_1;
12216 	CC[3+bs*1] += a_3 * b_1;
12217 
12218 	b_2 = B[2+0*bs];
12219 	CC[0+bs*2] += a_0 * b_2;
12220 	CC[1+bs*2] += a_1 * b_2;
12221 	CC[2+bs*2] += a_2 * b_2;
12222 	CC[3+bs*2] += a_3 * b_2;
12223 
12224 	b_3 = B[3+0*bs];
12225 	CC[0+bs*3] += a_0 * b_3;
12226 	CC[1+bs*3] += a_1 * b_3;
12227 	CC[2+bs*3] += a_2 * b_3;
12228 	CC[3+bs*3] += a_3 * b_3;
12229 
12230 	A += bs;
12231 	B += bs;
12232 	k += 1;
12233 
12234 	double alpha1 = 1.0;
12235 	double beta1 = 1.0;
12236 
12237 	kernel_dgemm_nt_4x4_lib4(kmax-k, &alpha1, A, B, &beta1, CC, CC);
12238 
12239 	store:
12240 
12241 	// scale
12242 	CC[0+bs*0] = alpha[0]*CC[0+bs*0] + beta[0]*C[0+bs*0];
12243 	CC[1+bs*0] = alpha[0]*CC[1+bs*0] + beta[0]*C[1+bs*0];
12244 	CC[2+bs*0] = alpha[0]*CC[2+bs*0] + beta[0]*C[2+bs*0];
12245 	CC[3+bs*0] = alpha[0]*CC[3+bs*0] + beta[0]*C[3+bs*0];
12246 
12247 	CC[0+bs*1] = alpha[0]*CC[0+bs*1] + beta[0]*C[0+bs*1];
12248 	CC[1+bs*1] = alpha[0]*CC[1+bs*1] + beta[0]*C[1+bs*1];
12249 	CC[2+bs*1] = alpha[0]*CC[2+bs*1] + beta[0]*C[2+bs*1];
12250 	CC[3+bs*1] = alpha[0]*CC[3+bs*1] + beta[0]*C[3+bs*1];
12251 
12252 	CC[0+bs*2] = alpha[0]*CC[0+bs*2] + beta[0]*C[0+bs*2];
12253 	CC[1+bs*2] = alpha[0]*CC[1+bs*2] + beta[0]*C[1+bs*2];
12254 	CC[2+bs*2] = alpha[0]*CC[2+bs*2] + beta[0]*C[2+bs*2];
12255 	CC[3+bs*2] = alpha[0]*CC[3+bs*2] + beta[0]*C[3+bs*2];
12256 
12257 	CC[0+bs*3] = alpha[0]*CC[0+bs*3] + beta[0]*C[0+bs*3];
12258 	CC[1+bs*3] = alpha[0]*CC[1+bs*3] + beta[0]*C[1+bs*3];
12259 	CC[2+bs*3] = alpha[0]*CC[2+bs*3] + beta[0]*C[2+bs*3];
12260 	CC[3+bs*3] = alpha[0]*CC[3+bs*3] + beta[0]*C[3+bs*3];
12261 
12262 	// transpose
12263 	tmp = CC[1+bs*0]; CC[1+bs*0] = CC[0+bs*1]; CC[0+bs*1] = tmp;
12264 	tmp = CC[2+bs*0]; CC[2+bs*0] = CC[0+bs*2]; CC[0+bs*2] = tmp;
12265 	tmp = CC[3+bs*0]; CC[3+bs*0] = CC[0+bs*3]; CC[0+bs*3] = tmp;
12266 
12267 	tmp = CC[2+bs*1]; CC[2+bs*1] = CC[1+bs*2]; CC[1+bs*2] = tmp;
12268 	tmp = CC[3+bs*1]; CC[3+bs*1] = CC[1+bs*3]; CC[1+bs*3] = tmp;
12269 
12270 	tmp = CC[3+bs*2]; CC[3+bs*2] = CC[2+bs*3]; CC[2+bs*3] = tmp;
12271 
12272 	// store
12273 	if(m1>=4)
12274 		{
12275 		D[0+ldd*0] = CC[0+bs*0];
12276 		D[1+ldd*0] = CC[1+bs*0];
12277 		D[2+ldd*0] = CC[2+bs*0];
12278 		D[3+ldd*0] = CC[3+bs*0];
12279 
12280 		if(n1==1)
12281 			return;
12282 
12283 		D[0+ldd*1] = CC[0+bs*1];
12284 		D[1+ldd*1] = CC[1+bs*1];
12285 		D[2+ldd*1] = CC[2+bs*1];
12286 		D[3+ldd*1] = CC[3+bs*1];
12287 
12288 		if(n1==2)
12289 			return;
12290 
12291 		D[0+ldd*2] = CC[0+bs*2];
12292 		D[1+ldd*2] = CC[1+bs*2];
12293 		D[2+ldd*2] = CC[2+bs*2];
12294 		D[3+ldd*2] = CC[3+bs*2];
12295 
12296 		if(n1==3)
12297 			return;
12298 
12299 		D[0+ldd*3] = CC[0+bs*3];
12300 		D[1+ldd*3] = CC[1+bs*3];
12301 		D[2+ldd*3] = CC[2+bs*3];
12302 		D[3+ldd*3] = CC[3+bs*3];
12303 		}
12304 	else if(m1>=3)
12305 		{
12306 		D[0+ldd*0] = CC[0+bs*0];
12307 		D[1+ldd*0] = CC[1+bs*0];
12308 		D[2+ldd*0] = CC[2+bs*0];
12309 
12310 		if(n1==1)
12311 			return;
12312 
12313 		D[0+ldd*1] = CC[0+bs*1];
12314 		D[1+ldd*1] = CC[1+bs*1];
12315 		D[2+ldd*1] = CC[2+bs*1];
12316 
12317 		if(n1==2)
12318 			return;
12319 
12320 		D[0+ldd*2] = CC[0+bs*2];
12321 		D[1+ldd*2] = CC[1+bs*2];
12322 		D[2+ldd*2] = CC[2+bs*2];
12323 
12324 		if(n1==3)
12325 			return;
12326 
12327 		D[0+ldd*3] = CC[0+bs*3];
12328 		D[1+ldd*3] = CC[1+bs*3];
12329 		D[2+ldd*3] = CC[2+bs*3];
12330 		}
12331 	else if(m1>=2)
12332 		{
12333 		D[0+ldd*0] = CC[0+bs*0];
12334 		D[1+ldd*0] = CC[1+bs*0];
12335 
12336 		if(n1==1)
12337 			return;
12338 
12339 		D[0+ldd*1] = CC[0+bs*1];
12340 		D[1+ldd*1] = CC[1+bs*1];
12341 
12342 		if(n1==2)
12343 			return;
12344 
12345 		D[0+ldd*2] = CC[0+bs*2];
12346 		D[1+ldd*2] = CC[1+bs*2];
12347 
12348 		if(n1==3)
12349 			return;
12350 
12351 		D[0+ldd*3] = CC[0+bs*3];
12352 		D[1+ldd*3] = CC[1+bs*3];
12353 		}
12354 	else //if(m1>=1)
12355 		{
12356 		D[0+ldd*0] = CC[0+bs*0];
12357 
12358 		if(n1==1)
12359 			return;
12360 
12361 		D[0+ldd*1] = CC[0+bs*1];
12362 
12363 		if(n1==2)
12364 			return;
12365 
12366 		D[0+ldd*2] = CC[0+bs*2];
12367 
12368 		if(n1==3)
12369 			return;
12370 
12371 		D[0+ldd*3] = CC[0+bs*3];
12372 		}
12373 
12374 	return;
12375 
12376 	}
12377 #endif
12378 
12379 
12380 
12381 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
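// lib4ccc layout variant: A panel-major, B column-major with leading dimension ldb, C and D
// column-major. The leading 4x4 block of B is again treated as upper triangular; the k >= 4 tail is
// handled by kernel_dgemm_nt_4x4_lib4ccc, which in this kernel also writes the final result into D.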
12382 void kernel_dtrmm_nt_ru_4x4_lib4ccc(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd)
12383 	{
12384 
12385 	const int bs = 4;
12386 
12387 	double
12388 		a_0, a_1, a_2, a_3,
12389 		b_0, b_1, b_2, b_3;
12390 
12391 #if defined(TARGET_GENERIC)
12392 	double CC[16] = {0};
12393 #else
12394 	ALIGNED( double CC[16], 64 ) = {0};
12395 #endif
12396 
12397 	int k;
12398 
12399 	k = 0;
12400 
12401 	// k = 0
12402 
12403 	a_0 = A[0];
12404 	a_1 = A[1];
12405 	a_2 = A[2];
12406 	a_3 = A[3];
12407 
12408 	b_0 = B[0+0*ldb];
12409 	CC[0+bs*0] += a_0 * b_0;
12410 	CC[1+bs*0] += a_1 * b_0;
12411 	CC[2+bs*0] += a_2 * b_0;
12412 	CC[3+bs*0] += a_3 * b_0;
12413 
12414 	A += bs;
12415 	B += ldb;
12416 	k += 1;
12417 
12418 	if(k>=kmax)
12419 		goto store;
12420 
12421 	// k = 1
12422 
12423 	a_0 = A[0];
12424 	a_1 = A[1];
12425 	a_2 = A[2];
12426 	a_3 = A[3];
12427 
12428 	b_0 = B[0+0*ldb];
12429 	CC[0+bs*0] += a_0 * b_0;
12430 	CC[1+bs*0] += a_1 * b_0;
12431 	CC[2+bs*0] += a_2 * b_0;
12432 	CC[3+bs*0] += a_3 * b_0;
12433 
12434 	b_1 = B[1+0*ldb];
12435 	CC[0+bs*1] += a_0 * b_1;
12436 	CC[1+bs*1] += a_1 * b_1;
12437 	CC[2+bs*1] += a_2 * b_1;
12438 	CC[3+bs*1] += a_3 * b_1;
12439 
12440 	A += bs;
12441 	B += ldb;
12442 	k += 1;
12443 
12444 	if(k>=kmax)
12445 		goto store;
12446 
12447 	// k = 2
12448 
12449 	a_0 = A[0];
12450 	a_1 = A[1];
12451 	a_2 = A[2];
12452 	a_3 = A[3];
12453 
12454 	b_0 = B[0+0*ldb];
12455 	CC[0+bs*0] += a_0 * b_0;
12456 	CC[1+bs*0] += a_1 * b_0;
12457 	CC[2+bs*0] += a_2 * b_0;
12458 	CC[3+bs*0] += a_3 * b_0;
12459 
12460 	b_1 = B[1+0*ldb];
12461 	CC[0+bs*1] += a_0 * b_1;
12462 	CC[1+bs*1] += a_1 * b_1;
12463 	CC[2+bs*1] += a_2 * b_1;
12464 	CC[3+bs*1] += a_3 * b_1;
12465 
12466 	b_2 = B[2+0*ldb];
12467 	CC[0+bs*2] += a_0 * b_2;
12468 	CC[1+bs*2] += a_1 * b_2;
12469 	CC[2+bs*2] += a_2 * b_2;
12470 	CC[3+bs*2] += a_3 * b_2;
12471 
12472 	A += bs;
12473 	B += ldb;
12474 	k += 1;
12475 
12476 	if(k>=kmax)
12477 		goto store;
12478 
12479 	// k = 3
12480 
12481 	a_0 = A[0];
12482 	a_1 = A[1];
12483 	a_2 = A[2];
12484 	a_3 = A[3];
12485 
12486 	b_0 = B[0+0*ldb];
12487 	CC[0+bs*0] += a_0 * b_0;
12488 	CC[1+bs*0] += a_1 * b_0;
12489 	CC[2+bs*0] += a_2 * b_0;
12490 	CC[3+bs*0] += a_3 * b_0;
12491 
12492 	b_1 = B[1+0*ldb];
12493 	CC[0+bs*1] += a_0 * b_1;
12494 	CC[1+bs*1] += a_1 * b_1;
12495 	CC[2+bs*1] += a_2 * b_1;
12496 	CC[3+bs*1] += a_3 * b_1;
12497 
12498 	b_2 = B[2+0*ldb];
12499 	CC[0+bs*2] += a_0 * b_2;
12500 	CC[1+bs*2] += a_1 * b_2;
12501 	CC[2+bs*2] += a_2 * b_2;
12502 	CC[3+bs*2] += a_3 * b_2;
12503 
12504 	b_3 = B[3+0*ldb];
12505 	CC[0+bs*3] += a_0 * b_3;
12506 	CC[1+bs*3] += a_1 * b_3;
12507 	CC[2+bs*3] += a_2 * b_3;
12508 	CC[3+bs*3] += a_3 * b_3;
12509 
12510 	A += bs;
12511 	B += ldb;
12512 	k += 1;
12513 
12514 	store:
12515 
12516 	CC[0+bs*0] = alpha[0]*CC[0+bs*0] + beta[0]*C[0+ldc*0];
12517 	CC[1+bs*0] = alpha[0]*CC[1+bs*0] + beta[0]*C[1+ldc*0];
12518 	CC[2+bs*0] = alpha[0]*CC[2+bs*0] + beta[0]*C[2+ldc*0];
12519 	CC[3+bs*0] = alpha[0]*CC[3+bs*0] + beta[0]*C[3+ldc*0];
12520 
12521 	CC[0+bs*1] = alpha[0]*CC[0+bs*1] + beta[0]*C[0+ldc*1];
12522 	CC[1+bs*1] = alpha[0]*CC[1+bs*1] + beta[0]*C[1+ldc*1];
12523 	CC[2+bs*1] = alpha[0]*CC[2+bs*1] + beta[0]*C[2+ldc*1];
12524 	CC[3+bs*1] = alpha[0]*CC[3+bs*1] + beta[0]*C[3+ldc*1];
12525 
12526 	CC[0+bs*2] = alpha[0]*CC[0+bs*2] + beta[0]*C[0+ldc*2];
12527 	CC[1+bs*2] = alpha[0]*CC[1+bs*2] + beta[0]*C[1+ldc*2];
12528 	CC[2+bs*2] = alpha[0]*CC[2+bs*2] + beta[0]*C[2+ldc*2];
12529 	CC[3+bs*2] = alpha[0]*CC[3+bs*2] + beta[0]*C[3+ldc*2];
12530 
12531 	CC[0+bs*3] = alpha[0]*CC[0+bs*3] + beta[0]*C[0+ldc*3];
12532 	CC[1+bs*3] = alpha[0]*CC[1+bs*3] + beta[0]*C[1+ldc*3];
12533 	CC[2+bs*3] = alpha[0]*CC[2+bs*3] + beta[0]*C[2+ldc*3];
12534 	CC[3+bs*3] = alpha[0]*CC[3+bs*3] + beta[0]*C[3+ldc*3];
12535 
12536 	double beta1 = 1.0;
12537 
12538 	kernel_dgemm_nt_4x4_lib4ccc(kmax-k, alpha, A, B, ldb, &beta1, CC, bs, D, ldd);
12539 
12540 	return;
12541 
12542 	}
12543 #endif
12544 
12545 
12546 
12547 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
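// Variable-size variant of the lib4ccc kernel above: here the tail gemm accumulates back into the
// local buffer CC, and only the top-left m1 x n1 part of the result is stored into D.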
12548 void kernel_dtrmm_nt_ru_4x4_vs_lib4ccc(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1)
12549 	{
12550 
12551 	const int bs = 4;
12552 
12553 	double
12554 		a_0, a_1, a_2, a_3,
12555 		b_0, b_1, b_2, b_3;
12556 
12557 #if defined(TARGET_GENERIC)
12558 	double CC[16] = {0};
12559 #else
12560 	ALIGNED( double CC[16], 64 ) = {0};
12561 #endif
12562 
12563 	int k;
12564 
12565 	k = 0;
12566 
12567 	// k = 0
12568 
12569 	a_0 = A[0];
12570 	a_1 = A[1];
12571 	a_2 = A[2];
12572 	a_3 = A[3];
12573 
12574 	b_0 = B[0+0*ldb];
12575 	CC[0+bs*0] += a_0 * b_0;
12576 	CC[1+bs*0] += a_1 * b_0;
12577 	CC[2+bs*0] += a_2 * b_0;
12578 	CC[3+bs*0] += a_3 * b_0;
12579 
12580 	A += bs;
12581 	B += ldb;
12582 	k += 1;
12583 
12584 	if(k>=kmax)
12585 		goto store;
12586 
12587 	// k = 1
12588 
12589 	a_0 = A[0];
12590 	a_1 = A[1];
12591 	a_2 = A[2];
12592 	a_3 = A[3];
12593 
12594 	b_0 = B[0+0*ldb];
12595 	CC[0+bs*0] += a_0 * b_0;
12596 	CC[1+bs*0] += a_1 * b_0;
12597 	CC[2+bs*0] += a_2 * b_0;
12598 	CC[3+bs*0] += a_3 * b_0;
12599 
12600 	b_1 = B[1+0*ldb];
12601 	CC[0+bs*1] += a_0 * b_1;
12602 	CC[1+bs*1] += a_1 * b_1;
12603 	CC[2+bs*1] += a_2 * b_1;
12604 	CC[3+bs*1] += a_3 * b_1;
12605 
12606 	A += bs;
12607 	B += ldb;
12608 	k += 1;
12609 
12610 	if(k>=kmax)
12611 		goto store;
12612 
12613 	// k = 2
12614 
12615 	a_0 = A[0];
12616 	a_1 = A[1];
12617 	a_2 = A[2];
12618 	a_3 = A[3];
12619 
12620 	b_0 = B[0+0*ldb];
12621 	CC[0+bs*0] += a_0 * b_0;
12622 	CC[1+bs*0] += a_1 * b_0;
12623 	CC[2+bs*0] += a_2 * b_0;
12624 	CC[3+bs*0] += a_3 * b_0;
12625 
12626 	b_1 = B[1+0*ldb];
12627 	CC[0+bs*1] += a_0 * b_1;
12628 	CC[1+bs*1] += a_1 * b_1;
12629 	CC[2+bs*1] += a_2 * b_1;
12630 	CC[3+bs*1] += a_3 * b_1;
12631 
12632 	b_2 = B[2+0*ldb];
12633 	CC[0+bs*2] += a_0 * b_2;
12634 	CC[1+bs*2] += a_1 * b_2;
12635 	CC[2+bs*2] += a_2 * b_2;
12636 	CC[3+bs*2] += a_3 * b_2;
12637 
12638 	A += bs;
12639 	B += ldb;
12640 	k += 1;
12641 
12642 	if(k>=kmax)
12643 		goto store;
12644 
12645 	// k = 3
12646 
12647 	a_0 = A[0];
12648 	a_1 = A[1];
12649 	a_2 = A[2];
12650 	a_3 = A[3];
12651 
12652 	b_0 = B[0+0*ldb];
12653 	CC[0+bs*0] += a_0 * b_0;
12654 	CC[1+bs*0] += a_1 * b_0;
12655 	CC[2+bs*0] += a_2 * b_0;
12656 	CC[3+bs*0] += a_3 * b_0;
12657 
12658 	b_1 = B[1+0*ldb];
12659 	CC[0+bs*1] += a_0 * b_1;
12660 	CC[1+bs*1] += a_1 * b_1;
12661 	CC[2+bs*1] += a_2 * b_1;
12662 	CC[3+bs*1] += a_3 * b_1;
12663 
12664 	b_2 = B[2+0*ldb];
12665 	CC[0+bs*2] += a_0 * b_2;
12666 	CC[1+bs*2] += a_1 * b_2;
12667 	CC[2+bs*2] += a_2 * b_2;
12668 	CC[3+bs*2] += a_3 * b_2;
12669 
12670 	b_3 = B[3+0*ldb];
12671 	CC[0+bs*3] += a_0 * b_3;
12672 	CC[1+bs*3] += a_1 * b_3;
12673 	CC[2+bs*3] += a_2 * b_3;
12674 	CC[3+bs*3] += a_3 * b_3;
12675 
12676 	A += bs;
12677 	B += ldb;
12678 	k += 1;
12679 
12680 	store:
12681 
12682 	CC[0+bs*0] = alpha[0]*CC[0+bs*0] + beta[0]*C[0+ldc*0];
12683 	CC[1+bs*0] = alpha[0]*CC[1+bs*0] + beta[0]*C[1+ldc*0];
12684 	CC[2+bs*0] = alpha[0]*CC[2+bs*0] + beta[0]*C[2+ldc*0];
12685 	CC[3+bs*0] = alpha[0]*CC[3+bs*0] + beta[0]*C[3+ldc*0];
12686 
12687 	CC[0+bs*1] = alpha[0]*CC[0+bs*1] + beta[0]*C[0+ldc*1];
12688 	CC[1+bs*1] = alpha[0]*CC[1+bs*1] + beta[0]*C[1+ldc*1];
12689 	CC[2+bs*1] = alpha[0]*CC[2+bs*1] + beta[0]*C[2+ldc*1];
12690 	CC[3+bs*1] = alpha[0]*CC[3+bs*1] + beta[0]*C[3+ldc*1];
12691 
12692 	CC[0+bs*2] = alpha[0]*CC[0+bs*2] + beta[0]*C[0+ldc*2];
12693 	CC[1+bs*2] = alpha[0]*CC[1+bs*2] + beta[0]*C[1+ldc*2];
12694 	CC[2+bs*2] = alpha[0]*CC[2+bs*2] + beta[0]*C[2+ldc*2];
12695 	CC[3+bs*2] = alpha[0]*CC[3+bs*2] + beta[0]*C[3+ldc*2];
12696 
12697 	CC[0+bs*3] = alpha[0]*CC[0+bs*3] + beta[0]*C[0+ldc*3];
12698 	CC[1+bs*3] = alpha[0]*CC[1+bs*3] + beta[0]*C[1+ldc*3];
12699 	CC[2+bs*3] = alpha[0]*CC[2+bs*3] + beta[0]*C[2+ldc*3];
12700 	CC[3+bs*3] = alpha[0]*CC[3+bs*3] + beta[0]*C[3+ldc*3];
12701 
12702 	double beta1 = 1.0;
12703 
12704 	kernel_dgemm_nt_4x4_lib4ccc(kmax-k, alpha, A, B, ldb, &beta1, CC, bs, CC, bs);
12705 
12706 	if(m1>=4)
12707 		{
12708 		D[0+ldd*0] = CC[0+bs*0];
12709 		D[1+ldd*0] = CC[1+bs*0];
12710 		D[2+ldd*0] = CC[2+bs*0];
12711 		D[3+ldd*0] = CC[3+bs*0];
12712 
12713 		if(n1==1)
12714 			return;
12715 
12716 		D[0+ldd*1] = CC[0+bs*1];
12717 		D[1+ldd*1] = CC[1+bs*1];
12718 		D[2+ldd*1] = CC[2+bs*1];
12719 		D[3+ldd*1] = CC[3+bs*1];
12720 
12721 		if(n1==2)
12722 			return;
12723 
12724 		D[0+ldd*2] = CC[0+bs*2];
12725 		D[1+ldd*2] = CC[1+bs*2];
12726 		D[2+ldd*2] = CC[2+bs*2];
12727 		D[3+ldd*2] = CC[3+bs*2];
12728 
12729 		if(n1==3)
12730 			return;
12731 
12732 		D[0+ldd*3] = CC[0+bs*3];
12733 		D[1+ldd*3] = CC[1+bs*3];
12734 		D[2+ldd*3] = CC[2+bs*3];
12735 		D[3+ldd*3] = CC[3+bs*3];
12736 		}
12737 	else if(m1>=3)
12738 		{
12739 		D[0+ldd*0] = CC[0+bs*0];
12740 		D[1+ldd*0] = CC[1+bs*0];
12741 		D[2+ldd*0] = CC[2+bs*0];
12742 
12743 		if(n1==1)
12744 			return;
12745 
12746 		D[0+ldd*1] = CC[0+bs*1];
12747 		D[1+ldd*1] = CC[1+bs*1];
12748 		D[2+ldd*1] = CC[2+bs*1];
12749 
12750 		if(n1==2)
12751 			return;
12752 
12753 		D[0+ldd*2] = CC[0+bs*2];
12754 		D[1+ldd*2] = CC[1+bs*2];
12755 		D[2+ldd*2] = CC[2+bs*2];
12756 
12757 		if(n1==3)
12758 			return;
12759 
12760 		D[0+ldd*3] = CC[0+bs*3];
12761 		D[1+ldd*3] = CC[1+bs*3];
12762 		D[2+ldd*3] = CC[2+bs*3];
12763 		}
12764 	else if(m1>=2)
12765 		{
12766 		D[0+ldd*0] = CC[0+bs*0];
12767 		D[1+ldd*0] = CC[1+bs*0];
12768 
12769 		if(n1==1)
12770 			return;
12771 
12772 		D[0+ldd*1] = CC[0+bs*1];
12773 		D[1+ldd*1] = CC[1+bs*1];
12774 
12775 		if(n1==2)
12776 			return;
12777 
12778 		D[0+ldd*2] = CC[0+bs*2];
12779 		D[1+ldd*2] = CC[1+bs*2];
12780 
12781 		if(n1==3)
12782 			return;
12783 
12784 		D[0+ldd*3] = CC[0+bs*3];
12785 		D[1+ldd*3] = CC[1+bs*3];
12786 		}
12787 	else //if(m1>=1)
12788 		{
12789 		D[0+ldd*0] = CC[0+bs*0];
12790 
12791 		if(n1==1)
12792 			return;
12793 
12794 		D[0+ldd*1] = CC[0+bs*1];
12795 
12796 		if(n1==2)
12797 			return;
12798 
12799 		D[0+ldd*2] = CC[0+bs*2];
12800 
12801 		if(n1==3)
12802 			return;
12803 
12804 		D[0+ldd*3] = CC[0+bs*3];
12805 		}
12806 
12807 	return;
12808 
12809 	}
12810 #endif
12811 
12812 
12813 
12814 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
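// Transposed-store variant with column-major B (lib4c4c layout): B is read with leading dimension
// ldb, C is panel-major, and the scaled 4x4 result is stored transposed into column-major D.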
12815 void kernel_dtrmm_nt_ru_4x4_tran_lib4c4c(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd)
12816 	{
12817 
12818 	const int bs = 4;
12819 
12820 	double
12821 		a_0, a_1, a_2, a_3,
12822 		b_0, b_1, b_2, b_3;
12823 
12824 #if defined(TARGET_GENERIC)
12825 	double CC[16] = {0};
12826 #else
12827 	ALIGNED( double CC[16], 64 ) = {0};
12828 #endif
12829 
12830 	int k;
12831 
12832 	k = 0;
12833 
12834 	// k = 0
12835 
12836 	a_0 = A[0];
12837 	a_1 = A[1];
12838 	a_2 = A[2];
12839 	a_3 = A[3];
12840 
12841 	b_0 = B[0+0*ldb];
12842 	CC[0+bs*0] += a_0 * b_0;
12843 	CC[1+bs*0] += a_1 * b_0;
12844 	CC[2+bs*0] += a_2 * b_0;
12845 	CC[3+bs*0] += a_3 * b_0;
12846 
12847 	A += bs;
12848 	B += ldb;
12849 	k += 1;
12850 
12851 	if(k>=kmax)
12852 		goto store;
12853 
12854 	// k = 1
12855 
12856 	a_0 = A[0];
12857 	a_1 = A[1];
12858 	a_2 = A[2];
12859 	a_3 = A[3];
12860 
12861 	b_0 = B[0+0*ldb];
12862 	CC[0+bs*0] += a_0 * b_0;
12863 	CC[1+bs*0] += a_1 * b_0;
12864 	CC[2+bs*0] += a_2 * b_0;
12865 	CC[3+bs*0] += a_3 * b_0;
12866 
12867 	b_1 = B[1+0*ldb];
12868 	CC[0+bs*1] += a_0 * b_1;
12869 	CC[1+bs*1] += a_1 * b_1;
12870 	CC[2+bs*1] += a_2 * b_1;
12871 	CC[3+bs*1] += a_3 * b_1;
12872 
12873 	A += bs;
12874 	B += ldb;
12875 	k += 1;
12876 
12877 	if(k>=kmax)
12878 		goto store;
12879 
12880 	// k = 2
12881 
12882 	a_0 = A[0];
12883 	a_1 = A[1];
12884 	a_2 = A[2];
12885 	a_3 = A[3];
12886 
12887 	b_0 = B[0+0*ldb];
12888 	CC[0+bs*0] += a_0 * b_0;
12889 	CC[1+bs*0] += a_1 * b_0;
12890 	CC[2+bs*0] += a_2 * b_0;
12891 	CC[3+bs*0] += a_3 * b_0;
12892 
12893 	b_1 = B[1+0*ldb];
12894 	CC[0+bs*1] += a_0 * b_1;
12895 	CC[1+bs*1] += a_1 * b_1;
12896 	CC[2+bs*1] += a_2 * b_1;
12897 	CC[3+bs*1] += a_3 * b_1;
12898 
12899 	b_2 = B[2+0*ldb];
12900 	CC[0+bs*2] += a_0 * b_2;
12901 	CC[1+bs*2] += a_1 * b_2;
12902 	CC[2+bs*2] += a_2 * b_2;
12903 	CC[3+bs*2] += a_3 * b_2;
12904 
12905 	A += bs;
12906 	B += ldb;
12907 	k += 1;
12908 
12909 	if(k>=kmax)
12910 		goto store;
12911 
12912 	// k = 3
12913 
12914 	a_0 = A[0];
12915 	a_1 = A[1];
12916 	a_2 = A[2];
12917 	a_3 = A[3];
12918 
12919 	b_0 = B[0+0*ldb];
12920 	CC[0+bs*0] += a_0 * b_0;
12921 	CC[1+bs*0] += a_1 * b_0;
12922 	CC[2+bs*0] += a_2 * b_0;
12923 	CC[3+bs*0] += a_3 * b_0;
12924 
12925 	b_1 = B[1+0*ldb];
12926 	CC[0+bs*1] += a_0 * b_1;
12927 	CC[1+bs*1] += a_1 * b_1;
12928 	CC[2+bs*1] += a_2 * b_1;
12929 	CC[3+bs*1] += a_3 * b_1;
12930 
12931 	b_2 = B[2+0*ldb];
12932 	CC[0+bs*2] += a_0 * b_2;
12933 	CC[1+bs*2] += a_1 * b_2;
12934 	CC[2+bs*2] += a_2 * b_2;
12935 	CC[3+bs*2] += a_3 * b_2;
12936 
12937 	b_3 = B[3+0*ldb];
12938 	CC[0+bs*3] += a_0 * b_3;
12939 	CC[1+bs*3] += a_1 * b_3;
12940 	CC[2+bs*3] += a_2 * b_3;
12941 	CC[3+bs*3] += a_3 * b_3;
12942 
12943 	A += bs;
12944 	B += ldb;
12945 	k += 1;
12946 
12947 	double alpha1 = 1.0;
12948 	double beta1 = 1.0;
12949 
12950 	kernel_dgemm_nt_4x4_lib4ccc(kmax-k, &alpha1, A, B, ldb, &beta1, CC, bs, CC, bs);
12951 
12952 	store:
12953 
12954 	// scale & transpose & store
12955 	D[0+ldd*0] = alpha[0]*CC[0+bs*0] + beta[0]*C[0+bs*0];
12956 	D[0+ldd*1] = alpha[0]*CC[1+bs*0] + beta[0]*C[1+bs*0];
12957 	D[0+ldd*2] = alpha[0]*CC[2+bs*0] + beta[0]*C[2+bs*0];
12958 	D[0+ldd*3] = alpha[0]*CC[3+bs*0] + beta[0]*C[3+bs*0];
12959 
12960 	D[1+ldd*0] = alpha[0]*CC[0+bs*1] + beta[0]*C[0+bs*1];
12961 	D[1+ldd*1] = alpha[0]*CC[1+bs*1] + beta[0]*C[1+bs*1];
12962 	D[1+ldd*2] = alpha[0]*CC[2+bs*1] + beta[0]*C[2+bs*1];
12963 	D[1+ldd*3] = alpha[0]*CC[3+bs*1] + beta[0]*C[3+bs*1];
12964 
12965 	D[2+ldd*0] = alpha[0]*CC[0+bs*2] + beta[0]*C[0+bs*2];
12966 	D[2+ldd*1] = alpha[0]*CC[1+bs*2] + beta[0]*C[1+bs*2];
12967 	D[2+ldd*2] = alpha[0]*CC[2+bs*2] + beta[0]*C[2+bs*2];
12968 	D[2+ldd*3] = alpha[0]*CC[3+bs*2] + beta[0]*C[3+bs*2];
12969 
12970 	D[3+ldd*0] = alpha[0]*CC[0+bs*3] + beta[0]*C[0+bs*3];
12971 	D[3+ldd*1] = alpha[0]*CC[1+bs*3] + beta[0]*C[1+bs*3];
12972 	D[3+ldd*2] = alpha[0]*CC[2+bs*3] + beta[0]*C[2+bs*3];
12973 	D[3+ldd*3] = alpha[0]*CC[3+bs*3] + beta[0]*C[3+bs*3];
12974 
12975 	return;
12976 
12977 	}
12978 #endif
12979 
12980 
12981 
12982 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
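// Transposed-store, variable-size variant of the lib4c4c kernel above: the accumulator is
// transposed in place after scaling and only the top-left m1 x n1 part is stored into D.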
12983 void kernel_dtrmm_nt_ru_4x4_tran_vs_lib4c4c(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd, int m1, int n1)
12984 	{
12985 
12986 	const int bs = 4;
12987 
12988 	double
12989 		tmp,
12990 		a_0, a_1, a_2, a_3,
12991 		b_0, b_1, b_2, b_3;
12992 
12993 #if defined(TARGET_GENERIC)
12994 	double CC[16] = {0};
12995 #else
12996 	ALIGNED( double CC[16], 64 ) = {0};
12997 #endif
12998 
12999 	int k;
13000 
13001 	k = 0;
13002 
13003 	// k = 0
13004 
13005 	a_0 = A[0];
13006 	a_1 = A[1];
13007 	a_2 = A[2];
13008 	a_3 = A[3];
13009 
13010 	b_0 = B[0+0*ldb];
13011 	CC[0+bs*0] += a_0 * b_0;
13012 	CC[1+bs*0] += a_1 * b_0;
13013 	CC[2+bs*0] += a_2 * b_0;
13014 	CC[3+bs*0] += a_3 * b_0;
13015 
13016 	A += bs;
13017 	B += ldb;
13018 	k += 1;
13019 
13020 	if(k>=kmax)
13021 		goto store;
13022 
13023 	// k = 1
13024 
13025 	a_0 = A[0];
13026 	a_1 = A[1];
13027 	a_2 = A[2];
13028 	a_3 = A[3];
13029 
13030 	b_0 = B[0+0*ldb];
13031 	CC[0+bs*0] += a_0 * b_0;
13032 	CC[1+bs*0] += a_1 * b_0;
13033 	CC[2+bs*0] += a_2 * b_0;
13034 	CC[3+bs*0] += a_3 * b_0;
13035 
13036 	b_1 = B[1+0*ldb];
13037 	CC[0+bs*1] += a_0 * b_1;
13038 	CC[1+bs*1] += a_1 * b_1;
13039 	CC[2+bs*1] += a_2 * b_1;
13040 	CC[3+bs*1] += a_3 * b_1;
13041 
13042 	A += bs;
13043 	B += ldb;
13044 	k += 1;
13045 
13046 	if(k>=kmax)
13047 		goto store;
13048 
13049 	// k = 2
13050 
13051 	a_0 = A[0];
13052 	a_1 = A[1];
13053 	a_2 = A[2];
13054 	a_3 = A[3];
13055 
13056 	b_0 = B[0+0*ldb];
13057 	CC[0+bs*0] += a_0 * b_0;
13058 	CC[1+bs*0] += a_1 * b_0;
13059 	CC[2+bs*0] += a_2 * b_0;
13060 	CC[3+bs*0] += a_3 * b_0;
13061 
13062 	b_1 = B[1+0*ldb];
13063 	CC[0+bs*1] += a_0 * b_1;
13064 	CC[1+bs*1] += a_1 * b_1;
13065 	CC[2+bs*1] += a_2 * b_1;
13066 	CC[3+bs*1] += a_3 * b_1;
13067 
13068 	b_2 = B[2+0*ldb];
13069 	CC[0+bs*2] += a_0 * b_2;
13070 	CC[1+bs*2] += a_1 * b_2;
13071 	CC[2+bs*2] += a_2 * b_2;
13072 	CC[3+bs*2] += a_3 * b_2;
13073 
13074 	A += bs;
13075 	B += ldb;
13076 	k += 1;
13077 
13078 	if(k>=kmax)
13079 		goto store;
13080 
13081 	// k = 3
13082 
13083 	a_0 = A[0];
13084 	a_1 = A[1];
13085 	a_2 = A[2];
13086 	a_3 = A[3];
13087 
13088 	b_0 = B[0+0*ldb];
13089 	CC[0+bs*0] += a_0 * b_0;
13090 	CC[1+bs*0] += a_1 * b_0;
13091 	CC[2+bs*0] += a_2 * b_0;
13092 	CC[3+bs*0] += a_3 * b_0;
13093 
13094 	b_1 = B[1+0*ldb];
13095 	CC[0+bs*1] += a_0 * b_1;
13096 	CC[1+bs*1] += a_1 * b_1;
13097 	CC[2+bs*1] += a_2 * b_1;
13098 	CC[3+bs*1] += a_3 * b_1;
13099 
13100 	b_2 = B[2+0*ldb];
13101 	CC[0+bs*2] += a_0 * b_2;
13102 	CC[1+bs*2] += a_1 * b_2;
13103 	CC[2+bs*2] += a_2 * b_2;
13104 	CC[3+bs*2] += a_3 * b_2;
13105 
13106 	b_3 = B[3+0*ldb];
13107 	CC[0+bs*3] += a_0 * b_3;
13108 	CC[1+bs*3] += a_1 * b_3;
13109 	CC[2+bs*3] += a_2 * b_3;
13110 	CC[3+bs*3] += a_3 * b_3;
13111 
13112 	A += bs;
13113 	B += ldb;
13114 	k += 1;
13115 
13116 	double alpha1 = 1.0;
13117 	double beta1 = 1.0;
13118 
13119 	kernel_dgemm_nt_4x4_lib4ccc(kmax-k, &alpha1, A, B, ldb, &beta1, CC, bs, CC, bs);
13120 
13121 	store:
13122 
13123 	// scale
13124 	CC[0+bs*0] = alpha[0]*CC[0+bs*0] + beta[0]*C[0+bs*0];
13125 	CC[1+bs*0] = alpha[0]*CC[1+bs*0] + beta[0]*C[1+bs*0];
13126 	CC[2+bs*0] = alpha[0]*CC[2+bs*0] + beta[0]*C[2+bs*0];
13127 	CC[3+bs*0] = alpha[0]*CC[3+bs*0] + beta[0]*C[3+bs*0];
13128 
13129 	CC[0+bs*1] = alpha[0]*CC[0+bs*1] + beta[0]*C[0+bs*1];
13130 	CC[1+bs*1] = alpha[0]*CC[1+bs*1] + beta[0]*C[1+bs*1];
13131 	CC[2+bs*1] = alpha[0]*CC[2+bs*1] + beta[0]*C[2+bs*1];
13132 	CC[3+bs*1] = alpha[0]*CC[3+bs*1] + beta[0]*C[3+bs*1];
13133 
13134 	CC[0+bs*2] = alpha[0]*CC[0+bs*2] + beta[0]*C[0+bs*2];
13135 	CC[1+bs*2] = alpha[0]*CC[1+bs*2] + beta[0]*C[1+bs*2];
13136 	CC[2+bs*2] = alpha[0]*CC[2+bs*2] + beta[0]*C[2+bs*2];
13137 	CC[3+bs*2] = alpha[0]*CC[3+bs*2] + beta[0]*C[3+bs*2];
13138 
13139 	CC[0+bs*3] = alpha[0]*CC[0+bs*3] + beta[0]*C[0+bs*3];
13140 	CC[1+bs*3] = alpha[0]*CC[1+bs*3] + beta[0]*C[1+bs*3];
13141 	CC[2+bs*3] = alpha[0]*CC[2+bs*3] + beta[0]*C[2+bs*3];
13142 	CC[3+bs*3] = alpha[0]*CC[3+bs*3] + beta[0]*C[3+bs*3];
13143 
13144 	// transpose
13145 	tmp = CC[1+bs*0]; CC[1+bs*0] = CC[0+bs*1]; CC[0+bs*1] = tmp;
13146 	tmp = CC[2+bs*0]; CC[2+bs*0] = CC[0+bs*2]; CC[0+bs*2] = tmp;
13147 	tmp = CC[3+bs*0]; CC[3+bs*0] = CC[0+bs*3]; CC[0+bs*3] = tmp;
13148 
13149 	tmp = CC[2+bs*1]; CC[2+bs*1] = CC[1+bs*2]; CC[1+bs*2] = tmp;
13150 	tmp = CC[3+bs*1]; CC[3+bs*1] = CC[1+bs*3]; CC[1+bs*3] = tmp;
13151 
13152 	tmp = CC[3+bs*2]; CC[3+bs*2] = CC[2+bs*3]; CC[2+bs*3] = tmp;
13153 
13154 	// store
13155 	if(m1>=4)
13156 		{
13157 		D[0+ldd*0] = CC[0+bs*0];
13158 		D[1+ldd*0] = CC[1+bs*0];
13159 		D[2+ldd*0] = CC[2+bs*0];
13160 		D[3+ldd*0] = CC[3+bs*0];
13161 
13162 		if(n1==1)
13163 			return;
13164 
13165 		D[0+ldd*1] = CC[0+bs*1];
13166 		D[1+ldd*1] = CC[1+bs*1];
13167 		D[2+ldd*1] = CC[2+bs*1];
13168 		D[3+ldd*1] = CC[3+bs*1];
13169 
13170 		if(n1==2)
13171 			return;
13172 
13173 		D[0+ldd*2] = CC[0+bs*2];
13174 		D[1+ldd*2] = CC[1+bs*2];
13175 		D[2+ldd*2] = CC[2+bs*2];
13176 		D[3+ldd*2] = CC[3+bs*2];
13177 
13178 		if(n1==3)
13179 			return;
13180 
13181 		D[0+ldd*3] = CC[0+bs*3];
13182 		D[1+ldd*3] = CC[1+bs*3];
13183 		D[2+ldd*3] = CC[2+bs*3];
13184 		D[3+ldd*3] = CC[3+bs*3];
13185 		}
13186 	else if(m1>=3)
13187 		{
13188 		D[0+ldd*0] = CC[0+bs*0];
13189 		D[1+ldd*0] = CC[1+bs*0];
13190 		D[2+ldd*0] = CC[2+bs*0];
13191 
13192 		if(n1==1)
13193 			return;
13194 
13195 		D[0+ldd*1] = CC[0+bs*1];
13196 		D[1+ldd*1] = CC[1+bs*1];
13197 		D[2+ldd*1] = CC[2+bs*1];
13198 
13199 		if(n1==2)
13200 			return;
13201 
13202 		D[0+ldd*2] = CC[0+bs*2];
13203 		D[1+ldd*2] = CC[1+bs*2];
13204 		D[2+ldd*2] = CC[2+bs*2];
13205 
13206 		if(n1==3)
13207 			return;
13208 
13209 		D[0+ldd*3] = CC[0+bs*3];
13210 		D[1+ldd*3] = CC[1+bs*3];
13211 		D[2+ldd*3] = CC[2+bs*3];
13212 		}
13213 	else if(m1>=2)
13214 		{
13215 		D[0+ldd*0] = CC[0+bs*0];
13216 		D[1+ldd*0] = CC[1+bs*0];
13217 
13218 		if(n1==1)
13219 			return;
13220 
13221 		D[0+ldd*1] = CC[0+bs*1];
13222 		D[1+ldd*1] = CC[1+bs*1];
13223 
13224 		if(n1==2)
13225 			return;
13226 
13227 		D[0+ldd*2] = CC[0+bs*2];
13228 		D[1+ldd*2] = CC[1+bs*2];
13229 
13230 		if(n1==3)
13231 			return;
13232 
13233 		D[0+ldd*3] = CC[0+bs*3];
13234 		D[1+ldd*3] = CC[1+bs*3];
13235 		}
13236 	else //if(m1>=1)
13237 		{
13238 		D[0+ldd*0] = CC[0+bs*0];
13239 
13240 		if(n1==1)
13241 			return;
13242 
13243 		D[0+ldd*1] = CC[0+bs*1];
13244 
13245 		if(n1==2)
13246 			return;
13247 
13248 		D[0+ldd*2] = CC[0+bs*2];
13249 
13250 		if(n1==3)
13251 			return;
13252 
13253 		D[0+ldd*3] = CC[0+bs*3];
13254 		}
13255 
13256 	return;
13257 
13258 	}
13259 #endif
13260 
13261 
13262 
13263 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
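// Unit-diagonal ("one") variant of kernel_dtrmm_nt_ru_4x4_lib44cc, as the accumulation below shows:
// at iteration k the diagonal entry of B is never read and is taken as 1.0, so the k-th column of
// the A panel is added to accumulator column k directly; everything else matches the non-"one" kernel.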
13264 void kernel_dtrmm_nt_ru_one_4x4_lib44cc(int kmax, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd)
13265 	{
13266 
13267 	const int bs = 4;
13268 
13269 	double
13270 		a_0, a_1, a_2, a_3,
13271 		b_0, b_1, b_2, b_3;
13272 
13273 #if defined(TARGET_GENERIC)
13274 	double CC[16] = {0};
13275 #else
13276 	ALIGNED( double CC[16], 64 ) = {0};
13277 #endif
13278 
13279 	int k;
13280 
13281 	k = 0;
13282 
13283 	// k = 0
13284 
13285 	a_0 = A[0];
13286 	a_1 = A[1];
13287 	a_2 = A[2];
13288 	a_3 = A[3];
13289 
13290 	CC[0+bs*0] += a_0;
13291 	CC[1+bs*0] += a_1;
13292 	CC[2+bs*0] += a_2;
13293 	CC[3+bs*0] += a_3;
13294 
13295 	A += bs;
13296 	B += bs;
13297 	k += 1;
13298 
13299 	if(k>=kmax)
13300 		goto store;
13301 
13302 	// k = 1
13303 
13304 	a_0 = A[0];
13305 	a_1 = A[1];
13306 	a_2 = A[2];
13307 	a_3 = A[3];
13308 
13309 	b_0 = B[0+0*bs];
13310 	CC[0+bs*0] += a_0 * b_0;
13311 	CC[1+bs*0] += a_1 * b_0;
13312 	CC[2+bs*0] += a_2 * b_0;
13313 	CC[3+bs*0] += a_3 * b_0;
13314 
13315 	CC[0+bs*1] += a_0;
13316 	CC[1+bs*1] += a_1;
13317 	CC[2+bs*1] += a_2;
13318 	CC[3+bs*1] += a_3;
13319 
13320 	A += bs;
13321 	B += bs;
13322 	k += 1;
13323 
13324 	if(k>=kmax)
13325 		goto store;
13326 
13327 	// k = 2
13328 
13329 	a_0 = A[0];
13330 	a_1 = A[1];
13331 	a_2 = A[2];
13332 	a_3 = A[3];
13333 
13334 	b_0 = B[0+0*bs];
13335 	CC[0+bs*0] += a_0 * b_0;
13336 	CC[1+bs*0] += a_1 * b_0;
13337 	CC[2+bs*0] += a_2 * b_0;
13338 	CC[3+bs*0] += a_3 * b_0;
13339 
13340 	b_1 = B[1+0*bs];
13341 	CC[0+bs*1] += a_0 * b_1;
13342 	CC[1+bs*1] += a_1 * b_1;
13343 	CC[2+bs*1] += a_2 * b_1;
13344 	CC[3+bs*1] += a_3 * b_1;
13345 
13346 	CC[0+bs*2] += a_0;
13347 	CC[1+bs*2] += a_1;
13348 	CC[2+bs*2] += a_2;
13349 	CC[3+bs*2] += a_3;
13350 
13351 	A += bs;
13352 	B += bs;
13353 	k += 1;
13354 
13355 	if(k>=kmax)
13356 		goto store;
13357 
13358 	// k = 3
13359 
13360 	a_0 = A[0];
13361 	a_1 = A[1];
13362 	a_2 = A[2];
13363 	a_3 = A[3];
13364 
13365 	b_0 = B[0+0*bs];
13366 	CC[0+bs*0] += a_0 * b_0;
13367 	CC[1+bs*0] += a_1 * b_0;
13368 	CC[2+bs*0] += a_2 * b_0;
13369 	CC[3+bs*0] += a_3 * b_0;
13370 
13371 	b_1 = B[1+0*bs];
13372 	CC[0+bs*1] += a_0 * b_1;
13373 	CC[1+bs*1] += a_1 * b_1;
13374 	CC[2+bs*1] += a_2 * b_1;
13375 	CC[3+bs*1] += a_3 * b_1;
13376 
13377 	b_2 = B[2+0*bs];
13378 	CC[0+bs*2] += a_0 * b_2;
13379 	CC[1+bs*2] += a_1 * b_2;
13380 	CC[2+bs*2] += a_2 * b_2;
13381 	CC[3+bs*2] += a_3 * b_2;
13382 
13383 	CC[0+bs*3] += a_0;
13384 	CC[1+bs*3] += a_1;
13385 	CC[2+bs*3] += a_2;
13386 	CC[3+bs*3] += a_3;
13387 
13388 	A += bs;
13389 	B += bs;
13390 	k += 1;
13391 
13392 	store:
13393 
13394 	CC[0+bs*0] = alpha[0]*CC[0+bs*0] + beta[0]*C[0+ldc*0];
13395 	CC[1+bs*0] = alpha[0]*CC[1+bs*0] + beta[0]*C[1+ldc*0];
13396 	CC[2+bs*0] = alpha[0]*CC[2+bs*0] + beta[0]*C[2+ldc*0];
13397 	CC[3+bs*0] = alpha[0]*CC[3+bs*0] + beta[0]*C[3+ldc*0];
13398 
13399 	CC[0+bs*1] = alpha[0]*CC[0+bs*1] + beta[0]*C[0+ldc*1];
13400 	CC[1+bs*1] = alpha[0]*CC[1+bs*1] + beta[0]*C[1+ldc*1];
13401 	CC[2+bs*1] = alpha[0]*CC[2+bs*1] + beta[0]*C[2+ldc*1];
13402 	CC[3+bs*1] = alpha[0]*CC[3+bs*1] + beta[0]*C[3+ldc*1];
13403 
13404 	CC[0+bs*2] = alpha[0]*CC[0+bs*2] + beta[0]*C[0+ldc*2];
13405 	CC[1+bs*2] = alpha[0]*CC[1+bs*2] + beta[0]*C[1+ldc*2];
13406 	CC[2+bs*2] = alpha[0]*CC[2+bs*2] + beta[0]*C[2+ldc*2];
13407 	CC[3+bs*2] = alpha[0]*CC[3+bs*2] + beta[0]*C[3+ldc*2];
13408 
13409 	CC[0+bs*3] = alpha[0]*CC[0+bs*3] + beta[0]*C[0+ldc*3];
13410 	CC[1+bs*3] = alpha[0]*CC[1+bs*3] + beta[0]*C[1+ldc*3];
13411 	CC[2+bs*3] = alpha[0]*CC[2+bs*3] + beta[0]*C[2+ldc*3];
13412 	CC[3+bs*3] = alpha[0]*CC[3+bs*3] + beta[0]*C[3+ldc*3];
13413 
13414 	double beta1 = 1.0;
13415 
13416 	kernel_dgemm_nt_4x4_lib4(kmax-k, alpha, A, B, &beta1, CC, CC);
13417 
13418 	D[0+ldd*0] = CC[0+bs*0];
13419 	D[1+ldd*0] = CC[1+bs*0];
13420 	D[2+ldd*0] = CC[2+bs*0];
13421 	D[3+ldd*0] = CC[3+bs*0];
13422 
13423 	D[0+ldd*1] = CC[0+bs*1];
13424 	D[1+ldd*1] = CC[1+bs*1];
13425 	D[2+ldd*1] = CC[2+bs*1];
13426 	D[3+ldd*1] = CC[3+bs*1];
13427 
13428 	D[0+ldd*2] = CC[0+bs*2];
13429 	D[1+ldd*2] = CC[1+bs*2];
13430 	D[2+ldd*2] = CC[2+bs*2];
13431 	D[3+ldd*2] = CC[3+bs*2];
13432 
13433 	D[0+ldd*3] = CC[0+bs*3];
13434 	D[1+ldd*3] = CC[1+bs*3];
13435 	D[2+ldd*3] = CC[2+bs*3];
13436 	D[3+ldd*3] = CC[3+bs*3];
13437 
13438 	return;
13439 
13440 	}
13441 #endif
13442 
13443 
13444 
13445 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
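// Unit-diagonal, variable-size variant: same accumulation as the kernel above, with only the
// top-left m1 x n1 part of the result stored into D.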
13446 void kernel_dtrmm_nt_ru_one_4x4_vs_lib44cc(int kmax, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1)
13447 	{
13448 
13449 	const int bs = 4;
13450 
13451 	double
13452 		a_0, a_1, a_2, a_3,
13453 		b_0, b_1, b_2, b_3;
13454 
13455 #if defined(TARGET_GENERIC)
13456 	double CC[16] = {0};
13457 #else
13458 	ALIGNED( double CC[16], 64 ) = {0};
13459 #endif
13460 
13461 	int k;
13462 
13463 	k = 0;
13464 
13465 	// k = 0
13466 
13467 	a_0 = A[0];
13468 	a_1 = A[1];
13469 	a_2 = A[2];
13470 	a_3 = A[3];
13471 
13472 	CC[0+bs*0] += a_0;
13473 	CC[1+bs*0] += a_1;
13474 	CC[2+bs*0] += a_2;
13475 	CC[3+bs*0] += a_3;
13476 
13477 	A += bs;
13478 	B += bs;
13479 	k += 1;
13480 
13481 	if(k>=kmax)
13482 		goto store;
13483 
13484 	// k = 1
13485 
13486 	a_0 = A[0];
13487 	a_1 = A[1];
13488 	a_2 = A[2];
13489 	a_3 = A[3];
13490 
13491 	b_0 = B[0+0*bs];
13492 	CC[0+bs*0] += a_0 * b_0;
13493 	CC[1+bs*0] += a_1 * b_0;
13494 	CC[2+bs*0] += a_2 * b_0;
13495 	CC[3+bs*0] += a_3 * b_0;
13496 
13497 	CC[0+bs*1] += a_0;
13498 	CC[1+bs*1] += a_1;
13499 	CC[2+bs*1] += a_2;
13500 	CC[3+bs*1] += a_3;
13501 
13502 	A += bs;
13503 	B += bs;
13504 	k += 1;
13505 
13506 	if(k>=kmax)
13507 		goto store;
13508 
13509 	// k = 2
13510 
13511 	a_0 = A[0];
13512 	a_1 = A[1];
13513 	a_2 = A[2];
13514 	a_3 = A[3];
13515 
13516 	b_0 = B[0+0*bs];
13517 	CC[0+bs*0] += a_0 * b_0;
13518 	CC[1+bs*0] += a_1 * b_0;
13519 	CC[2+bs*0] += a_2 * b_0;
13520 	CC[3+bs*0] += a_3 * b_0;
13521 
13522 	b_1 = B[1+0*bs];
13523 	CC[0+bs*1] += a_0 * b_1;
13524 	CC[1+bs*1] += a_1 * b_1;
13525 	CC[2+bs*1] += a_2 * b_1;
13526 	CC[3+bs*1] += a_3 * b_1;
13527 
13528 	CC[0+bs*2] += a_0;
13529 	CC[1+bs*2] += a_1;
13530 	CC[2+bs*2] += a_2;
13531 	CC[3+bs*2] += a_3;
13532 
13533 	A += bs;
13534 	B += bs;
13535 	k += 1;
13536 
13537 	if(k>=kmax)
13538 		goto store;
13539 
13540 	// k = 3
13541 
13542 	a_0 = A[0];
13543 	a_1 = A[1];
13544 	a_2 = A[2];
13545 	a_3 = A[3];
13546 
13547 	b_0 = B[0+0*bs];
13548 	CC[0+bs*0] += a_0 * b_0;
13549 	CC[1+bs*0] += a_1 * b_0;
13550 	CC[2+bs*0] += a_2 * b_0;
13551 	CC[3+bs*0] += a_3 * b_0;
13552 
13553 	b_1 = B[1+0*bs];
13554 	CC[0+bs*1] += a_0 * b_1;
13555 	CC[1+bs*1] += a_1 * b_1;
13556 	CC[2+bs*1] += a_2 * b_1;
13557 	CC[3+bs*1] += a_3 * b_1;
13558 
13559 	b_2 = B[2+0*bs];
13560 	CC[0+bs*2] += a_0 * b_2;
13561 	CC[1+bs*2] += a_1 * b_2;
13562 	CC[2+bs*2] += a_2 * b_2;
13563 	CC[3+bs*2] += a_3 * b_2;
13564 
13565 	CC[0+bs*3] += a_0;
13566 	CC[1+bs*3] += a_1;
13567 	CC[2+bs*3] += a_2;
13568 	CC[3+bs*3] += a_3;
13569 
13570 	A += bs;
13571 	B += bs;
13572 	k += 1;
13573 
13574 	store:
13575 
13576 	CC[0+bs*0] = alpha[0]*CC[0+bs*0] + beta[0]*C[0+ldc*0];
13577 	CC[1+bs*0] = alpha[0]*CC[1+bs*0] + beta[0]*C[1+ldc*0];
13578 	CC[2+bs*0] = alpha[0]*CC[2+bs*0] + beta[0]*C[2+ldc*0];
13579 	CC[3+bs*0] = alpha[0]*CC[3+bs*0] + beta[0]*C[3+ldc*0];
13580 
13581 	CC[0+bs*1] = alpha[0]*CC[0+bs*1] + beta[0]*C[0+ldc*1];
13582 	CC[1+bs*1] = alpha[0]*CC[1+bs*1] + beta[0]*C[1+ldc*1];
13583 	CC[2+bs*1] = alpha[0]*CC[2+bs*1] + beta[0]*C[2+ldc*1];
13584 	CC[3+bs*1] = alpha[0]*CC[3+bs*1] + beta[0]*C[3+ldc*1];
13585 
13586 	CC[0+bs*2] = alpha[0]*CC[0+bs*2] + beta[0]*C[0+ldc*2];
13587 	CC[1+bs*2] = alpha[0]*CC[1+bs*2] + beta[0]*C[1+ldc*2];
13588 	CC[2+bs*2] = alpha[0]*CC[2+bs*2] + beta[0]*C[2+ldc*2];
13589 	CC[3+bs*2] = alpha[0]*CC[3+bs*2] + beta[0]*C[3+ldc*2];
13590 
13591 	CC[0+bs*3] = alpha[0]*CC[0+bs*3] + beta[0]*C[0+ldc*3];
13592 	CC[1+bs*3] = alpha[0]*CC[1+bs*3] + beta[0]*C[1+ldc*3];
13593 	CC[2+bs*3] = alpha[0]*CC[2+bs*3] + beta[0]*C[2+ldc*3];
13594 	CC[3+bs*3] = alpha[0]*CC[3+bs*3] + beta[0]*C[3+ldc*3];
13595 
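	// the remaining kmax-k iterations are dense: let the generic gemm kernel
	// accumulate them into the already blended CC (beta = 1.0)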
13596 	double beta1 = 1.0;
13597 
13598 	kernel_dgemm_nt_4x4_lib4(kmax-k, alpha, A, B, &beta1, CC, CC);
13599 
13600 	if(m1>=4)
13601 		{
13602 		D[0+ldd*0] = CC[0+bs*0];
13603 		D[1+ldd*0] = CC[1+bs*0];
13604 		D[2+ldd*0] = CC[2+bs*0];
13605 		D[3+ldd*0] = CC[3+bs*0];
13606 
13607 		if(n1==1)
13608 			return;
13609 
13610 		D[0+ldd*1] = CC[0+bs*1];
13611 		D[1+ldd*1] = CC[1+bs*1];
13612 		D[2+ldd*1] = CC[2+bs*1];
13613 		D[3+ldd*1] = CC[3+bs*1];
13614 
13615 		if(n1==2)
13616 			return;
13617 
13618 		D[0+ldd*2] = CC[0+bs*2];
13619 		D[1+ldd*2] = CC[1+bs*2];
13620 		D[2+ldd*2] = CC[2+bs*2];
13621 		D[3+ldd*2] = CC[3+bs*2];
13622 
13623 		if(n1==3)
13624 			return;
13625 
13626 		D[0+ldd*3] = CC[0+bs*3];
13627 		D[1+ldd*3] = CC[1+bs*3];
13628 		D[2+ldd*3] = CC[2+bs*3];
13629 		D[3+ldd*3] = CC[3+bs*3];
13630 		}
13631 	else if(m1>=3)
13632 		{
13633 		D[0+ldd*0] = CC[0+bs*0];
13634 		D[1+ldd*0] = CC[1+bs*0];
13635 		D[2+ldd*0] = CC[2+bs*0];
13636 
13637 		if(n1==1)
13638 			return;
13639 
13640 		D[0+ldd*1] = CC[0+bs*1];
13641 		D[1+ldd*1] = CC[1+bs*1];
13642 		D[2+ldd*1] = CC[2+bs*1];
13643 
13644 		if(n1==2)
13645 			return;
13646 
13647 		D[0+ldd*2] = CC[0+bs*2];
13648 		D[1+ldd*2] = CC[1+bs*2];
13649 		D[2+ldd*2] = CC[2+bs*2];
13650 
13651 		if(n1==3)
13652 			return;
13653 
13654 		D[0+ldd*3] = CC[0+bs*3];
13655 		D[1+ldd*3] = CC[1+bs*3];
13656 		D[2+ldd*3] = CC[2+bs*3];
13657 		}
13658 	else if(m1>=2)
13659 		{
13660 		D[0+ldd*0] = CC[0+bs*0];
13661 		D[1+ldd*0] = CC[1+bs*0];
13662 
13663 		if(n1==1)
13664 			return;
13665 
13666 		D[0+ldd*1] = CC[0+bs*1];
13667 		D[1+ldd*1] = CC[1+bs*1];
13668 
13669 		if(n1==2)
13670 			return;
13671 
13672 		D[0+ldd*2] = CC[0+bs*2];
13673 		D[1+ldd*2] = CC[1+bs*2];
13674 
13675 		if(n1==3)
13676 			return;
13677 
13678 		D[0+ldd*3] = CC[0+bs*3];
13679 		D[1+ldd*3] = CC[1+bs*3];
13680 		}
13681 	else //if(m1>=1)
13682 		{
13683 		D[0+ldd*0] = CC[0+bs*0];
13684 
13685 		if(n1==1)
13686 			return;
13687 
13688 		D[0+ldd*1] = CC[0+bs*1];
13689 
13690 		if(n1==2)
13691 			return;
13692 
13693 		D[0+ldd*2] = CC[0+bs*2];
13694 
13695 		if(n1==3)
13696 			return;
13697 
13698 		D[0+ldd*3] = CC[0+bs*3];
13699 		}
13700 
13701 	return;
13702 
13703 	}
13704 #endif
13705 
13706 
13707 
13708 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
void kernel_dtrmm_nt_ru_one_4x4_tran_lib444c(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int ldd)
13710 	{
13711 
13712 	const int bs = 4;
13713 
13714 	double
13715 		a_0, a_1, a_2, a_3,
13716 		b_0, b_1, b_2, b_3;
13717 
13718 #if defined(TARGET_GENERIC)
13719 	double CC[16] = {0};
13720 #else
13721 	ALIGNED( double CC[16], 64 ) = {0};
13722 #endif
13723 
13724 	int k;
13725 
13726 	k = 0;
13727 
13728 	// k = 0
13729 
13730 	a_0 = A[0];
13731 	a_1 = A[1];
13732 	a_2 = A[2];
13733 	a_3 = A[3];
13734 
13735 	CC[0+bs*0] += a_0;
13736 	CC[1+bs*0] += a_1;
13737 	CC[2+bs*0] += a_2;
13738 	CC[3+bs*0] += a_3;
13739 
13740 	A += bs;
13741 	B += bs;
13742 	k += 1;
13743 
13744 	if(k>=kmax)
13745 		goto store;
13746 
13747 	// k = 1
13748 
13749 	a_0 = A[0];
13750 	a_1 = A[1];
13751 	a_2 = A[2];
13752 	a_3 = A[3];
13753 
13754 	b_0 = B[0+0*bs];
13755 	CC[0+bs*0] += a_0 * b_0;
13756 	CC[1+bs*0] += a_1 * b_0;
13757 	CC[2+bs*0] += a_2 * b_0;
13758 	CC[3+bs*0] += a_3 * b_0;
13759 
13760 	CC[0+bs*1] += a_0;
13761 	CC[1+bs*1] += a_1;
13762 	CC[2+bs*1] += a_2;
13763 	CC[3+bs*1] += a_3;
13764 
13765 	A += bs;
13766 	B += bs;
13767 	k += 1;
13768 
13769 	if(k>=kmax)
13770 		goto store;
13771 
13772 	// k = 2
13773 
13774 	a_0 = A[0];
13775 	a_1 = A[1];
13776 	a_2 = A[2];
13777 	a_3 = A[3];
13778 
13779 	b_0 = B[0+0*bs];
13780 	CC[0+bs*0] += a_0 * b_0;
13781 	CC[1+bs*0] += a_1 * b_0;
13782 	CC[2+bs*0] += a_2 * b_0;
13783 	CC[3+bs*0] += a_3 * b_0;
13784 
13785 	b_1 = B[1+0*bs];
13786 	CC[0+bs*1] += a_0 * b_1;
13787 	CC[1+bs*1] += a_1 * b_1;
13788 	CC[2+bs*1] += a_2 * b_1;
13789 	CC[3+bs*1] += a_3 * b_1;
13790 
13791 	CC[0+bs*2] += a_0;
13792 	CC[1+bs*2] += a_1;
13793 	CC[2+bs*2] += a_2;
13794 	CC[3+bs*2] += a_3;
13795 
13796 	A += bs;
13797 	B += bs;
13798 	k += 1;
13799 
13800 	if(k>=kmax)
13801 		goto store;
13802 
13803 	// k = 3
13804 
13805 	a_0 = A[0];
13806 	a_1 = A[1];
13807 	a_2 = A[2];
13808 	a_3 = A[3];
13809 
13810 	b_0 = B[0+0*bs];
13811 	CC[0+bs*0] += a_0 * b_0;
13812 	CC[1+bs*0] += a_1 * b_0;
13813 	CC[2+bs*0] += a_2 * b_0;
13814 	CC[3+bs*0] += a_3 * b_0;
13815 
13816 	b_1 = B[1+0*bs];
13817 	CC[0+bs*1] += a_0 * b_1;
13818 	CC[1+bs*1] += a_1 * b_1;
13819 	CC[2+bs*1] += a_2 * b_1;
13820 	CC[3+bs*1] += a_3 * b_1;
13821 
13822 	b_2 = B[2+0*bs];
13823 	CC[0+bs*2] += a_0 * b_2;
13824 	CC[1+bs*2] += a_1 * b_2;
13825 	CC[2+bs*2] += a_2 * b_2;
13826 	CC[3+bs*2] += a_3 * b_2;
13827 
13828 	CC[0+bs*3] += a_0;
13829 	CC[1+bs*3] += a_1;
13830 	CC[2+bs*3] += a_2;
13831 	CC[3+bs*3] += a_3;
13832 
13833 	A += bs;
13834 	B += bs;
13835 	k += 1;
13836 
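	// remaining dense iterations accumulate into CC with unit alpha/beta;
	// the alpha/beta blending with C is applied at the store below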
13837 	double alpha1 = 1.0;
13838 	double beta1 = 1.0;
13839 
13840 	kernel_dgemm_nt_4x4_lib4(kmax-k, &alpha1, A, B, &beta1, CC, CC);
13841 
13842 	store:
13843 
	// scale & transpose & store
13845 	D[0+ldd*0] = alpha[0]*CC[0+bs*0] + beta[0]*C[0+bs*0];
13846 	D[0+ldd*1] = alpha[0]*CC[1+bs*0] + beta[0]*C[1+bs*0];
13847 	D[0+ldd*2] = alpha[0]*CC[2+bs*0] + beta[0]*C[2+bs*0];
13848 	D[0+ldd*3] = alpha[0]*CC[3+bs*0] + beta[0]*C[3+bs*0];
13849 
13850 	D[1+ldd*0] = alpha[0]*CC[0+bs*1] + beta[0]*C[0+bs*1];
13851 	D[1+ldd*1] = alpha[0]*CC[1+bs*1] + beta[0]*C[1+bs*1];
13852 	D[1+ldd*2] = alpha[0]*CC[2+bs*1] + beta[0]*C[2+bs*1];
13853 	D[1+ldd*3] = alpha[0]*CC[3+bs*1] + beta[0]*C[3+bs*1];
13854 
13855 	D[2+ldd*0] = alpha[0]*CC[0+bs*2] + beta[0]*C[0+bs*2];
13856 	D[2+ldd*1] = alpha[0]*CC[1+bs*2] + beta[0]*C[1+bs*2];
13857 	D[2+ldd*2] = alpha[0]*CC[2+bs*2] + beta[0]*C[2+bs*2];
13858 	D[2+ldd*3] = alpha[0]*CC[3+bs*2] + beta[0]*C[3+bs*2];
13859 
13860 	D[3+ldd*0] = alpha[0]*CC[0+bs*3] + beta[0]*C[0+bs*3];
13861 	D[3+ldd*1] = alpha[0]*CC[1+bs*3] + beta[0]*C[1+bs*3];
13862 	D[3+ldd*2] = alpha[0]*CC[2+bs*3] + beta[0]*C[2+bs*3];
13863 	D[3+ldd*3] = alpha[0]*CC[3+bs*3] + beta[0]*C[3+bs*3];
13864 
13865 	return;
13866 
13867 	}
13868 #endif
13869 
13870 
13871 
13872 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
void kernel_dtrmm_nt_ru_one_4x4_tran_vs_lib444c(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int ldd, int m1, int n1)
13874 	{
13875 
13876 	const int bs = 4;
13877 
13878 	double
13879 		tmp,
13880 		a_0, a_1, a_2, a_3,
13881 		b_0, b_1, b_2, b_3;
13882 
13883 #if defined(TARGET_GENERIC)
13884 	double CC[16] = {0};
13885 #else
13886 	ALIGNED( double CC[16], 64 ) = {0};
13887 #endif
13888 
13889 	int k;
13890 
13891 	k = 0;
13892 
13893 	// k = 0
13894 
13895 	a_0 = A[0];
13896 	a_1 = A[1];
13897 	a_2 = A[2];
13898 	a_3 = A[3];
13899 
13900 	CC[0+bs*0] += a_0;
13901 	CC[1+bs*0] += a_1;
13902 	CC[2+bs*0] += a_2;
13903 	CC[3+bs*0] += a_3;
13904 
13905 	A += bs;
13906 	B += bs;
13907 	k += 1;
13908 
13909 	if(k>=kmax)
13910 		goto store;
13911 
13912 	// k = 1
13913 
13914 	a_0 = A[0];
13915 	a_1 = A[1];
13916 	a_2 = A[2];
13917 	a_3 = A[3];
13918 
13919 	b_0 = B[0+0*bs];
13920 	CC[0+bs*0] += a_0 * b_0;
13921 	CC[1+bs*0] += a_1 * b_0;
13922 	CC[2+bs*0] += a_2 * b_0;
13923 	CC[3+bs*0] += a_3 * b_0;
13924 
13925 	CC[0+bs*1] += a_0;
13926 	CC[1+bs*1] += a_1;
13927 	CC[2+bs*1] += a_2;
13928 	CC[3+bs*1] += a_3;
13929 
13930 	A += bs;
13931 	B += bs;
13932 	k += 1;
13933 
13934 	if(k>=kmax)
13935 		goto store;
13936 
13937 	// k = 2
13938 
13939 	a_0 = A[0];
13940 	a_1 = A[1];
13941 	a_2 = A[2];
13942 	a_3 = A[3];
13943 
13944 	b_0 = B[0+0*bs];
13945 	CC[0+bs*0] += a_0 * b_0;
13946 	CC[1+bs*0] += a_1 * b_0;
13947 	CC[2+bs*0] += a_2 * b_0;
13948 	CC[3+bs*0] += a_3 * b_0;
13949 
13950 	b_1 = B[1+0*bs];
13951 	CC[0+bs*1] += a_0 * b_1;
13952 	CC[1+bs*1] += a_1 * b_1;
13953 	CC[2+bs*1] += a_2 * b_1;
13954 	CC[3+bs*1] += a_3 * b_1;
13955 
13956 	CC[0+bs*2] += a_0;
13957 	CC[1+bs*2] += a_1;
13958 	CC[2+bs*2] += a_2;
13959 	CC[3+bs*2] += a_3;
13960 
13961 	A += bs;
13962 	B += bs;
13963 	k += 1;
13964 
13965 	if(k>=kmax)
13966 		goto store;
13967 
13968 	// k = 3
13969 
13970 	a_0 = A[0];
13971 	a_1 = A[1];
13972 	a_2 = A[2];
13973 	a_3 = A[3];
13974 
13975 	b_0 = B[0+0*bs];
13976 	CC[0+bs*0] += a_0 * b_0;
13977 	CC[1+bs*0] += a_1 * b_0;
13978 	CC[2+bs*0] += a_2 * b_0;
13979 	CC[3+bs*0] += a_3 * b_0;
13980 
13981 	b_1 = B[1+0*bs];
13982 	CC[0+bs*1] += a_0 * b_1;
13983 	CC[1+bs*1] += a_1 * b_1;
13984 	CC[2+bs*1] += a_2 * b_1;
13985 	CC[3+bs*1] += a_3 * b_1;
13986 
13987 	b_2 = B[2+0*bs];
13988 	CC[0+bs*2] += a_0 * b_2;
13989 	CC[1+bs*2] += a_1 * b_2;
13990 	CC[2+bs*2] += a_2 * b_2;
13991 	CC[3+bs*2] += a_3 * b_2;
13992 
13993 	CC[0+bs*3] += a_0;
13994 	CC[1+bs*3] += a_1;
13995 	CC[2+bs*3] += a_2;
13996 	CC[3+bs*3] += a_3;
13997 
13998 	A += bs;
13999 	B += bs;
14000 	k += 1;
14001 
14002 	double alpha1 = 1.0;
14003 	double beta1 = 1.0;
14004 
14005 	kernel_dgemm_nt_4x4_lib4(kmax-k, &alpha1, A, B, &beta1, CC, CC);
14006 
14007 	store:
14008 
14009 	// scale
14010 	CC[0+bs*0] = alpha[0]*CC[0+bs*0] + beta[0]*C[0+bs*0];
14011 	CC[1+bs*0] = alpha[0]*CC[1+bs*0] + beta[0]*C[1+bs*0];
14012 	CC[2+bs*0] = alpha[0]*CC[2+bs*0] + beta[0]*C[2+bs*0];
14013 	CC[3+bs*0] = alpha[0]*CC[3+bs*0] + beta[0]*C[3+bs*0];
14014 
14015 	CC[0+bs*1] = alpha[0]*CC[0+bs*1] + beta[0]*C[0+bs*1];
14016 	CC[1+bs*1] = alpha[0]*CC[1+bs*1] + beta[0]*C[1+bs*1];
14017 	CC[2+bs*1] = alpha[0]*CC[2+bs*1] + beta[0]*C[2+bs*1];
14018 	CC[3+bs*1] = alpha[0]*CC[3+bs*1] + beta[0]*C[3+bs*1];
14019 
14020 	CC[0+bs*2] = alpha[0]*CC[0+bs*2] + beta[0]*C[0+bs*2];
14021 	CC[1+bs*2] = alpha[0]*CC[1+bs*2] + beta[0]*C[1+bs*2];
14022 	CC[2+bs*2] = alpha[0]*CC[2+bs*2] + beta[0]*C[2+bs*2];
14023 	CC[3+bs*2] = alpha[0]*CC[3+bs*2] + beta[0]*C[3+bs*2];
14024 
14025 	CC[0+bs*3] = alpha[0]*CC[0+bs*3] + beta[0]*C[0+bs*3];
14026 	CC[1+bs*3] = alpha[0]*CC[1+bs*3] + beta[0]*C[1+bs*3];
14027 	CC[2+bs*3] = alpha[0]*CC[2+bs*3] + beta[0]*C[2+bs*3];
14028 	CC[3+bs*3] = alpha[0]*CC[3+bs*3] + beta[0]*C[3+bs*3];
14029 
14030 	// transpose
14031 	tmp = CC[1+bs*0]; CC[1+bs*0] = CC[0+bs*1]; CC[0+bs*1] = tmp;
14032 	tmp = CC[2+bs*0]; CC[2+bs*0] = CC[0+bs*2]; CC[0+bs*2] = tmp;
14033 	tmp = CC[3+bs*0]; CC[3+bs*0] = CC[0+bs*3]; CC[0+bs*3] = tmp;
14034 
14035 	tmp = CC[2+bs*1]; CC[2+bs*1] = CC[1+bs*2]; CC[1+bs*2] = tmp;
14036 	tmp = CC[3+bs*1]; CC[3+bs*1] = CC[1+bs*3]; CC[1+bs*3] = tmp;
14037 
14038 	tmp = CC[3+bs*2]; CC[3+bs*2] = CC[2+bs*3]; CC[2+bs*3] = tmp;
14039 
14040 	// store
14041 	if(m1>=4)
14042 		{
14043 		D[0+ldd*0] = CC[0+bs*0];
14044 		D[1+ldd*0] = CC[1+bs*0];
14045 		D[2+ldd*0] = CC[2+bs*0];
14046 		D[3+ldd*0] = CC[3+bs*0];
14047 
14048 		if(n1==1)
14049 			return;
14050 
14051 		D[0+ldd*1] = CC[0+bs*1];
14052 		D[1+ldd*1] = CC[1+bs*1];
14053 		D[2+ldd*1] = CC[2+bs*1];
14054 		D[3+ldd*1] = CC[3+bs*1];
14055 
14056 		if(n1==2)
14057 			return;
14058 
14059 		D[0+ldd*2] = CC[0+bs*2];
14060 		D[1+ldd*2] = CC[1+bs*2];
14061 		D[2+ldd*2] = CC[2+bs*2];
14062 		D[3+ldd*2] = CC[3+bs*2];
14063 
14064 		if(n1==3)
14065 			return;
14066 
14067 		D[0+ldd*3] = CC[0+bs*3];
14068 		D[1+ldd*3] = CC[1+bs*3];
14069 		D[2+ldd*3] = CC[2+bs*3];
14070 		D[3+ldd*3] = CC[3+bs*3];
14071 		}
14072 	else if(m1>=3)
14073 		{
14074 		D[0+ldd*0] = CC[0+bs*0];
14075 		D[1+ldd*0] = CC[1+bs*0];
14076 		D[2+ldd*0] = CC[2+bs*0];
14077 
14078 		if(n1==1)
14079 			return;
14080 
14081 		D[0+ldd*1] = CC[0+bs*1];
14082 		D[1+ldd*1] = CC[1+bs*1];
14083 		D[2+ldd*1] = CC[2+bs*1];
14084 
14085 		if(n1==2)
14086 			return;
14087 
14088 		D[0+ldd*2] = CC[0+bs*2];
14089 		D[1+ldd*2] = CC[1+bs*2];
14090 		D[2+ldd*2] = CC[2+bs*2];
14091 
14092 		if(n1==3)
14093 			return;
14094 
14095 		D[0+ldd*3] = CC[0+bs*3];
14096 		D[1+ldd*3] = CC[1+bs*3];
14097 		D[2+ldd*3] = CC[2+bs*3];
14098 		}
14099 	else if(m1>=2)
14100 		{
14101 		D[0+ldd*0] = CC[0+bs*0];
14102 		D[1+ldd*0] = CC[1+bs*0];
14103 
14104 		if(n1==1)
14105 			return;
14106 
14107 		D[0+ldd*1] = CC[0+bs*1];
14108 		D[1+ldd*1] = CC[1+bs*1];
14109 
14110 		if(n1==2)
14111 			return;
14112 
14113 		D[0+ldd*2] = CC[0+bs*2];
14114 		D[1+ldd*2] = CC[1+bs*2];
14115 
14116 		if(n1==3)
14117 			return;
14118 
14119 		D[0+ldd*3] = CC[0+bs*3];
14120 		D[1+ldd*3] = CC[1+bs*3];
14121 		}
14122 	else //if(m1>=1)
14123 		{
14124 		D[0+ldd*0] = CC[0+bs*0];
14125 
14126 		if(n1==1)
14127 			return;
14128 
14129 		D[0+ldd*1] = CC[0+bs*1];
14130 
14131 		if(n1==2)
14132 			return;
14133 
14134 		D[0+ldd*2] = CC[0+bs*2];
14135 
14136 		if(n1==3)
14137 			return;
14138 
14139 		D[0+ldd*3] = CC[0+bs*3];
14140 		}
14141 
14142 	return;
14143 
14144 	}
14145 #endif
14146 
14147 
14148 
14149 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
void kernel_dtrmm_nt_ru_one_4x4_lib4ccc(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd)
14151 	{
14152 
14153 	const int bs = 4;
14154 
14155 	double
14156 		a_0, a_1, a_2, a_3,
14157 		b_0, b_1, b_2, b_3;
14158 
14159 #if defined(TARGET_GENERIC)
14160 	double CC[16] = {0};
14161 #else
14162 	ALIGNED( double CC[16], 64 ) = {0};
14163 #endif
14164 
14165 	int k;
14166 
14167 	k = 0;
14168 
14169 	// k = 0
14170 
14171 	a_0 = A[0];
14172 	a_1 = A[1];
14173 	a_2 = A[2];
14174 	a_3 = A[3];
14175 
14176 	CC[0+bs*0] += a_0;
14177 	CC[1+bs*0] += a_1;
14178 	CC[2+bs*0] += a_2;
14179 	CC[3+bs*0] += a_3;
14180 
14181 	A += bs;
14182 	B += ldb;
14183 	k += 1;
14184 
14185 	if(k>=kmax)
14186 		goto store;
14187 
14188 	// k = 1
14189 
14190 	a_0 = A[0];
14191 	a_1 = A[1];
14192 	a_2 = A[2];
14193 	a_3 = A[3];
14194 
14195 	b_0 = B[0+0*ldb];
14196 	CC[0+bs*0] += a_0 * b_0;
14197 	CC[1+bs*0] += a_1 * b_0;
14198 	CC[2+bs*0] += a_2 * b_0;
14199 	CC[3+bs*0] += a_3 * b_0;
14200 
14201 	CC[0+bs*1] += a_0;
14202 	CC[1+bs*1] += a_1;
14203 	CC[2+bs*1] += a_2;
14204 	CC[3+bs*1] += a_3;
14205 
14206 	A += bs;
14207 	B += ldb;
14208 	k += 1;
14209 
14210 	if(k>=kmax)
14211 		goto store;
14212 
14213 	// k = 2
14214 
14215 	a_0 = A[0];
14216 	a_1 = A[1];
14217 	a_2 = A[2];
14218 	a_3 = A[3];
14219 
14220 	b_0 = B[0+0*ldb];
14221 	CC[0+bs*0] += a_0 * b_0;
14222 	CC[1+bs*0] += a_1 * b_0;
14223 	CC[2+bs*0] += a_2 * b_0;
14224 	CC[3+bs*0] += a_3 * b_0;
14225 
14226 	b_1 = B[1+0*ldb];
14227 	CC[0+bs*1] += a_0 * b_1;
14228 	CC[1+bs*1] += a_1 * b_1;
14229 	CC[2+bs*1] += a_2 * b_1;
14230 	CC[3+bs*1] += a_3 * b_1;
14231 
14232 	CC[0+bs*2] += a_0;
14233 	CC[1+bs*2] += a_1;
14234 	CC[2+bs*2] += a_2;
14235 	CC[3+bs*2] += a_3;
14236 
14237 	A += bs;
14238 	B += ldb;
14239 	k += 1;
14240 
14241 	if(k>=kmax)
14242 		goto store;
14243 
14244 	// k = 3
14245 
14246 	a_0 = A[0];
14247 	a_1 = A[1];
14248 	a_2 = A[2];
14249 	a_3 = A[3];
14250 
14251 	b_0 = B[0+0*ldb];
14252 	CC[0+bs*0] += a_0 * b_0;
14253 	CC[1+bs*0] += a_1 * b_0;
14254 	CC[2+bs*0] += a_2 * b_0;
14255 	CC[3+bs*0] += a_3 * b_0;
14256 
14257 	b_1 = B[1+0*ldb];
14258 	CC[0+bs*1] += a_0 * b_1;
14259 	CC[1+bs*1] += a_1 * b_1;
14260 	CC[2+bs*1] += a_2 * b_1;
14261 	CC[3+bs*1] += a_3 * b_1;
14262 
14263 	b_2 = B[2+0*ldb];
14264 	CC[0+bs*2] += a_0 * b_2;
14265 	CC[1+bs*2] += a_1 * b_2;
14266 	CC[2+bs*2] += a_2 * b_2;
14267 	CC[3+bs*2] += a_3 * b_2;
14268 
14269 	CC[0+bs*3] += a_0;
14270 	CC[1+bs*3] += a_1;
14271 	CC[2+bs*3] += a_2;
14272 	CC[3+bs*3] += a_3;
14273 
14274 	A += bs;
14275 	B += ldb;
14276 	k += 1;
14277 
14278 	store:
14279 
14280 	CC[0+bs*0] = alpha[0]*CC[0+bs*0] + beta[0]*C[0+ldc*0];
14281 	CC[1+bs*0] = alpha[0]*CC[1+bs*0] + beta[0]*C[1+ldc*0];
14282 	CC[2+bs*0] = alpha[0]*CC[2+bs*0] + beta[0]*C[2+ldc*0];
14283 	CC[3+bs*0] = alpha[0]*CC[3+bs*0] + beta[0]*C[3+ldc*0];
14284 
14285 	CC[0+bs*1] = alpha[0]*CC[0+bs*1] + beta[0]*C[0+ldc*1];
14286 	CC[1+bs*1] = alpha[0]*CC[1+bs*1] + beta[0]*C[1+ldc*1];
14287 	CC[2+bs*1] = alpha[0]*CC[2+bs*1] + beta[0]*C[2+ldc*1];
14288 	CC[3+bs*1] = alpha[0]*CC[3+bs*1] + beta[0]*C[3+ldc*1];
14289 
14290 	CC[0+bs*2] = alpha[0]*CC[0+bs*2] + beta[0]*C[0+ldc*2];
14291 	CC[1+bs*2] = alpha[0]*CC[1+bs*2] + beta[0]*C[1+ldc*2];
14292 	CC[2+bs*2] = alpha[0]*CC[2+bs*2] + beta[0]*C[2+ldc*2];
14293 	CC[3+bs*2] = alpha[0]*CC[3+bs*2] + beta[0]*C[3+ldc*2];
14294 
14295 	CC[0+bs*3] = alpha[0]*CC[0+bs*3] + beta[0]*C[0+ldc*3];
14296 	CC[1+bs*3] = alpha[0]*CC[1+bs*3] + beta[0]*C[1+ldc*3];
14297 	CC[2+bs*3] = alpha[0]*CC[2+bs*3] + beta[0]*C[2+ldc*3];
14298 	CC[3+bs*3] = alpha[0]*CC[3+bs*3] + beta[0]*C[3+ldc*3];
14299 
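	// remaining dense iterations: the gemm kernel accumulates into CC (beta = 1.0)
	// and writes the final 4x4 block directly to D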
14300 	double beta1 = 1.0;
14301 
14302 	kernel_dgemm_nt_4x4_lib4ccc(kmax-k, alpha, A, B, ldb, &beta1, CC, bs, D, ldd);
14303 
14304 	return;
14305 
14306 	}
14307 #endif
14308 
14309 
14310 
14311 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
void kernel_dtrmm_nt_ru_one_4x4_vs_lib4ccc(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1)
14313 	{
14314 
14315 	const int bs = 4;
14316 
14317 	double
14318 		a_0, a_1, a_2, a_3,
14319 		b_0, b_1, b_2, b_3;
14320 
14321 #if defined(TARGET_GENERIC)
14322 	double CC[16] = {0};
14323 #else
14324 	ALIGNED( double CC[16], 64 ) = {0};
14325 #endif
14326 
14327 	int k;
14328 
14329 	k = 0;
14330 
14331 	// k = 0
14332 
14333 	a_0 = A[0];
14334 	a_1 = A[1];
14335 	a_2 = A[2];
14336 	a_3 = A[3];
14337 
14338 	CC[0+bs*0] += a_0;
14339 	CC[1+bs*0] += a_1;
14340 	CC[2+bs*0] += a_2;
14341 	CC[3+bs*0] += a_3;
14342 
14343 	A += bs;
14344 	B += ldb;
14345 	k += 1;
14346 
14347 	if(k>=kmax)
14348 		goto store;
14349 
14350 	// k = 1
14351 
14352 	a_0 = A[0];
14353 	a_1 = A[1];
14354 	a_2 = A[2];
14355 	a_3 = A[3];
14356 
14357 	b_0 = B[0+0*ldb];
14358 	CC[0+bs*0] += a_0 * b_0;
14359 	CC[1+bs*0] += a_1 * b_0;
14360 	CC[2+bs*0] += a_2 * b_0;
14361 	CC[3+bs*0] += a_3 * b_0;
14362 
14363 	CC[0+bs*1] += a_0;
14364 	CC[1+bs*1] += a_1;
14365 	CC[2+bs*1] += a_2;
14366 	CC[3+bs*1] += a_3;
14367 
14368 	A += bs;
14369 	B += ldb;
14370 	k += 1;
14371 
14372 	if(k>=kmax)
14373 		goto store;
14374 
14375 	// k = 2
14376 
14377 	a_0 = A[0];
14378 	a_1 = A[1];
14379 	a_2 = A[2];
14380 	a_3 = A[3];
14381 
14382 	b_0 = B[0+0*ldb];
14383 	CC[0+bs*0] += a_0 * b_0;
14384 	CC[1+bs*0] += a_1 * b_0;
14385 	CC[2+bs*0] += a_2 * b_0;
14386 	CC[3+bs*0] += a_3 * b_0;
14387 
14388 	b_1 = B[1+0*ldb];
14389 	CC[0+bs*1] += a_0 * b_1;
14390 	CC[1+bs*1] += a_1 * b_1;
14391 	CC[2+bs*1] += a_2 * b_1;
14392 	CC[3+bs*1] += a_3 * b_1;
14393 
14394 	CC[0+bs*2] += a_0;
14395 	CC[1+bs*2] += a_1;
14396 	CC[2+bs*2] += a_2;
14397 	CC[3+bs*2] += a_3;
14398 
14399 	A += bs;
14400 	B += ldb;
14401 	k += 1;
14402 
14403 	if(k>=kmax)
14404 		goto store;
14405 
14406 	// k = 3
14407 
14408 	a_0 = A[0];
14409 	a_1 = A[1];
14410 	a_2 = A[2];
14411 	a_3 = A[3];
14412 
14413 	b_0 = B[0+0*ldb];
14414 	CC[0+bs*0] += a_0 * b_0;
14415 	CC[1+bs*0] += a_1 * b_0;
14416 	CC[2+bs*0] += a_2 * b_0;
14417 	CC[3+bs*0] += a_3 * b_0;
14418 
14419 	b_1 = B[1+0*ldb];
14420 	CC[0+bs*1] += a_0 * b_1;
14421 	CC[1+bs*1] += a_1 * b_1;
14422 	CC[2+bs*1] += a_2 * b_1;
14423 	CC[3+bs*1] += a_3 * b_1;
14424 
14425 	b_2 = B[2+0*ldb];
14426 	CC[0+bs*2] += a_0 * b_2;
14427 	CC[1+bs*2] += a_1 * b_2;
14428 	CC[2+bs*2] += a_2 * b_2;
14429 	CC[3+bs*2] += a_3 * b_2;
14430 
14431 	CC[0+bs*3] += a_0;
14432 	CC[1+bs*3] += a_1;
14433 	CC[2+bs*3] += a_2;
14434 	CC[3+bs*3] += a_3;
14435 
14436 	A += bs;
14437 	B += ldb;
14438 	k += 1;
14439 
14440 	store:
14441 
14442 	CC[0+bs*0] = alpha[0]*CC[0+bs*0] + beta[0]*C[0+ldc*0];
14443 	CC[1+bs*0] = alpha[0]*CC[1+bs*0] + beta[0]*C[1+ldc*0];
14444 	CC[2+bs*0] = alpha[0]*CC[2+bs*0] + beta[0]*C[2+ldc*0];
14445 	CC[3+bs*0] = alpha[0]*CC[3+bs*0] + beta[0]*C[3+ldc*0];
14446 
14447 	CC[0+bs*1] = alpha[0]*CC[0+bs*1] + beta[0]*C[0+ldc*1];
14448 	CC[1+bs*1] = alpha[0]*CC[1+bs*1] + beta[0]*C[1+ldc*1];
14449 	CC[2+bs*1] = alpha[0]*CC[2+bs*1] + beta[0]*C[2+ldc*1];
14450 	CC[3+bs*1] = alpha[0]*CC[3+bs*1] + beta[0]*C[3+ldc*1];
14451 
14452 	CC[0+bs*2] = alpha[0]*CC[0+bs*2] + beta[0]*C[0+ldc*2];
14453 	CC[1+bs*2] = alpha[0]*CC[1+bs*2] + beta[0]*C[1+ldc*2];
14454 	CC[2+bs*2] = alpha[0]*CC[2+bs*2] + beta[0]*C[2+ldc*2];
14455 	CC[3+bs*2] = alpha[0]*CC[3+bs*2] + beta[0]*C[3+ldc*2];
14456 
14457 	CC[0+bs*3] = alpha[0]*CC[0+bs*3] + beta[0]*C[0+ldc*3];
14458 	CC[1+bs*3] = alpha[0]*CC[1+bs*3] + beta[0]*C[1+ldc*3];
14459 	CC[2+bs*3] = alpha[0]*CC[2+bs*3] + beta[0]*C[2+ldc*3];
14460 	CC[3+bs*3] = alpha[0]*CC[3+bs*3] + beta[0]*C[3+ldc*3];
14461 
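	// remaining dense iterations accumulate into CC (beta = 1.0);
	// the size-masked store to D follows below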
14462 	double beta1 = 1.0;
14463 
14464 	kernel_dgemm_nt_4x4_lib4ccc(kmax-k, alpha, A, B, ldb, &beta1, CC, bs, CC, bs);
14465 
14466 	if(m1>=4)
14467 		{
14468 		D[0+ldd*0] = CC[0+bs*0];
14469 		D[1+ldd*0] = CC[1+bs*0];
14470 		D[2+ldd*0] = CC[2+bs*0];
14471 		D[3+ldd*0] = CC[3+bs*0];
14472 
14473 		if(n1==1)
14474 			return;
14475 
14476 		D[0+ldd*1] = CC[0+bs*1];
14477 		D[1+ldd*1] = CC[1+bs*1];
14478 		D[2+ldd*1] = CC[2+bs*1];
14479 		D[3+ldd*1] = CC[3+bs*1];
14480 
14481 		if(n1==2)
14482 			return;
14483 
14484 		D[0+ldd*2] = CC[0+bs*2];
14485 		D[1+ldd*2] = CC[1+bs*2];
14486 		D[2+ldd*2] = CC[2+bs*2];
14487 		D[3+ldd*2] = CC[3+bs*2];
14488 
14489 		if(n1==3)
14490 			return;
14491 
14492 		D[0+ldd*3] = CC[0+bs*3];
14493 		D[1+ldd*3] = CC[1+bs*3];
14494 		D[2+ldd*3] = CC[2+bs*3];
14495 		D[3+ldd*3] = CC[3+bs*3];
14496 		}
14497 	else if(m1>=3)
14498 		{
14499 		D[0+ldd*0] = CC[0+bs*0];
14500 		D[1+ldd*0] = CC[1+bs*0];
14501 		D[2+ldd*0] = CC[2+bs*0];
14502 
14503 		if(n1==1)
14504 			return;
14505 
14506 		D[0+ldd*1] = CC[0+bs*1];
14507 		D[1+ldd*1] = CC[1+bs*1];
14508 		D[2+ldd*1] = CC[2+bs*1];
14509 
14510 		if(n1==2)
14511 			return;
14512 
14513 		D[0+ldd*2] = CC[0+bs*2];
14514 		D[1+ldd*2] = CC[1+bs*2];
14515 		D[2+ldd*2] = CC[2+bs*2];
14516 
14517 		if(n1==3)
14518 			return;
14519 
14520 		D[0+ldd*3] = CC[0+bs*3];
14521 		D[1+ldd*3] = CC[1+bs*3];
14522 		D[2+ldd*3] = CC[2+bs*3];
14523 		}
14524 	else if(m1>=2)
14525 		{
14526 		D[0+ldd*0] = CC[0+bs*0];
14527 		D[1+ldd*0] = CC[1+bs*0];
14528 
14529 		if(n1==1)
14530 			return;
14531 
14532 		D[0+ldd*1] = CC[0+bs*1];
14533 		D[1+ldd*1] = CC[1+bs*1];
14534 
14535 		if(n1==2)
14536 			return;
14537 
14538 		D[0+ldd*2] = CC[0+bs*2];
14539 		D[1+ldd*2] = CC[1+bs*2];
14540 
14541 		if(n1==3)
14542 			return;
14543 
14544 		D[0+ldd*3] = CC[0+bs*3];
14545 		D[1+ldd*3] = CC[1+bs*3];
14546 		}
14547 	else //if(m1>=1)
14548 		{
14549 		D[0+ldd*0] = CC[0+bs*0];
14550 
14551 		if(n1==1)
14552 			return;
14553 
14554 		D[0+ldd*1] = CC[0+bs*1];
14555 
14556 		if(n1==2)
14557 			return;
14558 
14559 		D[0+ldd*2] = CC[0+bs*2];
14560 
14561 		if(n1==3)
14562 			return;
14563 
14564 		D[0+ldd*3] = CC[0+bs*3];
14565 		}
14566 
14567 	return;
14568 
14569 	}
14570 #endif
14571 
14572 
14573 
14574 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
void kernel_dtrmm_nt_ru_one_4x4_tran_lib4c4c(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd)
14576 	{
14577 
14578 	const int bs = 4;
14579 
14580 	double
14581 		a_0, a_1, a_2, a_3,
14582 		b_0, b_1, b_2, b_3;
14583 
14584 #if defined(TARGET_GENERIC)
14585 	double CC[16] = {0};
14586 #else
14587 	ALIGNED( double CC[16], 64 ) = {0};
14588 #endif
14589 
14590 	int k;
14591 
14592 	k = 0;
14593 
14594 	// k = 0
14595 
14596 	a_0 = A[0];
14597 	a_1 = A[1];
14598 	a_2 = A[2];
14599 	a_3 = A[3];
14600 
14601 	CC[0+bs*0] += a_0;
14602 	CC[1+bs*0] += a_1;
14603 	CC[2+bs*0] += a_2;
14604 	CC[3+bs*0] += a_3;
14605 
14606 	A += bs;
14607 	B += ldb;
14608 	k += 1;
14609 
14610 	if(k>=kmax)
14611 		goto store;
14612 
14613 	// k = 1
14614 
14615 	a_0 = A[0];
14616 	a_1 = A[1];
14617 	a_2 = A[2];
14618 	a_3 = A[3];
14619 
14620 	b_0 = B[0+0*ldb];
14621 	CC[0+bs*0] += a_0 * b_0;
14622 	CC[1+bs*0] += a_1 * b_0;
14623 	CC[2+bs*0] += a_2 * b_0;
14624 	CC[3+bs*0] += a_3 * b_0;
14625 
14626 	CC[0+bs*1] += a_0;
14627 	CC[1+bs*1] += a_1;
14628 	CC[2+bs*1] += a_2;
14629 	CC[3+bs*1] += a_3;
14630 
14631 	A += bs;
14632 	B += ldb;
14633 	k += 1;
14634 
14635 	if(k>=kmax)
14636 		goto store;
14637 
14638 	// k = 2
14639 
14640 	a_0 = A[0];
14641 	a_1 = A[1];
14642 	a_2 = A[2];
14643 	a_3 = A[3];
14644 
14645 	b_0 = B[0+0*ldb];
14646 	CC[0+bs*0] += a_0 * b_0;
14647 	CC[1+bs*0] += a_1 * b_0;
14648 	CC[2+bs*0] += a_2 * b_0;
14649 	CC[3+bs*0] += a_3 * b_0;
14650 
14651 	b_1 = B[1+0*ldb];
14652 	CC[0+bs*1] += a_0 * b_1;
14653 	CC[1+bs*1] += a_1 * b_1;
14654 	CC[2+bs*1] += a_2 * b_1;
14655 	CC[3+bs*1] += a_3 * b_1;
14656 
14657 	CC[0+bs*2] += a_0;
14658 	CC[1+bs*2] += a_1;
14659 	CC[2+bs*2] += a_2;
14660 	CC[3+bs*2] += a_3;
14661 
14662 	A += bs;
14663 	B += ldb;
14664 	k += 1;
14665 
14666 	if(k>=kmax)
14667 		goto store;
14668 
14669 	// k = 3
14670 
14671 	a_0 = A[0];
14672 	a_1 = A[1];
14673 	a_2 = A[2];
14674 	a_3 = A[3];
14675 
14676 	b_0 = B[0+0*ldb];
14677 	CC[0+bs*0] += a_0 * b_0;
14678 	CC[1+bs*0] += a_1 * b_0;
14679 	CC[2+bs*0] += a_2 * b_0;
14680 	CC[3+bs*0] += a_3 * b_0;
14681 
14682 	b_1 = B[1+0*ldb];
14683 	CC[0+bs*1] += a_0 * b_1;
14684 	CC[1+bs*1] += a_1 * b_1;
14685 	CC[2+bs*1] += a_2 * b_1;
14686 	CC[3+bs*1] += a_3 * b_1;
14687 
14688 	b_2 = B[2+0*ldb];
14689 	CC[0+bs*2] += a_0 * b_2;
14690 	CC[1+bs*2] += a_1 * b_2;
14691 	CC[2+bs*2] += a_2 * b_2;
14692 	CC[3+bs*2] += a_3 * b_2;
14693 
14694 	CC[0+bs*3] += a_0;
14695 	CC[1+bs*3] += a_1;
14696 	CC[2+bs*3] += a_2;
14697 	CC[3+bs*3] += a_3;
14698 
14699 	A += bs;
14700 	B += ldb;
14701 	k += 1;
14702 
14703 	double alpha1 = 1.0;
14704 	double beta1 = 1.0;
14705 
14706 	kernel_dgemm_nt_4x4_lib4ccc(kmax-k, &alpha1, A, B, ldb, &beta1, CC, bs, CC, bs);
14707 
14708 	store:
14709 
	// scale & transpose & store
14711 	D[0+ldd*0] = alpha[0]*CC[0+bs*0] + beta[0]*C[0+bs*0];
14712 	D[0+ldd*1] = alpha[0]*CC[1+bs*0] + beta[0]*C[1+bs*0];
14713 	D[0+ldd*2] = alpha[0]*CC[2+bs*0] + beta[0]*C[2+bs*0];
14714 	D[0+ldd*3] = alpha[0]*CC[3+bs*0] + beta[0]*C[3+bs*0];
14715 
14716 	D[1+ldd*0] = alpha[0]*CC[0+bs*1] + beta[0]*C[0+bs*1];
14717 	D[1+ldd*1] = alpha[0]*CC[1+bs*1] + beta[0]*C[1+bs*1];
14718 	D[1+ldd*2] = alpha[0]*CC[2+bs*1] + beta[0]*C[2+bs*1];
14719 	D[1+ldd*3] = alpha[0]*CC[3+bs*1] + beta[0]*C[3+bs*1];
14720 
14721 	D[2+ldd*0] = alpha[0]*CC[0+bs*2] + beta[0]*C[0+bs*2];
14722 	D[2+ldd*1] = alpha[0]*CC[1+bs*2] + beta[0]*C[1+bs*2];
14723 	D[2+ldd*2] = alpha[0]*CC[2+bs*2] + beta[0]*C[2+bs*2];
14724 	D[2+ldd*3] = alpha[0]*CC[3+bs*2] + beta[0]*C[3+bs*2];
14725 
14726 	D[3+ldd*0] = alpha[0]*CC[0+bs*3] + beta[0]*C[0+bs*3];
14727 	D[3+ldd*1] = alpha[0]*CC[1+bs*3] + beta[0]*C[1+bs*3];
14728 	D[3+ldd*2] = alpha[0]*CC[2+bs*3] + beta[0]*C[2+bs*3];
14729 	D[3+ldd*3] = alpha[0]*CC[3+bs*3] + beta[0]*C[3+bs*3];
14730 
14731 	return;
14732 
14733 	}
14734 #endif
14735 
14736 
14737 
14738 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
void kernel_dtrmm_nt_ru_one_4x4_tran_vs_lib4c4c(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd, int m1, int n1)
14740 	{
14741 
14742 	const int bs = 4;
14743 
14744 	double
14745 		tmp,
14746 		a_0, a_1, a_2, a_3,
14747 		b_0, b_1, b_2, b_3;
14748 
14749 #if defined(TARGET_GENERIC)
14750 	double CC[16] = {0};
14751 #else
14752 	ALIGNED( double CC[16], 64 ) = {0};
14753 #endif
14754 
14755 	int k;
14756 
14757 	k = 0;
14758 
14759 	// k = 0
14760 
14761 	a_0 = A[0];
14762 	a_1 = A[1];
14763 	a_2 = A[2];
14764 	a_3 = A[3];
14765 
14766 	CC[0+bs*0] += a_0;
14767 	CC[1+bs*0] += a_1;
14768 	CC[2+bs*0] += a_2;
14769 	CC[3+bs*0] += a_3;
14770 
14771 	A += bs;
14772 	B += ldb;
14773 	k += 1;
14774 
14775 	if(k>=kmax)
14776 		goto store;
14777 
14778 	// k = 1
14779 
14780 	a_0 = A[0];
14781 	a_1 = A[1];
14782 	a_2 = A[2];
14783 	a_3 = A[3];
14784 
14785 	b_0 = B[0+0*ldb];
14786 	CC[0+bs*0] += a_0 * b_0;
14787 	CC[1+bs*0] += a_1 * b_0;
14788 	CC[2+bs*0] += a_2 * b_0;
14789 	CC[3+bs*0] += a_3 * b_0;
14790 
14791 	CC[0+bs*1] += a_0;
14792 	CC[1+bs*1] += a_1;
14793 	CC[2+bs*1] += a_2;
14794 	CC[3+bs*1] += a_3;
14795 
14796 	A += bs;
14797 	B += ldb;
14798 	k += 1;
14799 
14800 	if(k>=kmax)
14801 		goto store;
14802 
14803 	// k = 2
14804 
14805 	a_0 = A[0];
14806 	a_1 = A[1];
14807 	a_2 = A[2];
14808 	a_3 = A[3];
14809 
14810 	b_0 = B[0+0*ldb];
14811 	CC[0+bs*0] += a_0 * b_0;
14812 	CC[1+bs*0] += a_1 * b_0;
14813 	CC[2+bs*0] += a_2 * b_0;
14814 	CC[3+bs*0] += a_3 * b_0;
14815 
14816 	b_1 = B[1+0*ldb];
14817 	CC[0+bs*1] += a_0 * b_1;
14818 	CC[1+bs*1] += a_1 * b_1;
14819 	CC[2+bs*1] += a_2 * b_1;
14820 	CC[3+bs*1] += a_3 * b_1;
14821 
14822 	CC[0+bs*2] += a_0;
14823 	CC[1+bs*2] += a_1;
14824 	CC[2+bs*2] += a_2;
14825 	CC[3+bs*2] += a_3;
14826 
14827 	A += bs;
14828 	B += ldb;
14829 	k += 1;
14830 
14831 	if(k>=kmax)
14832 		goto store;
14833 
14834 	// k = 3
14835 
14836 	a_0 = A[0];
14837 	a_1 = A[1];
14838 	a_2 = A[2];
14839 	a_3 = A[3];
14840 
14841 	b_0 = B[0+0*ldb];
14842 	CC[0+bs*0] += a_0 * b_0;
14843 	CC[1+bs*0] += a_1 * b_0;
14844 	CC[2+bs*0] += a_2 * b_0;
14845 	CC[3+bs*0] += a_3 * b_0;
14846 
14847 	b_1 = B[1+0*ldb];
14848 	CC[0+bs*1] += a_0 * b_1;
14849 	CC[1+bs*1] += a_1 * b_1;
14850 	CC[2+bs*1] += a_2 * b_1;
14851 	CC[3+bs*1] += a_3 * b_1;
14852 
14853 	b_2 = B[2+0*ldb];
14854 	CC[0+bs*2] += a_0 * b_2;
14855 	CC[1+bs*2] += a_1 * b_2;
14856 	CC[2+bs*2] += a_2 * b_2;
14857 	CC[3+bs*2] += a_3 * b_2;
14858 
14859 	CC[0+bs*3] += a_0;
14860 	CC[1+bs*3] += a_1;
14861 	CC[2+bs*3] += a_2;
14862 	CC[3+bs*3] += a_3;
14863 
14864 	A += bs;
14865 	B += ldb;
14866 	k += 1;
14867 
14868 	double alpha1 = 1.0;
14869 	double beta1 = 1.0;
14870 
14871 	kernel_dgemm_nt_4x4_lib4ccc(kmax-k, &alpha1, A, B, ldb, &beta1, CC, bs, CC, bs);
14872 
14873 	store:
14874 
14875 	// scale
14876 	CC[0+bs*0] = alpha[0]*CC[0+bs*0] + beta[0]*C[0+bs*0];
14877 	CC[1+bs*0] = alpha[0]*CC[1+bs*0] + beta[0]*C[1+bs*0];
14878 	CC[2+bs*0] = alpha[0]*CC[2+bs*0] + beta[0]*C[2+bs*0];
14879 	CC[3+bs*0] = alpha[0]*CC[3+bs*0] + beta[0]*C[3+bs*0];
14880 
14881 	CC[0+bs*1] = alpha[0]*CC[0+bs*1] + beta[0]*C[0+bs*1];
14882 	CC[1+bs*1] = alpha[0]*CC[1+bs*1] + beta[0]*C[1+bs*1];
14883 	CC[2+bs*1] = alpha[0]*CC[2+bs*1] + beta[0]*C[2+bs*1];
14884 	CC[3+bs*1] = alpha[0]*CC[3+bs*1] + beta[0]*C[3+bs*1];
14885 
14886 	CC[0+bs*2] = alpha[0]*CC[0+bs*2] + beta[0]*C[0+bs*2];
14887 	CC[1+bs*2] = alpha[0]*CC[1+bs*2] + beta[0]*C[1+bs*2];
14888 	CC[2+bs*2] = alpha[0]*CC[2+bs*2] + beta[0]*C[2+bs*2];
14889 	CC[3+bs*2] = alpha[0]*CC[3+bs*2] + beta[0]*C[3+bs*2];
14890 
14891 	CC[0+bs*3] = alpha[0]*CC[0+bs*3] + beta[0]*C[0+bs*3];
14892 	CC[1+bs*3] = alpha[0]*CC[1+bs*3] + beta[0]*C[1+bs*3];
14893 	CC[2+bs*3] = alpha[0]*CC[2+bs*3] + beta[0]*C[2+bs*3];
14894 	CC[3+bs*3] = alpha[0]*CC[3+bs*3] + beta[0]*C[3+bs*3];
14895 
14896 	// transpose
14897 	tmp = CC[1+bs*0]; CC[1+bs*0] = CC[0+bs*1]; CC[0+bs*1] = tmp;
14898 	tmp = CC[2+bs*0]; CC[2+bs*0] = CC[0+bs*2]; CC[0+bs*2] = tmp;
14899 	tmp = CC[3+bs*0]; CC[3+bs*0] = CC[0+bs*3]; CC[0+bs*3] = tmp;
14900 
14901 	tmp = CC[2+bs*1]; CC[2+bs*1] = CC[1+bs*2]; CC[1+bs*2] = tmp;
14902 	tmp = CC[3+bs*1]; CC[3+bs*1] = CC[1+bs*3]; CC[1+bs*3] = tmp;
14903 
14904 	tmp = CC[3+bs*2]; CC[3+bs*2] = CC[2+bs*3]; CC[2+bs*3] = tmp;
14905 
14906 	// store
14907 	if(m1>=4)
14908 		{
14909 		D[0+ldd*0] = CC[0+bs*0];
14910 		D[1+ldd*0] = CC[1+bs*0];
14911 		D[2+ldd*0] = CC[2+bs*0];
14912 		D[3+ldd*0] = CC[3+bs*0];
14913 
14914 		if(n1==1)
14915 			return;
14916 
14917 		D[0+ldd*1] = CC[0+bs*1];
14918 		D[1+ldd*1] = CC[1+bs*1];
14919 		D[2+ldd*1] = CC[2+bs*1];
14920 		D[3+ldd*1] = CC[3+bs*1];
14921 
14922 		if(n1==2)
14923 			return;
14924 
14925 		D[0+ldd*2] = CC[0+bs*2];
14926 		D[1+ldd*2] = CC[1+bs*2];
14927 		D[2+ldd*2] = CC[2+bs*2];
14928 		D[3+ldd*2] = CC[3+bs*2];
14929 
14930 		if(n1==3)
14931 			return;
14932 
14933 		D[0+ldd*3] = CC[0+bs*3];
14934 		D[1+ldd*3] = CC[1+bs*3];
14935 		D[2+ldd*3] = CC[2+bs*3];
14936 		D[3+ldd*3] = CC[3+bs*3];
14937 		}
14938 	else if(m1>=3)
14939 		{
14940 		D[0+ldd*0] = CC[0+bs*0];
14941 		D[1+ldd*0] = CC[1+bs*0];
14942 		D[2+ldd*0] = CC[2+bs*0];
14943 
14944 		if(n1==1)
14945 			return;
14946 
14947 		D[0+ldd*1] = CC[0+bs*1];
14948 		D[1+ldd*1] = CC[1+bs*1];
14949 		D[2+ldd*1] = CC[2+bs*1];
14950 
14951 		if(n1==2)
14952 			return;
14953 
14954 		D[0+ldd*2] = CC[0+bs*2];
14955 		D[1+ldd*2] = CC[1+bs*2];
14956 		D[2+ldd*2] = CC[2+bs*2];
14957 
14958 		if(n1==3)
14959 			return;
14960 
14961 		D[0+ldd*3] = CC[0+bs*3];
14962 		D[1+ldd*3] = CC[1+bs*3];
14963 		D[2+ldd*3] = CC[2+bs*3];
14964 		}
14965 	else if(m1>=2)
14966 		{
14967 		D[0+ldd*0] = CC[0+bs*0];
14968 		D[1+ldd*0] = CC[1+bs*0];
14969 
14970 		if(n1==1)
14971 			return;
14972 
14973 		D[0+ldd*1] = CC[0+bs*1];
14974 		D[1+ldd*1] = CC[1+bs*1];
14975 
14976 		if(n1==2)
14977 			return;
14978 
14979 		D[0+ldd*2] = CC[0+bs*2];
14980 		D[1+ldd*2] = CC[1+bs*2];
14981 
14982 		if(n1==3)
14983 			return;
14984 
14985 		D[0+ldd*3] = CC[0+bs*3];
14986 		D[1+ldd*3] = CC[1+bs*3];
14987 		}
14988 	else //if(m1>=1)
14989 		{
14990 		D[0+ldd*0] = CC[0+bs*0];
14991 
14992 		if(n1==1)
14993 			return;
14994 
14995 		D[0+ldd*1] = CC[0+bs*1];
14996 
14997 		if(n1==2)
14998 			return;
14999 
15000 		D[0+ldd*2] = CC[0+bs*2];
15001 
15002 		if(n1==3)
15003 			return;
15004 
15005 		D[0+ldd*3] = CC[0+bs*3];
15006 		}
15007 
15008 	return;
15009 
15010 	}
15011 #endif
15012 
15013 
15014 
15015 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER)
void kernel_dpotrf_nt_l_4x4_lib44cc(int kmax, double *A, double *B, double *C, int ldc, double *D, int ldd, double *inv_diag_D)
15017 	{
15018 
15019 	const int bs = 4;
15020 
15021 #if defined(TARGET_GENERIC)
15022 	double CC[16] = {0};
15023 #else
15024 	ALIGNED( double CC[16], 64 ) = {0};
15025 #endif
15026 
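	// load the lower triangular part of C into the aligned scratch buffer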
15027 	CC[0+bs*0] = C[0+ldc*0];
15028 	CC[1+bs*0] = C[1+ldc*0];
15029 	CC[2+bs*0] = C[2+ldc*0];
15030 	CC[3+bs*0] = C[3+ldc*0];
15031 
15032 	CC[1+bs*1] = C[1+ldc*1];
15033 	CC[2+bs*1] = C[2+ldc*1];
15034 	CC[3+bs*1] = C[3+ldc*1];
15035 
15036 	CC[2+bs*2] = C[2+ldc*2];
15037 	CC[3+bs*2] = C[3+ldc*2];
15038 
15039 	CC[3+bs*3] = C[3+ldc*3];
15040 
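	// factorize in place in the scratch buffer with the panel-major kernel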
15041 	kernel_dpotrf_nt_l_4x4_lib4(kmax, A, B, CC, CC, inv_diag_D);
15042 
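	// store only the lower triangular part of the factor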
15043 	D[0+ldd*0] = CC[0+bs*0];
15044 	D[1+ldd*0] = CC[1+bs*0];
15045 	D[2+ldd*0] = CC[2+bs*0];
15046 	D[3+ldd*0] = CC[3+bs*0];
15047 
15048 	D[1+ldd*1] = CC[1+bs*1];
15049 	D[2+ldd*1] = CC[2+bs*1];
15050 	D[3+ldd*1] = CC[3+bs*1];
15051 
15052 	D[2+ldd*2] = CC[2+bs*2];
15053 	D[3+ldd*2] = CC[3+bs*2];
15054 
15055 	D[3+ldd*3] = CC[3+bs*3];
15056 
15057 	return;
15058 
15059 	}
15060 #endif
15061 
15062 
15063 
15064 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9)
void kernel_dpotrf_nt_l_4x4_vs_lib44cc(int kmax, double *A, double *B, double *C, int ldc, double *D, int ldd, double *inv_diag_D, int m1, int n1)
15066 	{
15067 
15068 	const int bs = 4;
15069 
15070 #if defined(TARGET_GENERIC)
15071 	double CC[16] = {0};
15072 #else
15073 	ALIGNED( double CC[16], 64 ) = {0};
15074 #endif
15075 
15076 	if(m1>=4)
15077 		{
15078 		CC[0+bs*0] = C[0+ldc*0];
15079 		CC[1+bs*0] = C[1+ldc*0];
15080 		CC[2+bs*0] = C[2+ldc*0];
15081 		CC[3+bs*0] = C[3+ldc*0];
15082 
15083 		if(n1==1)
15084 			goto kernel;
15085 
15086 		CC[1+bs*1] = C[1+ldc*1];
15087 		CC[2+bs*1] = C[2+ldc*1];
15088 		CC[3+bs*1] = C[3+ldc*1];
15089 
15090 		if(n1==2)
15091 			goto kernel;
15092 
15093 		CC[2+bs*2] = C[2+ldc*2];
15094 		CC[3+bs*2] = C[3+ldc*2];
15095 
15096 		if(n1==3)
15097 			goto kernel;
15098 
15099 		CC[3+bs*3] = C[3+ldc*3];
15100 		}
15101 	else if(m1>=3)
15102 		{
15103 		CC[0+bs*0] = C[0+ldc*0];
15104 		CC[1+bs*0] = C[1+ldc*0];
15105 		CC[2+bs*0] = C[2+ldc*0];
15106 
15107 		if(n1==1)
15108 			goto kernel;
15109 
15110 		CC[1+bs*1] = C[1+ldc*1];
15111 		CC[2+bs*1] = C[2+ldc*1];
15112 
15113 		if(n1==2)
15114 			goto kernel;
15115 
15116 		CC[2+bs*2] = C[2+ldc*2];
15117 		}
15118 	else if(m1>=2)
15119 		{
15120 		CC[0+bs*0] = C[0+ldc*0];
15121 		CC[1+bs*0] = C[1+ldc*0];
15122 
15123 		if(n1==1)
15124 			goto kernel;
15125 
15126 		CC[1+bs*1] = C[1+ldc*1];
15127 		}
15128 	else //if(m1>=1)
15129 		{
15130 		CC[0+bs*0] = C[0+ldc*0];
15131 		}
15132 
15133 kernel:
15134 	kernel_dpotrf_nt_l_4x4_vs_lib4(kmax, A, B, CC, CC, inv_diag_D, m1, n1);
15135 
15136 	if(m1>=4)
15137 		{
15138 		D[0+ldd*0] = CC[0+bs*0];
15139 		D[1+ldd*0] = CC[1+bs*0];
15140 		D[2+ldd*0] = CC[2+bs*0];
15141 		D[3+ldd*0] = CC[3+bs*0];
15142 
15143 		if(n1==1)
15144 			goto end;
15145 
15146 		D[1+ldd*1] = CC[1+bs*1];
15147 		D[2+ldd*1] = CC[2+bs*1];
15148 		D[3+ldd*1] = CC[3+bs*1];
15149 
15150 		if(n1==2)
15151 			goto end;
15152 
15153 		D[2+ldd*2] = CC[2+bs*2];
15154 		D[3+ldd*2] = CC[3+bs*2];
15155 
15156 		if(n1==3)
15157 			goto end;
15158 
15159 		D[3+ldd*3] = CC[3+bs*3];
15160 		}
15161 	else if(m1>=3)
15162 		{
15163 		D[0+ldd*0] = CC[0+bs*0];
15164 		D[1+ldd*0] = CC[1+bs*0];
15165 		D[2+ldd*0] = CC[2+bs*0];
15166 
15167 		if(n1==1)
15168 			goto end;
15169 
15170 		D[1+ldd*1] = CC[1+bs*1];
15171 		D[2+ldd*1] = CC[2+bs*1];
15172 
15173 		if(n1==2)
15174 			goto end;
15175 
15176 		D[2+ldd*2] = CC[2+bs*2];
15177 		}
15178 	else if(m1>=2)
15179 		{
15180 		D[0+ldd*0] = CC[0+bs*0];
15181 		D[1+ldd*0] = CC[1+bs*0];
15182 
15183 		if(n1==1)
15184 			goto end;
15185 
15186 		D[1+ldd*1] = CC[1+bs*1];
15187 		}
15188 	else //if(m1>=1)
15189 		{
15190 		D[0+ldd*0] = CC[0+bs*0];
15191 		}
15192 
15193 end:
15194 	return;
15195 
15196 	}
15197 #endif
15198 
15199 
15200 
15201 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
void kernel_dtrsm_nn_rl_inv_4x4_lib4c44c(int kmax, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde, double *inv_diag_E)
15203 	{
15204 
15205 	const int bs = 4;
15206 
15207 	double tmp;
15208 
15209 #if defined(TARGET_GENERIC)
15210 	double CC[16] = {0};
15211 #else
15212 	ALIGNED( double CC[16], 64 ) = {0};
15213 #endif
15214 
15215 	double alpha1 = -1.0;
15216 
15217 	kernel_dgemm_nn_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, bs, CC, bs);
15218 
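	// solve against the lower triangular factor E on the right: process columns
	// from last to first, scaling by the inverted diagonal and eliminating the
	// solved column from the ones still to be processed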
15219 	tmp = inv_diag_E[3];
15220 	CC[0+bs*3] *= tmp;
15221 	CC[1+bs*3] *= tmp;
15222 	CC[2+bs*3] *= tmp;
15223 	CC[3+bs*3] *= tmp;
15224 	tmp = E[3+lde*0];
15225 	CC[0+bs*0] -= CC[0+bs*3] * tmp;
15226 	CC[1+bs*0] -= CC[1+bs*3] * tmp;
15227 	CC[2+bs*0] -= CC[2+bs*3] * tmp;
15228 	CC[3+bs*0] -= CC[3+bs*3] * tmp;
15229 	tmp = E[3+lde*1];
15230 	CC[0+bs*1] -= CC[0+bs*3] * tmp;
15231 	CC[1+bs*1] -= CC[1+bs*3] * tmp;
15232 	CC[2+bs*1] -= CC[2+bs*3] * tmp;
15233 	CC[3+bs*1] -= CC[3+bs*3] * tmp;
15234 	tmp = E[3+lde*2];
15235 	CC[0+bs*2] -= CC[0+bs*3] * tmp;
15236 	CC[1+bs*2] -= CC[1+bs*3] * tmp;
15237 	CC[2+bs*2] -= CC[2+bs*3] * tmp;
15238 	CC[3+bs*2] -= CC[3+bs*3] * tmp;
15239 
15240 	tmp = inv_diag_E[2];
15241 	CC[0+bs*2] *= tmp;
15242 	CC[1+bs*2] *= tmp;
15243 	CC[2+bs*2] *= tmp;
15244 	CC[3+bs*2] *= tmp;
15245 	tmp = E[2+lde*0];
15246 	CC[0+bs*0] -= CC[0+bs*2] * tmp;
15247 	CC[1+bs*0] -= CC[1+bs*2] * tmp;
15248 	CC[2+bs*0] -= CC[2+bs*2] * tmp;
15249 	CC[3+bs*0] -= CC[3+bs*2] * tmp;
15250 	tmp = E[2+lde*1];
15251 	CC[0+bs*1] -= CC[0+bs*2] * tmp;
15252 	CC[1+bs*1] -= CC[1+bs*2] * tmp;
15253 	CC[2+bs*1] -= CC[2+bs*2] * tmp;
15254 	CC[3+bs*1] -= CC[3+bs*2] * tmp;
15255 
15256 	tmp = inv_diag_E[1];
15257 	CC[0+bs*1] *= tmp;
15258 	CC[1+bs*1] *= tmp;
15259 	CC[2+bs*1] *= tmp;
15260 	CC[3+bs*1] *= tmp;
15261 	tmp = E[1+lde*0];
15262 	CC[0+bs*0] -= CC[0+bs*1] * tmp;
15263 	CC[1+bs*0] -= CC[1+bs*1] * tmp;
15264 	CC[2+bs*0] -= CC[2+bs*1] * tmp;
15265 	CC[3+bs*0] -= CC[3+bs*1] * tmp;
15266 
15267 	tmp = inv_diag_E[0];
15268 	CC[0+bs*0] *= tmp;
15269 	CC[1+bs*0] *= tmp;
15270 	CC[2+bs*0] *= tmp;
15271 	CC[3+bs*0] *= tmp;
15272 
15273 
15274 	D[0+bs*0] = CC[0+bs*0];
15275 	D[1+bs*0] = CC[1+bs*0];
15276 	D[2+bs*0] = CC[2+bs*0];
15277 	D[3+bs*0] = CC[3+bs*0];
15278 
15279 	D[0+bs*1] = CC[0+bs*1];
15280 	D[1+bs*1] = CC[1+bs*1];
15281 	D[2+bs*1] = CC[2+bs*1];
15282 	D[3+bs*1] = CC[3+bs*1];
15283 
15284 	D[0+bs*2] = CC[0+bs*2];
15285 	D[1+bs*2] = CC[1+bs*2];
15286 	D[2+bs*2] = CC[2+bs*2];
15287 	D[3+bs*2] = CC[3+bs*2];
15288 
15289 	D[0+bs*3] = CC[0+bs*3];
15290 	D[1+bs*3] = CC[1+bs*3];
15291 	D[2+bs*3] = CC[2+bs*3];
15292 	D[3+bs*3] = CC[3+bs*3];
15293 
15294 	return;
15295 
15296 	}
15297 #endif
15298 
15299 
15300 
15301 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
void kernel_dtrsm_nn_rl_inv_4x4_vs_lib4c44c(int kmax, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde, double *inv_diag_E, int m1, int n1)
15303 	{
15304 
15305 	const int bs = 4;
15306 
15307 	double tmp;
15308 
15309 #if defined(TARGET_GENERIC)
15310 	double CC[16] = {0};
15311 #else
15312 	ALIGNED( double CC[16], 64 ) = {0};
15313 #endif
15314 
15315 	double alpha1 = -1.0;
15316 
15317 	kernel_dgemm_nn_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, bs, CC, bs);
15318 
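	// same substitution as in the fixed-size kernel, but columns with index >= n1 are skipped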
15319 	if(n1<=3)
15320 		goto n3;
15321 
15322 	tmp = inv_diag_E[3];
15323 	CC[0+bs*3] *= tmp;
15324 	CC[1+bs*3] *= tmp;
15325 	CC[2+bs*3] *= tmp;
15326 	CC[3+bs*3] *= tmp;
15327 	tmp = E[3+lde*0];
15328 	CC[0+bs*0] -= CC[0+bs*3] * tmp;
15329 	CC[1+bs*0] -= CC[1+bs*3] * tmp;
15330 	CC[2+bs*0] -= CC[2+bs*3] * tmp;
15331 	CC[3+bs*0] -= CC[3+bs*3] * tmp;
15332 	tmp = E[3+lde*1];
15333 	CC[0+bs*1] -= CC[0+bs*3] * tmp;
15334 	CC[1+bs*1] -= CC[1+bs*3] * tmp;
15335 	CC[2+bs*1] -= CC[2+bs*3] * tmp;
15336 	CC[3+bs*1] -= CC[3+bs*3] * tmp;
15337 	tmp = E[3+lde*2];
15338 	CC[0+bs*2] -= CC[0+bs*3] * tmp;
15339 	CC[1+bs*2] -= CC[1+bs*3] * tmp;
15340 	CC[2+bs*2] -= CC[2+bs*3] * tmp;
15341 	CC[3+bs*2] -= CC[3+bs*3] * tmp;
15342 
15343 n3:
15344 	if(n1<=2)
15345 		goto n2;
15346 
15347 	tmp = inv_diag_E[2];
15348 	CC[0+bs*2] *= tmp;
15349 	CC[1+bs*2] *= tmp;
15350 	CC[2+bs*2] *= tmp;
15351 	CC[3+bs*2] *= tmp;
15352 	tmp = E[2+lde*0];
15353 	CC[0+bs*0] -= CC[0+bs*2] * tmp;
15354 	CC[1+bs*0] -= CC[1+bs*2] * tmp;
15355 	CC[2+bs*0] -= CC[2+bs*2] * tmp;
15356 	CC[3+bs*0] -= CC[3+bs*2] * tmp;
15357 	tmp = E[2+lde*1];
15358 	CC[0+bs*1] -= CC[0+bs*2] * tmp;
15359 	CC[1+bs*1] -= CC[1+bs*2] * tmp;
15360 	CC[2+bs*1] -= CC[2+bs*2] * tmp;
15361 	CC[3+bs*1] -= CC[3+bs*2] * tmp;
15362 
15363 n2:
15364 	if(n1<=1)
15365 		goto n1;
15366 
15367 	tmp = inv_diag_E[1];
15368 	CC[0+bs*1] *= tmp;
15369 	CC[1+bs*1] *= tmp;
15370 	CC[2+bs*1] *= tmp;
15371 	CC[3+bs*1] *= tmp;
15372 	tmp = E[1+lde*0];
15373 	CC[0+bs*0] -= CC[0+bs*1] * tmp;
15374 	CC[1+bs*0] -= CC[1+bs*1] * tmp;
15375 	CC[2+bs*0] -= CC[2+bs*1] * tmp;
15376 	CC[3+bs*0] -= CC[3+bs*1] * tmp;
15377 
15378 n1:
15379 
15380 	tmp = inv_diag_E[0];
15381 	CC[0+bs*0] *= tmp;
15382 	CC[1+bs*0] *= tmp;
15383 	CC[2+bs*0] *= tmp;
15384 	CC[3+bs*0] *= tmp;
15385 
15386 	store:
15387 
15388 	if(m1>=4)
15389 		{
15390 		D[0+bs*0] = CC[0+bs*0];
15391 		D[1+bs*0] = CC[1+bs*0];
15392 		D[2+bs*0] = CC[2+bs*0];
15393 		D[3+bs*0] = CC[3+bs*0];
15394 
15395 		if(n1==1)
15396 			return;
15397 
15398 		D[0+bs*1] = CC[0+bs*1];
15399 		D[1+bs*1] = CC[1+bs*1];
15400 		D[2+bs*1] = CC[2+bs*1];
15401 		D[3+bs*1] = CC[3+bs*1];
15402 
15403 		if(n1==2)
15404 			return;
15405 
15406 		D[0+bs*2] = CC[0+bs*2];
15407 		D[1+bs*2] = CC[1+bs*2];
15408 		D[2+bs*2] = CC[2+bs*2];
15409 		D[3+bs*2] = CC[3+bs*2];
15410 
15411 		if(n1==3)
15412 			return;
15413 
15414 		D[0+bs*3] = CC[0+bs*3];
15415 		D[1+bs*3] = CC[1+bs*3];
15416 		D[2+bs*3] = CC[2+bs*3];
15417 		D[3+bs*3] = CC[3+bs*3];
15418 		}
15419 	else if(m1>=3)
15420 		{
15421 		D[0+bs*0] = CC[0+bs*0];
15422 		D[1+bs*0] = CC[1+bs*0];
15423 		D[2+bs*0] = CC[2+bs*0];
15424 
15425 		if(n1==1)
15426 			return;
15427 
15428 		D[0+bs*1] = CC[0+bs*1];
15429 		D[1+bs*1] = CC[1+bs*1];
15430 		D[2+bs*1] = CC[2+bs*1];
15431 
15432 		if(n1==2)
15433 			return;
15434 
15435 		D[0+bs*2] = CC[0+bs*2];
15436 		D[1+bs*2] = CC[1+bs*2];
15437 		D[2+bs*2] = CC[2+bs*2];
15438 
15439 		if(n1==3)
15440 			return;
15441 
15442 		D[0+bs*3] = CC[0+bs*3];
15443 		D[1+bs*3] = CC[1+bs*3];
15444 		D[2+bs*3] = CC[2+bs*3];
15445 		}
15446 	else if(m1>=2)
15447 		{
15448 		D[0+bs*0] = CC[0+bs*0];
15449 		D[1+bs*0] = CC[1+bs*0];
15450 
15451 		if(n1==1)
15452 			return;
15453 
15454 		D[0+bs*1] = CC[0+bs*1];
15455 		D[1+bs*1] = CC[1+bs*1];
15456 
15457 		if(n1==2)
15458 			return;
15459 
15460 		D[0+bs*2] = CC[0+bs*2];
15461 		D[1+bs*2] = CC[1+bs*2];
15462 
15463 		if(n1==3)
15464 			return;
15465 
15466 		D[0+bs*3] = CC[0+bs*3];
15467 		D[1+bs*3] = CC[1+bs*3];
15468 		}
15469 	else //if(m1>=1)
15470 		{
15471 		D[0+bs*0] = CC[0+bs*0];
15472 
15473 		if(n1==1)
15474 			return;
15475 
15476 		D[0+bs*1] = CC[0+bs*1];
15477 
15478 		if(n1==2)
15479 			return;
15480 
15481 		D[0+bs*2] = CC[0+bs*2];
15482 
15483 		if(n1==3)
15484 			return;
15485 
15486 		D[0+bs*3] = CC[0+bs*3];
15487 		}
15488 
15489 	return;
15490 
15491 	}
15492 #endif
15493 
15494 
15495 
15496 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
void kernel_dtrsm_nn_rl_inv_4x4_lib4cccc(int kmax, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, double *inv_diag_E)
15498 	{
15499 
15500 	const int bs = 4;
15501 
15502 	double tmp;
15503 
15504 #if defined(TARGET_GENERIC)
15505 	double CC[16] = {0};
15506 #else
15507 	ALIGNED( double CC[16], 64 ) = {0};
15508 #endif
15509 
15510 	double alpha1 = -1.0;
15511 
15512 	kernel_dgemm_nn_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, ldc, CC, bs);
15513 
15514 	tmp = inv_diag_E[3];
15515 	CC[0+bs*3] *= tmp;
15516 	CC[1+bs*3] *= tmp;
15517 	CC[2+bs*3] *= tmp;
15518 	CC[3+bs*3] *= tmp;
15519 	tmp = E[3+lde*0];
15520 	CC[0+bs*0] -= CC[0+bs*3] * tmp;
15521 	CC[1+bs*0] -= CC[1+bs*3] * tmp;
15522 	CC[2+bs*0] -= CC[2+bs*3] * tmp;
15523 	CC[3+bs*0] -= CC[3+bs*3] * tmp;
15524 	tmp = E[3+lde*1];
15525 	CC[0+bs*1] -= CC[0+bs*3] * tmp;
15526 	CC[1+bs*1] -= CC[1+bs*3] * tmp;
15527 	CC[2+bs*1] -= CC[2+bs*3] * tmp;
15528 	CC[3+bs*1] -= CC[3+bs*3] * tmp;
15529 	tmp = E[3+lde*2];
15530 	CC[0+bs*2] -= CC[0+bs*3] * tmp;
15531 	CC[1+bs*2] -= CC[1+bs*3] * tmp;
15532 	CC[2+bs*2] -= CC[2+bs*3] * tmp;
15533 	CC[3+bs*2] -= CC[3+bs*3] * tmp;
15534 
15535 	tmp = inv_diag_E[2];
15536 	CC[0+bs*2] *= tmp;
15537 	CC[1+bs*2] *= tmp;
15538 	CC[2+bs*2] *= tmp;
15539 	CC[3+bs*2] *= tmp;
15540 	tmp = E[2+lde*0];
15541 	CC[0+bs*0] -= CC[0+bs*2] * tmp;
15542 	CC[1+bs*0] -= CC[1+bs*2] * tmp;
15543 	CC[2+bs*0] -= CC[2+bs*2] * tmp;
15544 	CC[3+bs*0] -= CC[3+bs*2] * tmp;
15545 	tmp = E[2+lde*1];
15546 	CC[0+bs*1] -= CC[0+bs*2] * tmp;
15547 	CC[1+bs*1] -= CC[1+bs*2] * tmp;
15548 	CC[2+bs*1] -= CC[2+bs*2] * tmp;
15549 	CC[3+bs*1] -= CC[3+bs*2] * tmp;
15550 
15551 	tmp = inv_diag_E[1];
15552 	CC[0+bs*1] *= tmp;
15553 	CC[1+bs*1] *= tmp;
15554 	CC[2+bs*1] *= tmp;
15555 	CC[3+bs*1] *= tmp;
15556 	tmp = E[1+lde*0];
15557 	CC[0+bs*0] -= CC[0+bs*1] * tmp;
15558 	CC[1+bs*0] -= CC[1+bs*1] * tmp;
15559 	CC[2+bs*0] -= CC[2+bs*1] * tmp;
15560 	CC[3+bs*0] -= CC[3+bs*1] * tmp;
15561 
15562 	tmp = inv_diag_E[0];
15563 	CC[0+bs*0] *= tmp;
15564 	CC[1+bs*0] *= tmp;
15565 	CC[2+bs*0] *= tmp;
15566 	CC[3+bs*0] *= tmp;
15567 
15568 
15569 	D[0+ldd*0] = CC[0+bs*0];
15570 	D[1+ldd*0] = CC[1+bs*0];
15571 	D[2+ldd*0] = CC[2+bs*0];
15572 	D[3+ldd*0] = CC[3+bs*0];
15573 
15574 	D[0+ldd*1] = CC[0+bs*1];
15575 	D[1+ldd*1] = CC[1+bs*1];
15576 	D[2+ldd*1] = CC[2+bs*1];
15577 	D[3+ldd*1] = CC[3+bs*1];
15578 
15579 	D[0+ldd*2] = CC[0+bs*2];
15580 	D[1+ldd*2] = CC[1+bs*2];
15581 	D[2+ldd*2] = CC[2+bs*2];
15582 	D[3+ldd*2] = CC[3+bs*2];
15583 
15584 	D[0+ldd*3] = CC[0+bs*3];
15585 	D[1+ldd*3] = CC[1+bs*3];
15586 	D[2+ldd*3] = CC[2+bs*3];
15587 	D[3+ldd*3] = CC[3+bs*3];
15588 
15589 	return;
15590 
15591 	}
15592 #endif
15593 
15594 
15595 
15596 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
void kernel_dtrsm_nn_rl_inv_4x4_vs_lib4cccc(int kmax, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, double *inv_diag_E, int m1, int n1)
15598 	{
15599 
15600 	const int bs = 4;
15601 
15602 	double tmp;
15603 
15604 #if defined(TARGET_GENERIC)
15605 	double CC[16] = {0};
15606 #else
15607 	ALIGNED( double CC[16], 64 ) = {0};
15608 #endif
15609 
15610 	double alpha1 = -1.0;
15611 
15612 	kernel_dgemm_nn_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, ldc, CC, bs);
15613 
15614 	if(n1<=3)
15615 		goto n3;
15616 
15617 	tmp = inv_diag_E[3];
15618 	CC[0+bs*3] *= tmp;
15619 	CC[1+bs*3] *= tmp;
15620 	CC[2+bs*3] *= tmp;
15621 	CC[3+bs*3] *= tmp;
15622 	tmp = E[3+lde*0];
15623 	CC[0+bs*0] -= CC[0+bs*3] * tmp;
15624 	CC[1+bs*0] -= CC[1+bs*3] * tmp;
15625 	CC[2+bs*0] -= CC[2+bs*3] * tmp;
15626 	CC[3+bs*0] -= CC[3+bs*3] * tmp;
15627 	tmp = E[3+lde*1];
15628 	CC[0+bs*1] -= CC[0+bs*3] * tmp;
15629 	CC[1+bs*1] -= CC[1+bs*3] * tmp;
15630 	CC[2+bs*1] -= CC[2+bs*3] * tmp;
15631 	CC[3+bs*1] -= CC[3+bs*3] * tmp;
15632 	tmp = E[3+lde*2];
15633 	CC[0+bs*2] -= CC[0+bs*3] * tmp;
15634 	CC[1+bs*2] -= CC[1+bs*3] * tmp;
15635 	CC[2+bs*2] -= CC[2+bs*3] * tmp;
15636 	CC[3+bs*2] -= CC[3+bs*3] * tmp;
15637 
15638 n3:
15639 	if(n1<=2)
15640 		goto n2;
15641 
15642 	tmp = inv_diag_E[2];
15643 	CC[0+bs*2] *= tmp;
15644 	CC[1+bs*2] *= tmp;
15645 	CC[2+bs*2] *= tmp;
15646 	CC[3+bs*2] *= tmp;
15647 	tmp = E[2+lde*0];
15648 	CC[0+bs*0] -= CC[0+bs*2] * tmp;
15649 	CC[1+bs*0] -= CC[1+bs*2] * tmp;
15650 	CC[2+bs*0] -= CC[2+bs*2] * tmp;
15651 	CC[3+bs*0] -= CC[3+bs*2] * tmp;
15652 	tmp = E[2+lde*1];
15653 	CC[0+bs*1] -= CC[0+bs*2] * tmp;
15654 	CC[1+bs*1] -= CC[1+bs*2] * tmp;
15655 	CC[2+bs*1] -= CC[2+bs*2] * tmp;
15656 	CC[3+bs*1] -= CC[3+bs*2] * tmp;
15657 
15658 n2:
15659 	if(n1<=1)
15660 		goto n1;
15661 
15662 	tmp = inv_diag_E[1];
15663 	CC[0+bs*1] *= tmp;
15664 	CC[1+bs*1] *= tmp;
15665 	CC[2+bs*1] *= tmp;
15666 	CC[3+bs*1] *= tmp;
15667 	tmp = E[1+lde*0];
15668 	CC[0+bs*0] -= CC[0+bs*1] * tmp;
15669 	CC[1+bs*0] -= CC[1+bs*1] * tmp;
15670 	CC[2+bs*0] -= CC[2+bs*1] * tmp;
15671 	CC[3+bs*0] -= CC[3+bs*1] * tmp;
15672 
15673 n1:
15674 
15675 	tmp = inv_diag_E[0];
15676 	CC[0+bs*0] *= tmp;
15677 	CC[1+bs*0] *= tmp;
15678 	CC[2+bs*0] *= tmp;
15679 	CC[3+bs*0] *= tmp;
15680 
15681 	store:
15682 
15683 	if(m1>=4)
15684 		{
15685 		D[0+ldd*0] = CC[0+bs*0];
15686 		D[1+ldd*0] = CC[1+bs*0];
15687 		D[2+ldd*0] = CC[2+bs*0];
15688 		D[3+ldd*0] = CC[3+bs*0];
15689 
15690 		if(n1==1)
15691 			return;
15692 
15693 		D[0+ldd*1] = CC[0+bs*1];
15694 		D[1+ldd*1] = CC[1+bs*1];
15695 		D[2+ldd*1] = CC[2+bs*1];
15696 		D[3+ldd*1] = CC[3+bs*1];
15697 
15698 		if(n1==2)
15699 			return;
15700 
15701 		D[0+ldd*2] = CC[0+bs*2];
15702 		D[1+ldd*2] = CC[1+bs*2];
15703 		D[2+ldd*2] = CC[2+bs*2];
15704 		D[3+ldd*2] = CC[3+bs*2];
15705 
15706 		if(n1==3)
15707 			return;
15708 
15709 		D[0+ldd*3] = CC[0+bs*3];
15710 		D[1+ldd*3] = CC[1+bs*3];
15711 		D[2+ldd*3] = CC[2+bs*3];
15712 		D[3+ldd*3] = CC[3+bs*3];
15713 		}
15714 	else if(m1>=3)
15715 		{
15716 		D[0+ldd*0] = CC[0+bs*0];
15717 		D[1+ldd*0] = CC[1+bs*0];
15718 		D[2+ldd*0] = CC[2+bs*0];
15719 
15720 		if(n1==1)
15721 			return;
15722 
15723 		D[0+ldd*1] = CC[0+bs*1];
15724 		D[1+ldd*1] = CC[1+bs*1];
15725 		D[2+ldd*1] = CC[2+bs*1];
15726 
15727 		if(n1==2)
15728 			return;
15729 
15730 		D[0+ldd*2] = CC[0+bs*2];
15731 		D[1+ldd*2] = CC[1+bs*2];
15732 		D[2+ldd*2] = CC[2+bs*2];
15733 
15734 		if(n1==3)
15735 			return;
15736 
15737 		D[0+ldd*3] = CC[0+bs*3];
15738 		D[1+ldd*3] = CC[1+bs*3];
15739 		D[2+ldd*3] = CC[2+bs*3];
15740 		}
15741 	else if(m1>=2)
15742 		{
15743 		D[0+ldd*0] = CC[0+bs*0];
15744 		D[1+ldd*0] = CC[1+bs*0];
15745 
15746 		if(n1==1)
15747 			return;
15748 
15749 		D[0+ldd*1] = CC[0+bs*1];
15750 		D[1+ldd*1] = CC[1+bs*1];
15751 
15752 		if(n1==2)
15753 			return;
15754 
15755 		D[0+ldd*2] = CC[0+bs*2];
15756 		D[1+ldd*2] = CC[1+bs*2];
15757 
15758 		if(n1==3)
15759 			return;
15760 
15761 		D[0+ldd*3] = CC[0+bs*3];
15762 		D[1+ldd*3] = CC[1+bs*3];
15763 		}
15764 	else //if(m1>=1)
15765 		{
15766 		D[0+ldd*0] = CC[0+bs*0];
15767 
15768 		if(n1==1)
15769 			return;
15770 
15771 		D[0+ldd*1] = CC[0+bs*1];
15772 
15773 		if(n1==2)
15774 			return;
15775 
15776 		D[0+ldd*2] = CC[0+bs*2];
15777 
15778 		if(n1==3)
15779 			return;
15780 
15781 		D[0+ldd*3] = CC[0+bs*3];
15782 		}
15783 
15784 	return;
15785 
15786 	}
15787 #endif
15788 
15789 
15790 
15791 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
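// dtrsm_nn_rl_one kernel: computes D = ( beta*C - A*B ) * E^{-1}, with E unit lower triangular (diagonal not accessed); C and D are 4x4 blocks with leading dimension bs=4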
void kernel_dtrsm_nn_rl_one_4x4_lib4c44c(int kmax, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde)
15793 	{
15794 
15795 	const int bs = 4;
15796 
15797 	double tmp;
15798 
15799 #if defined(TARGET_GENERIC)
15800 	double CC[16] = {0};
15801 #else
15802 	ALIGNED( double CC[16], 64 ) = {0};
15803 #endif
15804 
15805 	double alpha1 = -1.0;
15806 
15807 	kernel_dgemm_nn_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, bs, CC, bs);
15808 
15809 	tmp = E[3+lde*0];
15810 	CC[0+bs*0] -= CC[0+bs*3] * tmp;
15811 	CC[1+bs*0] -= CC[1+bs*3] * tmp;
15812 	CC[2+bs*0] -= CC[2+bs*3] * tmp;
15813 	CC[3+bs*0] -= CC[3+bs*3] * tmp;
15814 	tmp = E[3+lde*1];
15815 	CC[0+bs*1] -= CC[0+bs*3] * tmp;
15816 	CC[1+bs*1] -= CC[1+bs*3] * tmp;
15817 	CC[2+bs*1] -= CC[2+bs*3] * tmp;
15818 	CC[3+bs*1] -= CC[3+bs*3] * tmp;
15819 	tmp = E[3+lde*2];
15820 	CC[0+bs*2] -= CC[0+bs*3] * tmp;
15821 	CC[1+bs*2] -= CC[1+bs*3] * tmp;
15822 	CC[2+bs*2] -= CC[2+bs*3] * tmp;
15823 	CC[3+bs*2] -= CC[3+bs*3] * tmp;
15824 
15825 	tmp = E[2+lde*0];
15826 	CC[0+bs*0] -= CC[0+bs*2] * tmp;
15827 	CC[1+bs*0] -= CC[1+bs*2] * tmp;
15828 	CC[2+bs*0] -= CC[2+bs*2] * tmp;
15829 	CC[3+bs*0] -= CC[3+bs*2] * tmp;
15830 	tmp = E[2+lde*1];
15831 	CC[0+bs*1] -= CC[0+bs*2] * tmp;
15832 	CC[1+bs*1] -= CC[1+bs*2] * tmp;
15833 	CC[2+bs*1] -= CC[2+bs*2] * tmp;
15834 	CC[3+bs*1] -= CC[3+bs*2] * tmp;
15835 
15836 	tmp = E[1+lde*0];
15837 	CC[0+bs*0] -= CC[0+bs*1] * tmp;
15838 	CC[1+bs*0] -= CC[1+bs*1] * tmp;
15839 	CC[2+bs*0] -= CC[2+bs*1] * tmp;
15840 	CC[3+bs*0] -= CC[3+bs*1] * tmp;
15841 
15842 
15843 	D[0+bs*0] = CC[0+bs*0];
15844 	D[1+bs*0] = CC[1+bs*0];
15845 	D[2+bs*0] = CC[2+bs*0];
15846 	D[3+bs*0] = CC[3+bs*0];
15847 
15848 	D[0+bs*1] = CC[0+bs*1];
15849 	D[1+bs*1] = CC[1+bs*1];
15850 	D[2+bs*1] = CC[2+bs*1];
15851 	D[3+bs*1] = CC[3+bs*1];
15852 
15853 	D[0+bs*2] = CC[0+bs*2];
15854 	D[1+bs*2] = CC[1+bs*2];
15855 	D[2+bs*2] = CC[2+bs*2];
15856 	D[3+bs*2] = CC[3+bs*2];
15857 
15858 	D[0+bs*3] = CC[0+bs*3];
15859 	D[1+bs*3] = CC[1+bs*3];
15860 	D[2+bs*3] = CC[2+bs*3];
15861 	D[3+bs*3] = CC[3+bs*3];
15862 
15863 	return;
15864 
15865 	}
15866 #endif
15867 
15868 
15869 
15870 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
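// variable-size variant of the kernel above: same operation, but the triangular solve skips columns beyond n1 and only the leading m1 x n1 block of D is stored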
void kernel_dtrsm_nn_rl_one_4x4_vs_lib4c44c(int kmax, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde, int m1, int n1)
15872 	{
15873 
15874 	const int bs = 4;
15875 
15876 	double tmp;
15877 
15878 #if defined(TARGET_GENERIC)
15879 	double CC[16] = {0};
15880 #else
15881 	ALIGNED( double CC[16], 64 ) = {0};
15882 #endif
15883 
15884 	double alpha1 = -1.0;
15885 
15886 	kernel_dgemm_nn_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, bs, CC, bs);
15887 
15888 	if(n1<=3)
15889 		goto n3;
15890 
15891 	tmp = E[3+lde*0];
15892 	CC[0+bs*0] -= CC[0+bs*3] * tmp;
15893 	CC[1+bs*0] -= CC[1+bs*3] * tmp;
15894 	CC[2+bs*0] -= CC[2+bs*3] * tmp;
15895 	CC[3+bs*0] -= CC[3+bs*3] * tmp;
15896 	tmp = E[3+lde*1];
15897 	CC[0+bs*1] -= CC[0+bs*3] * tmp;
15898 	CC[1+bs*1] -= CC[1+bs*3] * tmp;
15899 	CC[2+bs*1] -= CC[2+bs*3] * tmp;
15900 	CC[3+bs*1] -= CC[3+bs*3] * tmp;
15901 	tmp = E[3+lde*2];
15902 	CC[0+bs*2] -= CC[0+bs*3] * tmp;
15903 	CC[1+bs*2] -= CC[1+bs*3] * tmp;
15904 	CC[2+bs*2] -= CC[2+bs*3] * tmp;
15905 	CC[3+bs*2] -= CC[3+bs*3] * tmp;
15906 
15907 n3:
15908 	if(n1<=2)
15909 		goto n2;
15910 
15911 	tmp = E[2+lde*0];
15912 	CC[0+bs*0] -= CC[0+bs*2] * tmp;
15913 	CC[1+bs*0] -= CC[1+bs*2] * tmp;
15914 	CC[2+bs*0] -= CC[2+bs*2] * tmp;
15915 	CC[3+bs*0] -= CC[3+bs*2] * tmp;
15916 	tmp = E[2+lde*1];
15917 	CC[0+bs*1] -= CC[0+bs*2] * tmp;
15918 	CC[1+bs*1] -= CC[1+bs*2] * tmp;
15919 	CC[2+bs*1] -= CC[2+bs*2] * tmp;
15920 	CC[3+bs*1] -= CC[3+bs*2] * tmp;
15921 
15922 n2:
15923 	if(n1<=1)
15924 		goto n1;
15925 
15926 	tmp = E[1+lde*0];
15927 	CC[0+bs*0] -= CC[0+bs*1] * tmp;
15928 	CC[1+bs*0] -= CC[1+bs*1] * tmp;
15929 	CC[2+bs*0] -= CC[2+bs*1] * tmp;
15930 	CC[3+bs*0] -= CC[3+bs*1] * tmp;
15931 
15932 n1:
15933 
15934 	store:
15935 
15936 	if(m1>=4)
15937 		{
15938 		D[0+bs*0] = CC[0+bs*0];
15939 		D[1+bs*0] = CC[1+bs*0];
15940 		D[2+bs*0] = CC[2+bs*0];
15941 		D[3+bs*0] = CC[3+bs*0];
15942 
15943 		if(n1==1)
15944 			return;
15945 
15946 		D[0+bs*1] = CC[0+bs*1];
15947 		D[1+bs*1] = CC[1+bs*1];
15948 		D[2+bs*1] = CC[2+bs*1];
15949 		D[3+bs*1] = CC[3+bs*1];
15950 
15951 		if(n1==2)
15952 			return;
15953 
15954 		D[0+bs*2] = CC[0+bs*2];
15955 		D[1+bs*2] = CC[1+bs*2];
15956 		D[2+bs*2] = CC[2+bs*2];
15957 		D[3+bs*2] = CC[3+bs*2];
15958 
15959 		if(n1==3)
15960 			return;
15961 
15962 		D[0+bs*3] = CC[0+bs*3];
15963 		D[1+bs*3] = CC[1+bs*3];
15964 		D[2+bs*3] = CC[2+bs*3];
15965 		D[3+bs*3] = CC[3+bs*3];
15966 		}
15967 	else if(m1>=3)
15968 		{
15969 		D[0+bs*0] = CC[0+bs*0];
15970 		D[1+bs*0] = CC[1+bs*0];
15971 		D[2+bs*0] = CC[2+bs*0];
15972 
15973 		if(n1==1)
15974 			return;
15975 
15976 		D[0+bs*1] = CC[0+bs*1];
15977 		D[1+bs*1] = CC[1+bs*1];
15978 		D[2+bs*1] = CC[2+bs*1];
15979 
15980 		if(n1==2)
15981 			return;
15982 
15983 		D[0+bs*2] = CC[0+bs*2];
15984 		D[1+bs*2] = CC[1+bs*2];
15985 		D[2+bs*2] = CC[2+bs*2];
15986 
15987 		if(n1==3)
15988 			return;
15989 
15990 		D[0+bs*3] = CC[0+bs*3];
15991 		D[1+bs*3] = CC[1+bs*3];
15992 		D[2+bs*3] = CC[2+bs*3];
15993 		}
15994 	else if(m1>=2)
15995 		{
15996 		D[0+bs*0] = CC[0+bs*0];
15997 		D[1+bs*0] = CC[1+bs*0];
15998 
15999 		if(n1==1)
16000 			return;
16001 
16002 		D[0+bs*1] = CC[0+bs*1];
16003 		D[1+bs*1] = CC[1+bs*1];
16004 
16005 		if(n1==2)
16006 			return;
16007 
16008 		D[0+bs*2] = CC[0+bs*2];
16009 		D[1+bs*2] = CC[1+bs*2];
16010 
16011 		if(n1==3)
16012 			return;
16013 
16014 		D[0+bs*3] = CC[0+bs*3];
16015 		D[1+bs*3] = CC[1+bs*3];
16016 		}
16017 	else //if(m1>=1)
16018 		{
16019 		D[0+bs*0] = CC[0+bs*0];
16020 
16021 		if(n1==1)
16022 			return;
16023 
16024 		D[0+bs*1] = CC[0+bs*1];
16025 
16026 		if(n1==2)
16027 			return;
16028 
16029 		D[0+bs*2] = CC[0+bs*2];
16030 
16031 		if(n1==3)
16032 			return;
16033 
16034 		D[0+bs*3] = CC[0+bs*3];
16035 		}
16036 
16037 	return;
16038 
16039 	}
16040 #endif
16041 
16042 
16043 
16044 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
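// dtrsm_nn_rl_one kernel: computes D = ( beta*C - A*B ) * E^{-1}, with E unit lower triangular; B, C, D and E are column-major with leading dimensions ldb, ldc, ldd and lde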
void kernel_dtrsm_nn_rl_one_4x4_lib4cccc(int kmax, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde)
16046 	{
16047 
16048 	const int bs = 4;
16049 
16050 	double tmp;
16051 
16052 #if defined(TARGET_GENERIC)
16053 	double CC[16] = {0};
16054 #else
16055 	ALIGNED( double CC[16], 64 ) = {0};
16056 #endif
16057 
16058 	double alpha1 = -1.0;
16059 
16060 	kernel_dgemm_nn_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, ldc, CC, bs);
16061 
16062 	tmp = E[3+lde*0];
16063 	CC[0+bs*0] -= CC[0+bs*3] * tmp;
16064 	CC[1+bs*0] -= CC[1+bs*3] * tmp;
16065 	CC[2+bs*0] -= CC[2+bs*3] * tmp;
16066 	CC[3+bs*0] -= CC[3+bs*3] * tmp;
16067 	tmp = E[3+lde*1];
16068 	CC[0+bs*1] -= CC[0+bs*3] * tmp;
16069 	CC[1+bs*1] -= CC[1+bs*3] * tmp;
16070 	CC[2+bs*1] -= CC[2+bs*3] * tmp;
16071 	CC[3+bs*1] -= CC[3+bs*3] * tmp;
16072 	tmp = E[3+lde*2];
16073 	CC[0+bs*2] -= CC[0+bs*3] * tmp;
16074 	CC[1+bs*2] -= CC[1+bs*3] * tmp;
16075 	CC[2+bs*2] -= CC[2+bs*3] * tmp;
16076 	CC[3+bs*2] -= CC[3+bs*3] * tmp;
16077 
16078 	tmp = E[2+lde*0];
16079 	CC[0+bs*0] -= CC[0+bs*2] * tmp;
16080 	CC[1+bs*0] -= CC[1+bs*2] * tmp;
16081 	CC[2+bs*0] -= CC[2+bs*2] * tmp;
16082 	CC[3+bs*0] -= CC[3+bs*2] * tmp;
16083 	tmp = E[2+lde*1];
16084 	CC[0+bs*1] -= CC[0+bs*2] * tmp;
16085 	CC[1+bs*1] -= CC[1+bs*2] * tmp;
16086 	CC[2+bs*1] -= CC[2+bs*2] * tmp;
16087 	CC[3+bs*1] -= CC[3+bs*2] * tmp;
16088 
16089 	tmp = E[1+lde*0];
16090 	CC[0+bs*0] -= CC[0+bs*1] * tmp;
16091 	CC[1+bs*0] -= CC[1+bs*1] * tmp;
16092 	CC[2+bs*0] -= CC[2+bs*1] * tmp;
16093 	CC[3+bs*0] -= CC[3+bs*1] * tmp;
16094 
16095 
16096 	D[0+ldd*0] = CC[0+bs*0];
16097 	D[1+ldd*0] = CC[1+bs*0];
16098 	D[2+ldd*0] = CC[2+bs*0];
16099 	D[3+ldd*0] = CC[3+bs*0];
16100 
16101 	D[0+ldd*1] = CC[0+bs*1];
16102 	D[1+ldd*1] = CC[1+bs*1];
16103 	D[2+ldd*1] = CC[2+bs*1];
16104 	D[3+ldd*1] = CC[3+bs*1];
16105 
16106 	D[0+ldd*2] = CC[0+bs*2];
16107 	D[1+ldd*2] = CC[1+bs*2];
16108 	D[2+ldd*2] = CC[2+bs*2];
16109 	D[3+ldd*2] = CC[3+bs*2];
16110 
16111 	D[0+ldd*3] = CC[0+bs*3];
16112 	D[1+ldd*3] = CC[1+bs*3];
16113 	D[2+ldd*3] = CC[2+bs*3];
16114 	D[3+ldd*3] = CC[3+bs*3];
16115 
16116 	return;
16117 
16118 	}
16119 #endif
16120 
16121 
16122 
16123 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
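// variable-size variant of the kernel above: the triangular solve skips columns beyond n1 and only the leading m1 x n1 block of D is stored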
void kernel_dtrsm_nn_rl_one_4x4_vs_lib4cccc(int kmax, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, int m1, int n1)
16125 	{
16126 
16127 	const int bs = 4;
16128 
16129 	double tmp;
16130 
16131 #if defined(TARGET_GENERIC)
16132 	double CC[16] = {0};
16133 #else
16134 	ALIGNED( double CC[16], 64 ) = {0};
16135 #endif
16136 
16137 	double alpha1 = -1.0;
16138 
16139 	kernel_dgemm_nn_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, ldc, CC, bs);
16140 
16141 	if(n1<=3)
16142 		goto n3;
16143 
16144 	tmp = E[3+lde*0];
16145 	CC[0+bs*0] -= CC[0+bs*3] * tmp;
16146 	CC[1+bs*0] -= CC[1+bs*3] * tmp;
16147 	CC[2+bs*0] -= CC[2+bs*3] * tmp;
16148 	CC[3+bs*0] -= CC[3+bs*3] * tmp;
16149 	tmp = E[3+lde*1];
16150 	CC[0+bs*1] -= CC[0+bs*3] * tmp;
16151 	CC[1+bs*1] -= CC[1+bs*3] * tmp;
16152 	CC[2+bs*1] -= CC[2+bs*3] * tmp;
16153 	CC[3+bs*1] -= CC[3+bs*3] * tmp;
16154 	tmp = E[3+lde*2];
16155 	CC[0+bs*2] -= CC[0+bs*3] * tmp;
16156 	CC[1+bs*2] -= CC[1+bs*3] * tmp;
16157 	CC[2+bs*2] -= CC[2+bs*3] * tmp;
16158 	CC[3+bs*2] -= CC[3+bs*3] * tmp;
16159 
16160 n3:
16161 	if(n1<=2)
16162 		goto n2;
16163 
16164 	tmp = E[2+lde*0];
16165 	CC[0+bs*0] -= CC[0+bs*2] * tmp;
16166 	CC[1+bs*0] -= CC[1+bs*2] * tmp;
16167 	CC[2+bs*0] -= CC[2+bs*2] * tmp;
16168 	CC[3+bs*0] -= CC[3+bs*2] * tmp;
16169 	tmp = E[2+lde*1];
16170 	CC[0+bs*1] -= CC[0+bs*2] * tmp;
16171 	CC[1+bs*1] -= CC[1+bs*2] * tmp;
16172 	CC[2+bs*1] -= CC[2+bs*2] * tmp;
16173 	CC[3+bs*1] -= CC[3+bs*2] * tmp;
16174 
16175 n2:
16176 	if(n1<=1)
16177 		goto n1;
16178 
16179 	tmp = E[1+lde*0];
16180 	CC[0+bs*0] -= CC[0+bs*1] * tmp;
16181 	CC[1+bs*0] -= CC[1+bs*1] * tmp;
16182 	CC[2+bs*0] -= CC[2+bs*1] * tmp;
16183 	CC[3+bs*0] -= CC[3+bs*1] * tmp;
16184 
16185 n1:
16186 
16187 	store:
16188 
16189 	if(m1>=4)
16190 		{
16191 		D[0+ldd*0] = CC[0+bs*0];
16192 		D[1+ldd*0] = CC[1+bs*0];
16193 		D[2+ldd*0] = CC[2+bs*0];
16194 		D[3+ldd*0] = CC[3+bs*0];
16195 
16196 		if(n1==1)
16197 			return;
16198 
16199 		D[0+ldd*1] = CC[0+bs*1];
16200 		D[1+ldd*1] = CC[1+bs*1];
16201 		D[2+ldd*1] = CC[2+bs*1];
16202 		D[3+ldd*1] = CC[3+bs*1];
16203 
16204 		if(n1==2)
16205 			return;
16206 
16207 		D[0+ldd*2] = CC[0+bs*2];
16208 		D[1+ldd*2] = CC[1+bs*2];
16209 		D[2+ldd*2] = CC[2+bs*2];
16210 		D[3+ldd*2] = CC[3+bs*2];
16211 
16212 		if(n1==3)
16213 			return;
16214 
16215 		D[0+ldd*3] = CC[0+bs*3];
16216 		D[1+ldd*3] = CC[1+bs*3];
16217 		D[2+ldd*3] = CC[2+bs*3];
16218 		D[3+ldd*3] = CC[3+bs*3];
16219 		}
16220 	else if(m1>=3)
16221 		{
16222 		D[0+ldd*0] = CC[0+bs*0];
16223 		D[1+ldd*0] = CC[1+bs*0];
16224 		D[2+ldd*0] = CC[2+bs*0];
16225 
16226 		if(n1==1)
16227 			return;
16228 
16229 		D[0+ldd*1] = CC[0+bs*1];
16230 		D[1+ldd*1] = CC[1+bs*1];
16231 		D[2+ldd*1] = CC[2+bs*1];
16232 
16233 		if(n1==2)
16234 			return;
16235 
16236 		D[0+ldd*2] = CC[0+bs*2];
16237 		D[1+ldd*2] = CC[1+bs*2];
16238 		D[2+ldd*2] = CC[2+bs*2];
16239 
16240 		if(n1==3)
16241 			return;
16242 
16243 		D[0+ldd*3] = CC[0+bs*3];
16244 		D[1+ldd*3] = CC[1+bs*3];
16245 		D[2+ldd*3] = CC[2+bs*3];
16246 		}
16247 	else if(m1>=2)
16248 		{
16249 		D[0+ldd*0] = CC[0+bs*0];
16250 		D[1+ldd*0] = CC[1+bs*0];
16251 
16252 		if(n1==1)
16253 			return;
16254 
16255 		D[0+ldd*1] = CC[0+bs*1];
16256 		D[1+ldd*1] = CC[1+bs*1];
16257 
16258 		if(n1==2)
16259 			return;
16260 
16261 		D[0+ldd*2] = CC[0+bs*2];
16262 		D[1+ldd*2] = CC[1+bs*2];
16263 
16264 		if(n1==3)
16265 			return;
16266 
16267 		D[0+ldd*3] = CC[0+bs*3];
16268 		D[1+ldd*3] = CC[1+bs*3];
16269 		}
16270 	else //if(m1>=1)
16271 		{
16272 		D[0+ldd*0] = CC[0+bs*0];
16273 
16274 		if(n1==1)
16275 			return;
16276 
16277 		D[0+ldd*1] = CC[0+bs*1];
16278 
16279 		if(n1==2)
16280 			return;
16281 
16282 		D[0+ldd*2] = CC[0+bs*2];
16283 
16284 		if(n1==3)
16285 			return;
16286 
16287 		D[0+ldd*3] = CC[0+bs*3];
16288 		}
16289 
16290 	return;
16291 
16292 	}
16293 #endif
16294 
16295 
16296 
16297 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
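// dtrsm_nt_rl_inv kernel: computes D = ( beta*C - A*B^T ) * E^{-T}, with E lower triangular (inverted diagonal in inv_diag_E); A, B and E are panel-major (lib4), C and D are column-major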
void kernel_dtrsm_nt_rl_inv_4x4_lib44cc4(int kmax, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, double *E, double *inv_diag_E)
16299 	{
16300 
16301 	const int bs = 4;
16302 
16303 	double tmp;
16304 
16305 #if defined(TARGET_GENERIC)
16306 	double CC[16] = {0};
16307 #else
16308 	ALIGNED( double CC[16], 64 ) = {0};
16309 #endif
16310 
16311 	double alpha1 = -1.0;
16312 
16313 	CC[0+bs*0] = C[0+ldc*0];
16314 	CC[1+bs*0] = C[1+ldc*0];
16315 	CC[2+bs*0] = C[2+ldc*0];
16316 	CC[3+bs*0] = C[3+ldc*0];
16317 
16318 	CC[0+bs*1] = C[0+ldc*1];
16319 	CC[1+bs*1] = C[1+ldc*1];
16320 	CC[2+bs*1] = C[2+ldc*1];
16321 	CC[3+bs*1] = C[3+ldc*1];
16322 
16323 	CC[0+bs*2] = C[0+ldc*2];
16324 	CC[1+bs*2] = C[1+ldc*2];
16325 	CC[2+bs*2] = C[2+ldc*2];
16326 	CC[3+bs*2] = C[3+ldc*2];
16327 
16328 	CC[0+bs*3] = C[0+ldc*3];
16329 	CC[1+bs*3] = C[1+ldc*3];
16330 	CC[2+bs*3] = C[2+ldc*3];
16331 	CC[3+bs*3] = C[3+ldc*3];
16332 
16333 	kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, CC, CC);
16334 
16335 	tmp = inv_diag_E[0];
16336 	CC[0+bs*0] *= tmp;
16337 	CC[1+bs*0] *= tmp;
16338 	CC[2+bs*0] *= tmp;
16339 	CC[3+bs*0] *= tmp;
16340 
16341 	tmp = E[1+bs*0];
16342 	CC[0+bs*1] -= CC[0+bs*0] * tmp;
16343 	CC[1+bs*1] -= CC[1+bs*0] * tmp;
16344 	CC[2+bs*1] -= CC[2+bs*0] * tmp;
16345 	CC[3+bs*1] -= CC[3+bs*0] * tmp;
16346 	tmp = inv_diag_E[1];
16347 	CC[0+bs*1] *= tmp;
16348 	CC[1+bs*1] *= tmp;
16349 	CC[2+bs*1] *= tmp;
16350 	CC[3+bs*1] *= tmp;
16351 
16352 	tmp = E[2+bs*0];
16353 	CC[0+bs*2] -= CC[0+bs*0] * tmp;
16354 	CC[1+bs*2] -= CC[1+bs*0] * tmp;
16355 	CC[2+bs*2] -= CC[2+bs*0] * tmp;
16356 	CC[3+bs*2] -= CC[3+bs*0] * tmp;
16357 	tmp = E[2+bs*1];
16358 	CC[0+bs*2] -= CC[0+bs*1] * tmp;
16359 	CC[1+bs*2] -= CC[1+bs*1] * tmp;
16360 	CC[2+bs*2] -= CC[2+bs*1] * tmp;
16361 	CC[3+bs*2] -= CC[3+bs*1] * tmp;
16362 	tmp = inv_diag_E[2];
16363 	CC[0+bs*2] *= tmp;
16364 	CC[1+bs*2] *= tmp;
16365 	CC[2+bs*2] *= tmp;
16366 	CC[3+bs*2] *= tmp;
16367 
16368 	tmp = E[3+bs*0];
16369 	CC[0+bs*3] -= CC[0+bs*0] * tmp;
16370 	CC[1+bs*3] -= CC[1+bs*0] * tmp;
16371 	CC[2+bs*3] -= CC[2+bs*0] * tmp;
16372 	CC[3+bs*3] -= CC[3+bs*0] * tmp;
16373 	tmp = E[3+bs*1];
16374 	CC[0+bs*3] -= CC[0+bs*1] * tmp;
16375 	CC[1+bs*3] -= CC[1+bs*1] * tmp;
16376 	CC[2+bs*3] -= CC[2+bs*1] * tmp;
16377 	CC[3+bs*3] -= CC[3+bs*1] * tmp;
16378 	tmp = E[3+bs*2];
16379 	CC[0+bs*3] -= CC[0+bs*2] * tmp;
16380 	CC[1+bs*3] -= CC[1+bs*2] * tmp;
16381 	CC[2+bs*3] -= CC[2+bs*2] * tmp;
16382 	CC[3+bs*3] -= CC[3+bs*2] * tmp;
16383 	tmp = inv_diag_E[3];
16384 	CC[0+bs*3] *= tmp;
16385 	CC[1+bs*3] *= tmp;
16386 	CC[2+bs*3] *= tmp;
16387 	CC[3+bs*3] *= tmp;
16388 
16389 	D[0+ldd*0] = CC[0+bs*0];
16390 	D[1+ldd*0] = CC[1+bs*0];
16391 	D[2+ldd*0] = CC[2+bs*0];
16392 	D[3+ldd*0] = CC[3+bs*0];
16393 
16394 	D[0+ldd*1] = CC[0+bs*1];
16395 	D[1+ldd*1] = CC[1+bs*1];
16396 	D[2+ldd*1] = CC[2+bs*1];
16397 	D[3+ldd*1] = CC[3+bs*1];
16398 
16399 	D[0+ldd*2] = CC[0+bs*2];
16400 	D[1+ldd*2] = CC[1+bs*2];
16401 	D[2+ldd*2] = CC[2+bs*2];
16402 	D[3+ldd*2] = CC[3+bs*2];
16403 
16404 	D[0+ldd*3] = CC[0+bs*3];
16405 	D[1+ldd*3] = CC[1+bs*3];
16406 	D[2+ldd*3] = CC[2+bs*3];
16407 	D[3+ldd*3] = CC[3+bs*3];
16408 
16409 	return;
16410 
16411 	}
16412 #endif
16413 
16414 
16415 
16416 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
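// variable-size variant of the kernel above: loads and stores only the leading m1 x n1 block of C and D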
void kernel_dtrsm_nt_rl_inv_4x4_vs_lib44cc4(int kmax, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, double *E, double *inv_diag_E, int m1, int n1)
16418 	{
16419 
16420 	const int bs = 4;
16421 
16422 	double tmp;
16423 
16424 #if defined(TARGET_GENERIC)
16425 	double CC[16] = {0};
16426 #else
16427 	ALIGNED( double CC[16], 64 ) = {0};
16428 #endif
16429 
16430 	double alpha1 = -1.0;
16431 
16432 	if(m1>=4)
16433 		{
16434 		CC[0+bs*0] = C[0+ldc*0];
16435 		CC[1+bs*0] = C[1+ldc*0];
16436 		CC[2+bs*0] = C[2+ldc*0];
16437 		CC[3+bs*0] = C[3+ldc*0];
16438 
16439 		if(n1==1)
16440 			goto kernel;
16441 
16442 		CC[0+bs*1] = C[0+ldc*1];
16443 		CC[1+bs*1] = C[1+ldc*1];
16444 		CC[2+bs*1] = C[2+ldc*1];
16445 		CC[3+bs*1] = C[3+ldc*1];
16446 
16447 		if(n1==2)
16448 			goto kernel;
16449 
16450 		CC[0+bs*2] = C[0+ldc*2];
16451 		CC[1+bs*2] = C[1+ldc*2];
16452 		CC[2+bs*2] = C[2+ldc*2];
16453 		CC[3+bs*2] = C[3+ldc*2];
16454 
16455 		if(n1==3)
16456 			goto kernel;
16457 
16458 		CC[0+bs*3] = C[0+ldc*3];
16459 		CC[1+bs*3] = C[1+ldc*3];
16460 		CC[2+bs*3] = C[2+ldc*3];
16461 		CC[3+bs*3] = C[3+ldc*3];
16462 		}
16463 	else if(m1>=3)
16464 		{
16465 		CC[0+bs*0] = C[0+ldc*0];
16466 		CC[1+bs*0] = C[1+ldc*0];
16467 		CC[2+bs*0] = C[2+ldc*0];
16468 
16469 		if(n1==1)
16470 			goto kernel;
16471 
16472 		CC[0+bs*1] = C[0+ldc*1];
16473 		CC[1+bs*1] = C[1+ldc*1];
16474 		CC[2+bs*1] = C[2+ldc*1];
16475 
16476 		if(n1==2)
16477 			goto kernel;
16478 
16479 		CC[0+bs*2] = C[0+ldc*2];
16480 		CC[1+bs*2] = C[1+ldc*2];
16481 		CC[2+bs*2] = C[2+ldc*2];
16482 
16483 		if(n1==3)
16484 			goto kernel;
16485 
16486 		CC[0+bs*3] = C[0+ldc*3];
16487 		CC[1+bs*3] = C[1+ldc*3];
16488 		CC[2+bs*3] = C[2+ldc*3];
16489 		}
16490 	else if(m1>=2)
16491 		{
16492 		CC[0+bs*0] = C[0+ldc*0];
16493 		CC[1+bs*0] = C[1+ldc*0];
16494 
16495 		if(n1==1)
16496 			goto kernel;
16497 
16498 		CC[0+bs*1] = C[0+ldc*1];
16499 		CC[1+bs*1] = C[1+ldc*1];
16500 
16501 		if(n1==2)
16502 			goto kernel;
16503 
16504 		CC[0+bs*2] = C[0+ldc*2];
16505 		CC[1+bs*2] = C[1+ldc*2];
16506 
16507 		if(n1==3)
16508 			goto kernel;
16509 
16510 		CC[0+bs*3] = C[0+ldc*3];
16511 		CC[1+bs*3] = C[1+ldc*3];
16512 		}
16513 	else //if(m1>=1)
16514 		{
16515 		CC[0+bs*0] = C[0+ldc*0];
16516 
16517 		if(n1==1)
16518 			goto kernel;
16519 
16520 		CC[0+bs*1] = C[0+ldc*1];
16521 
16522 		if(n1==2)
16523 			goto kernel;
16524 
16525 		CC[0+bs*2] = C[0+ldc*2];
16526 
16527 		if(n1==3)
16528 			goto kernel;
16529 
16530 		CC[0+bs*3] = C[0+ldc*3];
16531 		}
16532 
16533 kernel:
16534 	kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, CC, CC);
16535 
16536 	tmp = inv_diag_E[0];
16537 	CC[0+bs*0] *= tmp;
16538 	CC[1+bs*0] *= tmp;
16539 	CC[2+bs*0] *= tmp;
16540 	CC[3+bs*0] *= tmp;
16541 
16542 	if(n1==1)
16543 		goto store;
16544 
16545 	tmp = E[1+bs*0];
16546 	CC[0+bs*1] -= CC[0+bs*0] * tmp;
16547 	CC[1+bs*1] -= CC[1+bs*0] * tmp;
16548 	CC[2+bs*1] -= CC[2+bs*0] * tmp;
16549 	CC[3+bs*1] -= CC[3+bs*0] * tmp;
16550 	tmp = inv_diag_E[1];
16551 	CC[0+bs*1] *= tmp;
16552 	CC[1+bs*1] *= tmp;
16553 	CC[2+bs*1] *= tmp;
16554 	CC[3+bs*1] *= tmp;
16555 
16556 	if(n1==2)
16557 		goto store;
16558 
16559 	tmp = E[2+bs*0];
16560 	CC[0+bs*2] -= CC[0+bs*0] * tmp;
16561 	CC[1+bs*2] -= CC[1+bs*0] * tmp;
16562 	CC[2+bs*2] -= CC[2+bs*0] * tmp;
16563 	CC[3+bs*2] -= CC[3+bs*0] * tmp;
16564 	tmp = E[2+bs*1];
16565 	CC[0+bs*2] -= CC[0+bs*1] * tmp;
16566 	CC[1+bs*2] -= CC[1+bs*1] * tmp;
16567 	CC[2+bs*2] -= CC[2+bs*1] * tmp;
16568 	CC[3+bs*2] -= CC[3+bs*1] * tmp;
16569 	tmp = inv_diag_E[2];
16570 	CC[0+bs*2] *= tmp;
16571 	CC[1+bs*2] *= tmp;
16572 	CC[2+bs*2] *= tmp;
16573 	CC[3+bs*2] *= tmp;
16574 
16575 	if(n1==3)
16576 		goto store;
16577 
16578 	tmp = E[3+bs*0];
16579 	CC[0+bs*3] -= CC[0+bs*0] * tmp;
16580 	CC[1+bs*3] -= CC[1+bs*0] * tmp;
16581 	CC[2+bs*3] -= CC[2+bs*0] * tmp;
16582 	CC[3+bs*3] -= CC[3+bs*0] * tmp;
16583 	tmp = E[3+bs*1];
16584 	CC[0+bs*3] -= CC[0+bs*1] * tmp;
16585 	CC[1+bs*3] -= CC[1+bs*1] * tmp;
16586 	CC[2+bs*3] -= CC[2+bs*1] * tmp;
16587 	CC[3+bs*3] -= CC[3+bs*1] * tmp;
16588 	tmp = E[3+bs*2];
16589 	CC[0+bs*3] -= CC[0+bs*2] * tmp;
16590 	CC[1+bs*3] -= CC[1+bs*2] * tmp;
16591 	CC[2+bs*3] -= CC[2+bs*2] * tmp;
16592 	CC[3+bs*3] -= CC[3+bs*2] * tmp;
16593 	tmp = inv_diag_E[3];
16594 	CC[0+bs*3] *= tmp;
16595 	CC[1+bs*3] *= tmp;
16596 	CC[2+bs*3] *= tmp;
16597 	CC[3+bs*3] *= tmp;
16598 
16599 	store:
16600 
16601 	if(m1>=4)
16602 		{
16603 		D[0+ldd*0] = CC[0+bs*0];
16604 		D[1+ldd*0] = CC[1+bs*0];
16605 		D[2+ldd*0] = CC[2+bs*0];
16606 		D[3+ldd*0] = CC[3+bs*0];
16607 
16608 		if(n1==1)
16609 			return;
16610 
16611 		D[0+ldd*1] = CC[0+bs*1];
16612 		D[1+ldd*1] = CC[1+bs*1];
16613 		D[2+ldd*1] = CC[2+bs*1];
16614 		D[3+ldd*1] = CC[3+bs*1];
16615 
16616 		if(n1==2)
16617 			return;
16618 
16619 		D[0+ldd*2] = CC[0+bs*2];
16620 		D[1+ldd*2] = CC[1+bs*2];
16621 		D[2+ldd*2] = CC[2+bs*2];
16622 		D[3+ldd*2] = CC[3+bs*2];
16623 
16624 		if(n1==3)
16625 			return;
16626 
16627 		D[0+ldd*3] = CC[0+bs*3];
16628 		D[1+ldd*3] = CC[1+bs*3];
16629 		D[2+ldd*3] = CC[2+bs*3];
16630 		D[3+ldd*3] = CC[3+bs*3];
16631 		}
16632 	else if(m1>=3)
16633 		{
16634 		D[0+ldd*0] = CC[0+bs*0];
16635 		D[1+ldd*0] = CC[1+bs*0];
16636 		D[2+ldd*0] = CC[2+bs*0];
16637 
16638 		if(n1==1)
16639 			return;
16640 
16641 		D[0+ldd*1] = CC[0+bs*1];
16642 		D[1+ldd*1] = CC[1+bs*1];
16643 		D[2+ldd*1] = CC[2+bs*1];
16644 
16645 		if(n1==2)
16646 			return;
16647 
16648 		D[0+ldd*2] = CC[0+bs*2];
16649 		D[1+ldd*2] = CC[1+bs*2];
16650 		D[2+ldd*2] = CC[2+bs*2];
16651 
16652 		if(n1==3)
16653 			return;
16654 
16655 		D[0+ldd*3] = CC[0+bs*3];
16656 		D[1+ldd*3] = CC[1+bs*3];
16657 		D[2+ldd*3] = CC[2+bs*3];
16658 		}
16659 	else if(m1>=2)
16660 		{
16661 		D[0+ldd*0] = CC[0+bs*0];
16662 		D[1+ldd*0] = CC[1+bs*0];
16663 
16664 		if(n1==1)
16665 			return;
16666 
16667 		D[0+ldd*1] = CC[0+bs*1];
16668 		D[1+ldd*1] = CC[1+bs*1];
16669 
16670 		if(n1==2)
16671 			return;
16672 
16673 		D[0+ldd*2] = CC[0+bs*2];
16674 		D[1+ldd*2] = CC[1+bs*2];
16675 
16676 		if(n1==3)
16677 			return;
16678 
16679 		D[0+ldd*3] = CC[0+bs*3];
16680 		D[1+ldd*3] = CC[1+bs*3];
16681 		}
16682 	else //if(m1>=1)
16683 		{
16684 		D[0+ldd*0] = CC[0+bs*0];
16685 
16686 		if(n1==1)
16687 			return;
16688 
16689 		D[0+ldd*1] = CC[0+bs*1];
16690 
16691 		if(n1==2)
16692 			return;
16693 
16694 		D[0+ldd*2] = CC[0+bs*2];
16695 
16696 		if(n1==3)
16697 			return;
16698 
16699 		D[0+ldd*3] = CC[0+bs*3];
16700 		}
16701 
16702 	return;
16703 
16704 	}
16705 #endif
16706 
16707 
16708 
16709 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_AMD_BULLDOZER)
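// dtrsm_nt_rl_inv kernel with fixed beta=1: computes D = ( C - A*B^T ) * E^{-T}, with E lower triangular (column-major, lde) and its inverted diagonal in inv_diag_E; A and B are panel-major (lib4), C and D column-major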
void kernel_dtrsm_nt_rl_inv_4x4_lib44ccc(int kmax, double *A, double *B, double *C, int ldc, double *D, int ldd, double *E, int lde, double *inv_diag_E)
16711 	{
16712 
16713 	const int bs = 4;
16714 
16715 	double tmp;
16716 
16717 #if defined(TARGET_GENERIC)
16718 	double CC[16] = {0};
16719 #else
16720 	ALIGNED( double CC[16], 64 ) = {0};
16721 #endif
16722 
16723 	double alpha1 = -1.0;
16724 	double beta1  = 1.0;
16725 
16726 	CC[0+bs*0] = C[0+ldc*0];
16727 	CC[1+bs*0] = C[1+ldc*0];
16728 	CC[2+bs*0] = C[2+ldc*0];
16729 	CC[3+bs*0] = C[3+ldc*0];
16730 
16731 	CC[0+bs*1] = C[0+ldc*1];
16732 	CC[1+bs*1] = C[1+ldc*1];
16733 	CC[2+bs*1] = C[2+ldc*1];
16734 	CC[3+bs*1] = C[3+ldc*1];
16735 
16736 	CC[0+bs*2] = C[0+ldc*2];
16737 	CC[1+bs*2] = C[1+ldc*2];
16738 	CC[2+bs*2] = C[2+ldc*2];
16739 	CC[3+bs*2] = C[3+ldc*2];
16740 
16741 	CC[0+bs*3] = C[0+ldc*3];
16742 	CC[1+bs*3] = C[1+ldc*3];
16743 	CC[2+bs*3] = C[2+ldc*3];
16744 	CC[3+bs*3] = C[3+ldc*3];
16745 
16746 	kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, &beta1, CC, CC);
16747 
16748 	tmp = inv_diag_E[0];
16749 	CC[0+bs*0] *= tmp;
16750 	CC[1+bs*0] *= tmp;
16751 	CC[2+bs*0] *= tmp;
16752 	CC[3+bs*0] *= tmp;
16753 
16754 	tmp = E[1+lde*0];
16755 	CC[0+bs*1] -= CC[0+bs*0] * tmp;
16756 	CC[1+bs*1] -= CC[1+bs*0] * tmp;
16757 	CC[2+bs*1] -= CC[2+bs*0] * tmp;
16758 	CC[3+bs*1] -= CC[3+bs*0] * tmp;
16759 	tmp = inv_diag_E[1];
16760 	CC[0+bs*1] *= tmp;
16761 	CC[1+bs*1] *= tmp;
16762 	CC[2+bs*1] *= tmp;
16763 	CC[3+bs*1] *= tmp;
16764 
16765 	tmp = E[2+lde*0];
16766 	CC[0+bs*2] -= CC[0+bs*0] * tmp;
16767 	CC[1+bs*2] -= CC[1+bs*0] * tmp;
16768 	CC[2+bs*2] -= CC[2+bs*0] * tmp;
16769 	CC[3+bs*2] -= CC[3+bs*0] * tmp;
16770 	tmp = E[2+lde*1];
16771 	CC[0+bs*2] -= CC[0+bs*1] * tmp;
16772 	CC[1+bs*2] -= CC[1+bs*1] * tmp;
16773 	CC[2+bs*2] -= CC[2+bs*1] * tmp;
16774 	CC[3+bs*2] -= CC[3+bs*1] * tmp;
16775 	tmp = inv_diag_E[2];
16776 	CC[0+bs*2] *= tmp;
16777 	CC[1+bs*2] *= tmp;
16778 	CC[2+bs*2] *= tmp;
16779 	CC[3+bs*2] *= tmp;
16780 
16781 	tmp = E[3+lde*0];
16782 	CC[0+bs*3] -= CC[0+bs*0] * tmp;
16783 	CC[1+bs*3] -= CC[1+bs*0] * tmp;
16784 	CC[2+bs*3] -= CC[2+bs*0] * tmp;
16785 	CC[3+bs*3] -= CC[3+bs*0] * tmp;
16786 	tmp = E[3+lde*1];
16787 	CC[0+bs*3] -= CC[0+bs*1] * tmp;
16788 	CC[1+bs*3] -= CC[1+bs*1] * tmp;
16789 	CC[2+bs*3] -= CC[2+bs*1] * tmp;
16790 	CC[3+bs*3] -= CC[3+bs*1] * tmp;
16791 	tmp = E[3+lde*2];
16792 	CC[0+bs*3] -= CC[0+bs*2] * tmp;
16793 	CC[1+bs*3] -= CC[1+bs*2] * tmp;
16794 	CC[2+bs*3] -= CC[2+bs*2] * tmp;
16795 	CC[3+bs*3] -= CC[3+bs*2] * tmp;
16796 	tmp = inv_diag_E[3];
16797 	CC[0+bs*3] *= tmp;
16798 	CC[1+bs*3] *= tmp;
16799 	CC[2+bs*3] *= tmp;
16800 	CC[3+bs*3] *= tmp;
16801 
16802 	D[0+ldd*0] = CC[0+bs*0];
16803 	D[1+ldd*0] = CC[1+bs*0];
16804 	D[2+ldd*0] = CC[2+bs*0];
16805 	D[3+ldd*0] = CC[3+bs*0];
16806 
16807 	D[0+ldd*1] = CC[0+bs*1];
16808 	D[1+ldd*1] = CC[1+bs*1];
16809 	D[2+ldd*1] = CC[2+bs*1];
16810 	D[3+ldd*1] = CC[3+bs*1];
16811 
16812 	D[0+ldd*2] = CC[0+bs*2];
16813 	D[1+ldd*2] = CC[1+bs*2];
16814 	D[2+ldd*2] = CC[2+bs*2];
16815 	D[3+ldd*2] = CC[3+bs*2];
16816 
16817 	D[0+ldd*3] = CC[0+bs*3];
16818 	D[1+ldd*3] = CC[1+bs*3];
16819 	D[2+ldd*3] = CC[2+bs*3];
16820 	D[3+ldd*3] = CC[3+bs*3];
16821 
16822 	return;
16823 
16824 	}
16825 #endif
16826 
16827 
16828 
16829 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9)
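// variable-size variant of the kernel above: loads and stores only the leading m1 x n1 block of C and D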
void kernel_dtrsm_nt_rl_inv_4x4_vs_lib44ccc(int kmax, double *A, double *B, double *C, int ldc, double *D, int ldd, double *E, int lde, double *inv_diag_E, int m1, int n1)
16831 	{
16832 
16833 	const int bs = 4;
16834 
16835 	double tmp;
16836 
16837 #if defined(TARGET_GENERIC)
16838 	double CC[16] = {0};
16839 #else
16840 	ALIGNED( double CC[16], 64 ) = {0};
16841 #endif
16842 
16843 	double alpha1 = -1.0;
16844 	double beta1  = 1.0;
16845 
16846 	if(m1>=4)
16847 		{
16848 		CC[0+bs*0] = C[0+ldc*0];
16849 		CC[1+bs*0] = C[1+ldc*0];
16850 		CC[2+bs*0] = C[2+ldc*0];
16851 		CC[3+bs*0] = C[3+ldc*0];
16852 
16853 		if(n1==1)
16854 			goto kernel;
16855 
16856 		CC[0+bs*1] = C[0+ldc*1];
16857 		CC[1+bs*1] = C[1+ldc*1];
16858 		CC[2+bs*1] = C[2+ldc*1];
16859 		CC[3+bs*1] = C[3+ldc*1];
16860 
16861 		if(n1==2)
16862 			goto kernel;
16863 
16864 		CC[0+bs*2] = C[0+ldc*2];
16865 		CC[1+bs*2] = C[1+ldc*2];
16866 		CC[2+bs*2] = C[2+ldc*2];
16867 		CC[3+bs*2] = C[3+ldc*2];
16868 
16869 		if(n1==3)
16870 			goto kernel;
16871 
16872 		CC[0+bs*3] = C[0+ldc*3];
16873 		CC[1+bs*3] = C[1+ldc*3];
16874 		CC[2+bs*3] = C[2+ldc*3];
16875 		CC[3+bs*3] = C[3+ldc*3];
16876 		}
16877 	else if(m1>=3)
16878 		{
16879 		CC[0+bs*0] = C[0+ldc*0];
16880 		CC[1+bs*0] = C[1+ldc*0];
16881 		CC[2+bs*0] = C[2+ldc*0];
16882 
16883 		if(n1==1)
16884 			goto kernel;
16885 
16886 		CC[0+bs*1] = C[0+ldc*1];
16887 		CC[1+bs*1] = C[1+ldc*1];
16888 		CC[2+bs*1] = C[2+ldc*1];
16889 
16890 		if(n1==2)
16891 			goto kernel;
16892 
16893 		CC[0+bs*2] = C[0+ldc*2];
16894 		CC[1+bs*2] = C[1+ldc*2];
16895 		CC[2+bs*2] = C[2+ldc*2];
16896 
16897 		if(n1==3)
16898 			goto kernel;
16899 
16900 		CC[0+bs*3] = C[0+ldc*3];
16901 		CC[1+bs*3] = C[1+ldc*3];
16902 		CC[2+bs*3] = C[2+ldc*3];
16903 		}
16904 	else if(m1>=2)
16905 		{
16906 		CC[0+bs*0] = C[0+ldc*0];
16907 		CC[1+bs*0] = C[1+ldc*0];
16908 
16909 		if(n1==1)
16910 			goto kernel;
16911 
16912 		CC[0+bs*1] = C[0+ldc*1];
16913 		CC[1+bs*1] = C[1+ldc*1];
16914 
16915 		if(n1==2)
16916 			goto kernel;
16917 
16918 		CC[0+bs*2] = C[0+ldc*2];
16919 		CC[1+bs*2] = C[1+ldc*2];
16920 
16921 		if(n1==3)
16922 			goto kernel;
16923 
16924 		CC[0+bs*3] = C[0+ldc*3];
16925 		CC[1+bs*3] = C[1+ldc*3];
16926 		}
16927 	else //if(m1>=1)
16928 		{
16929 		CC[0+bs*0] = C[0+ldc*0];
16930 
16931 		if(n1==1)
16932 			goto kernel;
16933 
16934 		CC[0+bs*1] = C[0+ldc*1];
16935 
16936 		if(n1==2)
16937 			goto kernel;
16938 
16939 		CC[0+bs*2] = C[0+ldc*2];
16940 
16941 		if(n1==3)
16942 			goto kernel;
16943 
16944 		CC[0+bs*3] = C[0+ldc*3];
16945 		}
16946 
16947 kernel:
16948 	kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, &beta1, CC, CC);
16949 
16950 	tmp = inv_diag_E[0];
16951 	CC[0+bs*0] *= tmp;
16952 	CC[1+bs*0] *= tmp;
16953 	CC[2+bs*0] *= tmp;
16954 	CC[3+bs*0] *= tmp;
16955 
16956 	if(n1==1)
16957 		goto store;
16958 
16959 	tmp = E[1+lde*0];
16960 	CC[0+bs*1] -= CC[0+bs*0] * tmp;
16961 	CC[1+bs*1] -= CC[1+bs*0] * tmp;
16962 	CC[2+bs*1] -= CC[2+bs*0] * tmp;
16963 	CC[3+bs*1] -= CC[3+bs*0] * tmp;
16964 	tmp = inv_diag_E[1];
16965 	CC[0+bs*1] *= tmp;
16966 	CC[1+bs*1] *= tmp;
16967 	CC[2+bs*1] *= tmp;
16968 	CC[3+bs*1] *= tmp;
16969 
16970 	if(n1==2)
16971 		goto store;
16972 
16973 	tmp = E[2+lde*0];
16974 	CC[0+bs*2] -= CC[0+bs*0] * tmp;
16975 	CC[1+bs*2] -= CC[1+bs*0] * tmp;
16976 	CC[2+bs*2] -= CC[2+bs*0] * tmp;
16977 	CC[3+bs*2] -= CC[3+bs*0] * tmp;
16978 	tmp = E[2+lde*1];
16979 	CC[0+bs*2] -= CC[0+bs*1] * tmp;
16980 	CC[1+bs*2] -= CC[1+bs*1] * tmp;
16981 	CC[2+bs*2] -= CC[2+bs*1] * tmp;
16982 	CC[3+bs*2] -= CC[3+bs*1] * tmp;
16983 	tmp = inv_diag_E[2];
16984 	CC[0+bs*2] *= tmp;
16985 	CC[1+bs*2] *= tmp;
16986 	CC[2+bs*2] *= tmp;
16987 	CC[3+bs*2] *= tmp;
16988 
16989 	if(n1==3)
16990 		goto store;
16991 
16992 	tmp = E[3+lde*0];
16993 	CC[0+bs*3] -= CC[0+bs*0] * tmp;
16994 	CC[1+bs*3] -= CC[1+bs*0] * tmp;
16995 	CC[2+bs*3] -= CC[2+bs*0] * tmp;
16996 	CC[3+bs*3] -= CC[3+bs*0] * tmp;
16997 	tmp = E[3+lde*1];
16998 	CC[0+bs*3] -= CC[0+bs*1] * tmp;
16999 	CC[1+bs*3] -= CC[1+bs*1] * tmp;
17000 	CC[2+bs*3] -= CC[2+bs*1] * tmp;
17001 	CC[3+bs*3] -= CC[3+bs*1] * tmp;
17002 	tmp = E[3+lde*2];
17003 	CC[0+bs*3] -= CC[0+bs*2] * tmp;
17004 	CC[1+bs*3] -= CC[1+bs*2] * tmp;
17005 	CC[2+bs*3] -= CC[2+bs*2] * tmp;
17006 	CC[3+bs*3] -= CC[3+bs*2] * tmp;
17007 	tmp = inv_diag_E[3];
17008 	CC[0+bs*3] *= tmp;
17009 	CC[1+bs*3] *= tmp;
17010 	CC[2+bs*3] *= tmp;
17011 	CC[3+bs*3] *= tmp;
17012 
17013 	store:
17014 
17015 	if(m1>=4)
17016 		{
17017 		D[0+ldd*0] = CC[0+bs*0];
17018 		D[1+ldd*0] = CC[1+bs*0];
17019 		D[2+ldd*0] = CC[2+bs*0];
17020 		D[3+ldd*0] = CC[3+bs*0];
17021 
17022 		if(n1==1)
17023 			return;
17024 
17025 		D[0+ldd*1] = CC[0+bs*1];
17026 		D[1+ldd*1] = CC[1+bs*1];
17027 		D[2+ldd*1] = CC[2+bs*1];
17028 		D[3+ldd*1] = CC[3+bs*1];
17029 
17030 		if(n1==2)
17031 			return;
17032 
17033 		D[0+ldd*2] = CC[0+bs*2];
17034 		D[1+ldd*2] = CC[1+bs*2];
17035 		D[2+ldd*2] = CC[2+bs*2];
17036 		D[3+ldd*2] = CC[3+bs*2];
17037 
17038 		if(n1==3)
17039 			return;
17040 
17041 		D[0+ldd*3] = CC[0+bs*3];
17042 		D[1+ldd*3] = CC[1+bs*3];
17043 		D[2+ldd*3] = CC[2+bs*3];
17044 		D[3+ldd*3] = CC[3+bs*3];
17045 		}
17046 	else if(m1>=3)
17047 		{
17048 		D[0+ldd*0] = CC[0+bs*0];
17049 		D[1+ldd*0] = CC[1+bs*0];
17050 		D[2+ldd*0] = CC[2+bs*0];
17051 
17052 		if(n1==1)
17053 			return;
17054 
17055 		D[0+ldd*1] = CC[0+bs*1];
17056 		D[1+ldd*1] = CC[1+bs*1];
17057 		D[2+ldd*1] = CC[2+bs*1];
17058 
17059 		if(n1==2)
17060 			return;
17061 
17062 		D[0+ldd*2] = CC[0+bs*2];
17063 		D[1+ldd*2] = CC[1+bs*2];
17064 		D[2+ldd*2] = CC[2+bs*2];
17065 
17066 		if(n1==3)
17067 			return;
17068 
17069 		D[0+ldd*3] = CC[0+bs*3];
17070 		D[1+ldd*3] = CC[1+bs*3];
17071 		D[2+ldd*3] = CC[2+bs*3];
17072 		}
17073 	else if(m1>=2)
17074 		{
17075 		D[0+ldd*0] = CC[0+bs*0];
17076 		D[1+ldd*0] = CC[1+bs*0];
17077 
17078 		if(n1==1)
17079 			return;
17080 
17081 		D[0+ldd*1] = CC[0+bs*1];
17082 		D[1+ldd*1] = CC[1+bs*1];
17083 
17084 		if(n1==2)
17085 			return;
17086 
17087 		D[0+ldd*2] = CC[0+bs*2];
17088 		D[1+ldd*2] = CC[1+bs*2];
17089 
17090 		if(n1==3)
17091 			return;
17092 
17093 		D[0+ldd*3] = CC[0+bs*3];
17094 		D[1+ldd*3] = CC[1+bs*3];
17095 		}
17096 	else //if(m1>=1)
17097 		{
17098 		D[0+ldd*0] = CC[0+bs*0];
17099 
17100 		if(n1==1)
17101 			return;
17102 
17103 		D[0+ldd*1] = CC[0+bs*1];
17104 
17105 		if(n1==2)
17106 			return;
17107 
17108 		D[0+ldd*2] = CC[0+bs*2];
17109 
17110 		if(n1==3)
17111 			return;
17112 
17113 		D[0+ldd*3] = CC[0+bs*3];
17114 		}
17115 
17116 	return;
17117 
17118 	}
17119 #endif
17120 
17121 
17122 
17123 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
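// dtrsm_nt_rl_inv kernel: computes D = ( beta*C - A*B^T ) * E^{-T}, with E lower triangular (column-major, lde) and its inverted diagonal in inv_diag_E; C and D are 4x4 blocks with leading dimension bs=4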
void kernel_dtrsm_nt_rl_inv_4x4_lib4c44c(int kmax, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde, double *inv_diag_E)
17125 	{
17126 
17127 	const int bs = 4;
17128 
17129 	double tmp;
17130 
17131 #if defined(TARGET_GENERIC)
17132 	double CC[16] = {0};
17133 #else
17134 	ALIGNED( double CC[16], 64 ) = {0};
17135 #endif
17136 
17137 	double alpha1 = -1.0;
17138 
17139 	kernel_dgemm_nt_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, bs, CC, bs);
17140 
17141 	tmp = inv_diag_E[0];
17142 	CC[0+bs*0] *= tmp;
17143 	CC[1+bs*0] *= tmp;
17144 	CC[2+bs*0] *= tmp;
17145 	CC[3+bs*0] *= tmp;
17146 
17147 	tmp = E[1+lde*0];
17148 	CC[0+bs*1] -= CC[0+bs*0] * tmp;
17149 	CC[1+bs*1] -= CC[1+bs*0] * tmp;
17150 	CC[2+bs*1] -= CC[2+bs*0] * tmp;
17151 	CC[3+bs*1] -= CC[3+bs*0] * tmp;
17152 	tmp = inv_diag_E[1];
17153 	CC[0+bs*1] *= tmp;
17154 	CC[1+bs*1] *= tmp;
17155 	CC[2+bs*1] *= tmp;
17156 	CC[3+bs*1] *= tmp;
17157 
17158 	tmp = E[2+lde*0];
17159 	CC[0+bs*2] -= CC[0+bs*0] * tmp;
17160 	CC[1+bs*2] -= CC[1+bs*0] * tmp;
17161 	CC[2+bs*2] -= CC[2+bs*0] * tmp;
17162 	CC[3+bs*2] -= CC[3+bs*0] * tmp;
17163 	tmp = E[2+lde*1];
17164 	CC[0+bs*2] -= CC[0+bs*1] * tmp;
17165 	CC[1+bs*2] -= CC[1+bs*1] * tmp;
17166 	CC[2+bs*2] -= CC[2+bs*1] * tmp;
17167 	CC[3+bs*2] -= CC[3+bs*1] * tmp;
17168 	tmp = inv_diag_E[2];
17169 	CC[0+bs*2] *= tmp;
17170 	CC[1+bs*2] *= tmp;
17171 	CC[2+bs*2] *= tmp;
17172 	CC[3+bs*2] *= tmp;
17173 
17174 	tmp = E[3+lde*0];
17175 	CC[0+bs*3] -= CC[0+bs*0] * tmp;
17176 	CC[1+bs*3] -= CC[1+bs*0] * tmp;
17177 	CC[2+bs*3] -= CC[2+bs*0] * tmp;
17178 	CC[3+bs*3] -= CC[3+bs*0] * tmp;
17179 	tmp = E[3+lde*1];
17180 	CC[0+bs*3] -= CC[0+bs*1] * tmp;
17181 	CC[1+bs*3] -= CC[1+bs*1] * tmp;
17182 	CC[2+bs*3] -= CC[2+bs*1] * tmp;
17183 	CC[3+bs*3] -= CC[3+bs*1] * tmp;
17184 	tmp = E[3+lde*2];
17185 	CC[0+bs*3] -= CC[0+bs*2] * tmp;
17186 	CC[1+bs*3] -= CC[1+bs*2] * tmp;
17187 	CC[2+bs*3] -= CC[2+bs*2] * tmp;
17188 	CC[3+bs*3] -= CC[3+bs*2] * tmp;
17189 	tmp = inv_diag_E[3];
17190 	CC[0+bs*3] *= tmp;
17191 	CC[1+bs*3] *= tmp;
17192 	CC[2+bs*3] *= tmp;
17193 	CC[3+bs*3] *= tmp;
17194 
17195 	D[0+bs*0] = CC[0+bs*0];
17196 	D[1+bs*0] = CC[1+bs*0];
17197 	D[2+bs*0] = CC[2+bs*0];
17198 	D[3+bs*0] = CC[3+bs*0];
17199 
17200 	D[0+bs*1] = CC[0+bs*1];
17201 	D[1+bs*1] = CC[1+bs*1];
17202 	D[2+bs*1] = CC[2+bs*1];
17203 	D[3+bs*1] = CC[3+bs*1];
17204 
17205 	D[0+bs*2] = CC[0+bs*2];
17206 	D[1+bs*2] = CC[1+bs*2];
17207 	D[2+bs*2] = CC[2+bs*2];
17208 	D[3+bs*2] = CC[3+bs*2];
17209 
17210 	D[0+bs*3] = CC[0+bs*3];
17211 	D[1+bs*3] = CC[1+bs*3];
17212 	D[2+bs*3] = CC[2+bs*3];
17213 	D[3+bs*3] = CC[3+bs*3];
17214 
17215 	return;
17216 
17217 	}
17218 #endif
17219 
17220 
17221 
17222 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
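// variable-size variant of the kernel above: the triangular solve skips columns beyond n1 and only the leading m1 x n1 block of D is stored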
void kernel_dtrsm_nt_rl_inv_4x4_vs_lib4c44c(int kmax, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde, double *inv_diag_E, int m1, int n1)
17224 	{
17225 
17226 	const int bs = 4;
17227 
17228 	double tmp;
17229 
17230 #if defined(TARGET_GENERIC)
17231 	double CC[16] = {0};
17232 #else
17233 	ALIGNED( double CC[16], 64 ) = {0};
17234 #endif
17235 
17236 	double alpha1 = -1.0;
17237 
17238 	kernel_dgemm_nt_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, bs, CC, bs);
17239 
17240 	tmp = inv_diag_E[0];
17241 	CC[0+bs*0] *= tmp;
17242 	CC[1+bs*0] *= tmp;
17243 	CC[2+bs*0] *= tmp;
17244 	CC[3+bs*0] *= tmp;
17245 
17246 	if(n1==1)
17247 		goto store;
17248 
17249 	tmp = E[1+lde*0];
17250 	CC[0+bs*1] -= CC[0+bs*0] * tmp;
17251 	CC[1+bs*1] -= CC[1+bs*0] * tmp;
17252 	CC[2+bs*1] -= CC[2+bs*0] * tmp;
17253 	CC[3+bs*1] -= CC[3+bs*0] * tmp;
17254 	tmp = inv_diag_E[1];
17255 	CC[0+bs*1] *= tmp;
17256 	CC[1+bs*1] *= tmp;
17257 	CC[2+bs*1] *= tmp;
17258 	CC[3+bs*1] *= tmp;
17259 
17260 	if(n1==2)
17261 		goto store;
17262 
17263 	tmp = E[2+lde*0];
17264 	CC[0+bs*2] -= CC[0+bs*0] * tmp;
17265 	CC[1+bs*2] -= CC[1+bs*0] * tmp;
17266 	CC[2+bs*2] -= CC[2+bs*0] * tmp;
17267 	CC[3+bs*2] -= CC[3+bs*0] * tmp;
17268 	tmp = E[2+lde*1];
17269 	CC[0+bs*2] -= CC[0+bs*1] * tmp;
17270 	CC[1+bs*2] -= CC[1+bs*1] * tmp;
17271 	CC[2+bs*2] -= CC[2+bs*1] * tmp;
17272 	CC[3+bs*2] -= CC[3+bs*1] * tmp;
17273 	tmp = inv_diag_E[2];
17274 	CC[0+bs*2] *= tmp;
17275 	CC[1+bs*2] *= tmp;
17276 	CC[2+bs*2] *= tmp;
17277 	CC[3+bs*2] *= tmp;
17278 
17279 	if(n1==3)
17280 		goto store;
17281 
17282 	tmp = E[3+lde*0];
17283 	CC[0+bs*3] -= CC[0+bs*0] * tmp;
17284 	CC[1+bs*3] -= CC[1+bs*0] * tmp;
17285 	CC[2+bs*3] -= CC[2+bs*0] * tmp;
17286 	CC[3+bs*3] -= CC[3+bs*0] * tmp;
17287 	tmp = E[3+lde*1];
17288 	CC[0+bs*3] -= CC[0+bs*1] * tmp;
17289 	CC[1+bs*3] -= CC[1+bs*1] * tmp;
17290 	CC[2+bs*3] -= CC[2+bs*1] * tmp;
17291 	CC[3+bs*3] -= CC[3+bs*1] * tmp;
17292 	tmp = E[3+lde*2];
17293 	CC[0+bs*3] -= CC[0+bs*2] * tmp;
17294 	CC[1+bs*3] -= CC[1+bs*2] * tmp;
17295 	CC[2+bs*3] -= CC[2+bs*2] * tmp;
17296 	CC[3+bs*3] -= CC[3+bs*2] * tmp;
17297 	tmp = inv_diag_E[3];
17298 	CC[0+bs*3] *= tmp;
17299 	CC[1+bs*3] *= tmp;
17300 	CC[2+bs*3] *= tmp;
17301 	CC[3+bs*3] *= tmp;
17302 
17303 	store:
17304 
17305 	if(m1>=4)
17306 		{
17307 		D[0+bs*0] = CC[0+bs*0];
17308 		D[1+bs*0] = CC[1+bs*0];
17309 		D[2+bs*0] = CC[2+bs*0];
17310 		D[3+bs*0] = CC[3+bs*0];
17311 
17312 		if(n1==1)
17313 			return;
17314 
17315 		D[0+bs*1] = CC[0+bs*1];
17316 		D[1+bs*1] = CC[1+bs*1];
17317 		D[2+bs*1] = CC[2+bs*1];
17318 		D[3+bs*1] = CC[3+bs*1];
17319 
17320 		if(n1==2)
17321 			return;
17322 
17323 		D[0+bs*2] = CC[0+bs*2];
17324 		D[1+bs*2] = CC[1+bs*2];
17325 		D[2+bs*2] = CC[2+bs*2];
17326 		D[3+bs*2] = CC[3+bs*2];
17327 
17328 		if(n1==3)
17329 			return;
17330 
17331 		D[0+bs*3] = CC[0+bs*3];
17332 		D[1+bs*3] = CC[1+bs*3];
17333 		D[2+bs*3] = CC[2+bs*3];
17334 		D[3+bs*3] = CC[3+bs*3];
17335 		}
17336 	else if(m1>=3)
17337 		{
17338 		D[0+bs*0] = CC[0+bs*0];
17339 		D[1+bs*0] = CC[1+bs*0];
17340 		D[2+bs*0] = CC[2+bs*0];
17341 
17342 		if(n1==1)
17343 			return;
17344 
17345 		D[0+bs*1] = CC[0+bs*1];
17346 		D[1+bs*1] = CC[1+bs*1];
17347 		D[2+bs*1] = CC[2+bs*1];
17348 
17349 		if(n1==2)
17350 			return;
17351 
17352 		D[0+bs*2] = CC[0+bs*2];
17353 		D[1+bs*2] = CC[1+bs*2];
17354 		D[2+bs*2] = CC[2+bs*2];
17355 
17356 		if(n1==3)
17357 			return;
17358 
17359 		D[0+bs*3] = CC[0+bs*3];
17360 		D[1+bs*3] = CC[1+bs*3];
17361 		D[2+bs*3] = CC[2+bs*3];
17362 		}
17363 	else if(m1>=2)
17364 		{
17365 		D[0+bs*0] = CC[0+bs*0];
17366 		D[1+bs*0] = CC[1+bs*0];
17367 
17368 		if(n1==1)
17369 			return;
17370 
17371 		D[0+bs*1] = CC[0+bs*1];
17372 		D[1+bs*1] = CC[1+bs*1];
17373 
17374 		if(n1==2)
17375 			return;
17376 
17377 		D[0+bs*2] = CC[0+bs*2];
17378 		D[1+bs*2] = CC[1+bs*2];
17379 
17380 		if(n1==3)
17381 			return;
17382 
17383 		D[0+bs*3] = CC[0+bs*3];
17384 		D[1+bs*3] = CC[1+bs*3];
17385 		}
17386 	else //if(m1>=1)
17387 		{
17388 		D[0+bs*0] = CC[0+bs*0];
17389 
17390 		if(n1==1)
17391 			return;
17392 
17393 		D[0+bs*1] = CC[0+bs*1];
17394 
17395 		if(n1==2)
17396 			return;
17397 
17398 		D[0+bs*2] = CC[0+bs*2];
17399 
17400 		if(n1==3)
17401 			return;
17402 
17403 		D[0+bs*3] = CC[0+bs*3];
17404 		}
17405 
17406 	return;
17407 
17408 	}
17409 #endif
17410 
17411 
17412 
17413 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
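// dtrsm_nt_rl_inv kernel: computes D = ( beta*C - A*B^T ) * E^{-T}, with E lower triangular and its inverted diagonal in inv_diag_E; B, C, D and E are column-major with leading dimensions ldb, ldc, ldd and lde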
void kernel_dtrsm_nt_rl_inv_4x4_lib4cccc(int kmax, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, double *inv_diag_E)
17415 	{
17416 
17417 	const int bs = 4;
17418 
17419 	double tmp;
17420 
17421 #if defined(TARGET_GENERIC)
17422 	double CC[16] = {0};
17423 #else
17424 	ALIGNED( double CC[16], 64 ) = {0};
17425 #endif
17426 
17427 	double alpha1 = -1.0;
17428 
17429 	kernel_dgemm_nt_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, ldc, CC, bs);
17430 
17431 	tmp = inv_diag_E[0];
17432 	CC[0+bs*0] *= tmp;
17433 	CC[1+bs*0] *= tmp;
17434 	CC[2+bs*0] *= tmp;
17435 	CC[3+bs*0] *= tmp;
17436 
17437 	tmp = E[1+lde*0];
17438 	CC[0+bs*1] -= CC[0+bs*0] * tmp;
17439 	CC[1+bs*1] -= CC[1+bs*0] * tmp;
17440 	CC[2+bs*1] -= CC[2+bs*0] * tmp;
17441 	CC[3+bs*1] -= CC[3+bs*0] * tmp;
17442 	tmp = inv_diag_E[1];
17443 	CC[0+bs*1] *= tmp;
17444 	CC[1+bs*1] *= tmp;
17445 	CC[2+bs*1] *= tmp;
17446 	CC[3+bs*1] *= tmp;
17447 
17448 	tmp = E[2+lde*0];
17449 	CC[0+bs*2] -= CC[0+bs*0] * tmp;
17450 	CC[1+bs*2] -= CC[1+bs*0] * tmp;
17451 	CC[2+bs*2] -= CC[2+bs*0] * tmp;
17452 	CC[3+bs*2] -= CC[3+bs*0] * tmp;
17453 	tmp = E[2+lde*1];
17454 	CC[0+bs*2] -= CC[0+bs*1] * tmp;
17455 	CC[1+bs*2] -= CC[1+bs*1] * tmp;
17456 	CC[2+bs*2] -= CC[2+bs*1] * tmp;
17457 	CC[3+bs*2] -= CC[3+bs*1] * tmp;
17458 	tmp = inv_diag_E[2];
17459 	CC[0+bs*2] *= tmp;
17460 	CC[1+bs*2] *= tmp;
17461 	CC[2+bs*2] *= tmp;
17462 	CC[3+bs*2] *= tmp;
17463 
17464 	tmp = E[3+lde*0];
17465 	CC[0+bs*3] -= CC[0+bs*0] * tmp;
17466 	CC[1+bs*3] -= CC[1+bs*0] * tmp;
17467 	CC[2+bs*3] -= CC[2+bs*0] * tmp;
17468 	CC[3+bs*3] -= CC[3+bs*0] * tmp;
17469 	tmp = E[3+lde*1];
17470 	CC[0+bs*3] -= CC[0+bs*1] * tmp;
17471 	CC[1+bs*3] -= CC[1+bs*1] * tmp;
17472 	CC[2+bs*3] -= CC[2+bs*1] * tmp;
17473 	CC[3+bs*3] -= CC[3+bs*1] * tmp;
17474 	tmp = E[3+lde*2];
17475 	CC[0+bs*3] -= CC[0+bs*2] * tmp;
17476 	CC[1+bs*3] -= CC[1+bs*2] * tmp;
17477 	CC[2+bs*3] -= CC[2+bs*2] * tmp;
17478 	CC[3+bs*3] -= CC[3+bs*2] * tmp;
17479 	tmp = inv_diag_E[3];
17480 	CC[0+bs*3] *= tmp;
17481 	CC[1+bs*3] *= tmp;
17482 	CC[2+bs*3] *= tmp;
17483 	CC[3+bs*3] *= tmp;
17484 
17485 	D[0+ldd*0] = CC[0+bs*0];
17486 	D[1+ldd*0] = CC[1+bs*0];
17487 	D[2+ldd*0] = CC[2+bs*0];
17488 	D[3+ldd*0] = CC[3+bs*0];
17489 
17490 	D[0+ldd*1] = CC[0+bs*1];
17491 	D[1+ldd*1] = CC[1+bs*1];
17492 	D[2+ldd*1] = CC[2+bs*1];
17493 	D[3+ldd*1] = CC[3+bs*1];
17494 
17495 	D[0+ldd*2] = CC[0+bs*2];
17496 	D[1+ldd*2] = CC[1+bs*2];
17497 	D[2+ldd*2] = CC[2+bs*2];
17498 	D[3+ldd*2] = CC[3+bs*2];
17499 
17500 	D[0+ldd*3] = CC[0+bs*3];
17501 	D[1+ldd*3] = CC[1+bs*3];
17502 	D[2+ldd*3] = CC[2+bs*3];
17503 	D[3+ldd*3] = CC[3+bs*3];
17504 
17505 	return;
17506 
17507 	}
17508 #endif
17509 
17510 
17511 
17512 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
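// variable-size variant of the kernel above: the triangular solve skips columns beyond n1 and only the leading m1 x n1 block of D is stored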
void kernel_dtrsm_nt_rl_inv_4x4_vs_lib4cccc(int kmax, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, double *inv_diag_E, int m1, int n1)
17514 	{
17515 
17516 	const int bs = 4;
17517 
17518 	double tmp;
17519 
17520 #if defined(TARGET_GENERIC)
17521 	double CC[16] = {0};
17522 #else
17523 	ALIGNED( double CC[16], 64 ) = {0};
17524 #endif
17525 
17526 	double alpha1 = -1.0;
17527 
17528 	kernel_dgemm_nt_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, ldc, CC, bs);
17529 
17530 	tmp = inv_diag_E[0];
17531 	CC[0+bs*0] *= tmp;
17532 	CC[1+bs*0] *= tmp;
17533 	CC[2+bs*0] *= tmp;
17534 	CC[3+bs*0] *= tmp;
17535 
17536 	if(n1==1)
17537 		goto store;
17538 
17539 	tmp = E[1+lde*0];
17540 	CC[0+bs*1] -= CC[0+bs*0] * tmp;
17541 	CC[1+bs*1] -= CC[1+bs*0] * tmp;
17542 	CC[2+bs*1] -= CC[2+bs*0] * tmp;
17543 	CC[3+bs*1] -= CC[3+bs*0] * tmp;
17544 	tmp = inv_diag_E[1];
17545 	CC[0+bs*1] *= tmp;
17546 	CC[1+bs*1] *= tmp;
17547 	CC[2+bs*1] *= tmp;
17548 	CC[3+bs*1] *= tmp;
17549 
17550 	if(n1==2)
17551 		goto store;
17552 
17553 	tmp = E[2+lde*0];
17554 	CC[0+bs*2] -= CC[0+bs*0] * tmp;
17555 	CC[1+bs*2] -= CC[1+bs*0] * tmp;
17556 	CC[2+bs*2] -= CC[2+bs*0] * tmp;
17557 	CC[3+bs*2] -= CC[3+bs*0] * tmp;
17558 	tmp = E[2+lde*1];
17559 	CC[0+bs*2] -= CC[0+bs*1] * tmp;
17560 	CC[1+bs*2] -= CC[1+bs*1] * tmp;
17561 	CC[2+bs*2] -= CC[2+bs*1] * tmp;
17562 	CC[3+bs*2] -= CC[3+bs*1] * tmp;
17563 	tmp = inv_diag_E[2];
17564 	CC[0+bs*2] *= tmp;
17565 	CC[1+bs*2] *= tmp;
17566 	CC[2+bs*2] *= tmp;
17567 	CC[3+bs*2] *= tmp;
17568 
17569 	if(n1==3)
17570 		goto store;
17571 
17572 	tmp = E[3+lde*0];
17573 	CC[0+bs*3] -= CC[0+bs*0] * tmp;
17574 	CC[1+bs*3] -= CC[1+bs*0] * tmp;
17575 	CC[2+bs*3] -= CC[2+bs*0] * tmp;
17576 	CC[3+bs*3] -= CC[3+bs*0] * tmp;
17577 	tmp = E[3+lde*1];
17578 	CC[0+bs*3] -= CC[0+bs*1] * tmp;
17579 	CC[1+bs*3] -= CC[1+bs*1] * tmp;
17580 	CC[2+bs*3] -= CC[2+bs*1] * tmp;
17581 	CC[3+bs*3] -= CC[3+bs*1] * tmp;
17582 	tmp = E[3+lde*2];
17583 	CC[0+bs*3] -= CC[0+bs*2] * tmp;
17584 	CC[1+bs*3] -= CC[1+bs*2] * tmp;
17585 	CC[2+bs*3] -= CC[2+bs*2] * tmp;
17586 	CC[3+bs*3] -= CC[3+bs*2] * tmp;
17587 	tmp = inv_diag_E[3];
17588 	CC[0+bs*3] *= tmp;
17589 	CC[1+bs*3] *= tmp;
17590 	CC[2+bs*3] *= tmp;
17591 	CC[3+bs*3] *= tmp;
17592 
17593 	store:
17594 
17595 	if(m1>=4)
17596 		{
17597 		D[0+ldd*0] = CC[0+bs*0];
17598 		D[1+ldd*0] = CC[1+bs*0];
17599 		D[2+ldd*0] = CC[2+bs*0];
17600 		D[3+ldd*0] = CC[3+bs*0];
17601 
17602 		if(n1==1)
17603 			return;
17604 
17605 		D[0+ldd*1] = CC[0+bs*1];
17606 		D[1+ldd*1] = CC[1+bs*1];
17607 		D[2+ldd*1] = CC[2+bs*1];
17608 		D[3+ldd*1] = CC[3+bs*1];
17609 
17610 		if(n1==2)
17611 			return;
17612 
17613 		D[0+ldd*2] = CC[0+bs*2];
17614 		D[1+ldd*2] = CC[1+bs*2];
17615 		D[2+ldd*2] = CC[2+bs*2];
17616 		D[3+ldd*2] = CC[3+bs*2];
17617 
17618 		if(n1==3)
17619 			return;
17620 
17621 		D[0+ldd*3] = CC[0+bs*3];
17622 		D[1+ldd*3] = CC[1+bs*3];
17623 		D[2+ldd*3] = CC[2+bs*3];
17624 		D[3+ldd*3] = CC[3+bs*3];
17625 		}
17626 	else if(m1>=3)
17627 		{
17628 		D[0+ldd*0] = CC[0+bs*0];
17629 		D[1+ldd*0] = CC[1+bs*0];
17630 		D[2+ldd*0] = CC[2+bs*0];
17631 
17632 		if(n1==1)
17633 			return;
17634 
17635 		D[0+ldd*1] = CC[0+bs*1];
17636 		D[1+ldd*1] = CC[1+bs*1];
17637 		D[2+ldd*1] = CC[2+bs*1];
17638 
17639 		if(n1==2)
17640 			return;
17641 
17642 		D[0+ldd*2] = CC[0+bs*2];
17643 		D[1+ldd*2] = CC[1+bs*2];
17644 		D[2+ldd*2] = CC[2+bs*2];
17645 
17646 		if(n1==3)
17647 			return;
17648 
17649 		D[0+ldd*3] = CC[0+bs*3];
17650 		D[1+ldd*3] = CC[1+bs*3];
17651 		D[2+ldd*3] = CC[2+bs*3];
17652 		}
17653 	else if(m1>=2)
17654 		{
17655 		D[0+ldd*0] = CC[0+bs*0];
17656 		D[1+ldd*0] = CC[1+bs*0];
17657 
17658 		if(n1==1)
17659 			return;
17660 
17661 		D[0+ldd*1] = CC[0+bs*1];
17662 		D[1+ldd*1] = CC[1+bs*1];
17663 
17664 		if(n1==2)
17665 			return;
17666 
17667 		D[0+ldd*2] = CC[0+bs*2];
17668 		D[1+ldd*2] = CC[1+bs*2];
17669 
17670 		if(n1==3)
17671 			return;
17672 
17673 		D[0+ldd*3] = CC[0+bs*3];
17674 		D[1+ldd*3] = CC[1+bs*3];
17675 		}
17676 	else //if(m1>=1)
17677 		{
17678 		D[0+ldd*0] = CC[0+bs*0];
17679 
17680 		if(n1==1)
17681 			return;
17682 
17683 		D[0+ldd*1] = CC[0+bs*1];
17684 
17685 		if(n1==2)
17686 			return;
17687 
17688 		D[0+ldd*2] = CC[0+bs*2];
17689 
17690 		if(n1==3)
17691 			return;
17692 
17693 		D[0+ldd*3] = CC[0+bs*3];
17694 		}
17695 
17696 	return;
17697 
17698 	}
17699 #endif
17700 
17701 
17702 
17703 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9)
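// dtrsm_nt_rl_one kernel: computes D = ( beta*C - A*B^T ) * E^{-T}, with E unit lower triangular (diagonal not accessed); A, B and E are panel-major (lib4), C and D are column-major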
void kernel_dtrsm_nt_rl_one_4x4_lib44cc4(int kmax, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, double *E)
17705 	{
17706 
17707 	const int bs = 4;
17708 
17709 	double tmp;
17710 
17711 #if defined(TARGET_GENERIC)
17712 	double CC[16] = {0};
17713 #else
17714 	ALIGNED( double CC[16], 64 ) = {0};
17715 #endif
17716 
17717 	double alpha1 = -1.0;
17718 
17719 	CC[0+bs*0] = C[0+ldc*0];
17720 	CC[1+bs*0] = C[1+ldc*0];
17721 	CC[2+bs*0] = C[2+ldc*0];
17722 	CC[3+bs*0] = C[3+ldc*0];
17723 
17724 	CC[0+bs*1] = C[0+ldc*1];
17725 	CC[1+bs*1] = C[1+ldc*1];
17726 	CC[2+bs*1] = C[2+ldc*1];
17727 	CC[3+bs*1] = C[3+ldc*1];
17728 
17729 	CC[0+bs*2] = C[0+ldc*2];
17730 	CC[1+bs*2] = C[1+ldc*2];
17731 	CC[2+bs*2] = C[2+ldc*2];
17732 	CC[3+bs*2] = C[3+ldc*2];
17733 
17734 	CC[0+bs*3] = C[0+ldc*3];
17735 	CC[1+bs*3] = C[1+ldc*3];
17736 	CC[2+bs*3] = C[2+ldc*3];
17737 	CC[3+bs*3] = C[3+ldc*3];
17738 
17739 	kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, CC, CC);
17740 
17741 	tmp = E[1+bs*0];
17742 	CC[0+bs*1] -= CC[0+bs*0] * tmp;
17743 	CC[1+bs*1] -= CC[1+bs*0] * tmp;
17744 	CC[2+bs*1] -= CC[2+bs*0] * tmp;
17745 	CC[3+bs*1] -= CC[3+bs*0] * tmp;
17746 
17747 	tmp = E[2+bs*0];
17748 	CC[0+bs*2] -= CC[0+bs*0] * tmp;
17749 	CC[1+bs*2] -= CC[1+bs*0] * tmp;
17750 	CC[2+bs*2] -= CC[2+bs*0] * tmp;
17751 	CC[3+bs*2] -= CC[3+bs*0] * tmp;
17752 	tmp = E[2+bs*1];
17753 	CC[0+bs*2] -= CC[0+bs*1] * tmp;
17754 	CC[1+bs*2] -= CC[1+bs*1] * tmp;
17755 	CC[2+bs*2] -= CC[2+bs*1] * tmp;
17756 	CC[3+bs*2] -= CC[3+bs*1] * tmp;
17757 
17758 	tmp = E[3+bs*0];
17759 	CC[0+bs*3] -= CC[0+bs*0] * tmp;
17760 	CC[1+bs*3] -= CC[1+bs*0] * tmp;
17761 	CC[2+bs*3] -= CC[2+bs*0] * tmp;
17762 	CC[3+bs*3] -= CC[3+bs*0] * tmp;
17763 	tmp = E[3+bs*1];
17764 	CC[0+bs*3] -= CC[0+bs*1] * tmp;
17765 	CC[1+bs*3] -= CC[1+bs*1] * tmp;
17766 	CC[2+bs*3] -= CC[2+bs*1] * tmp;
17767 	CC[3+bs*3] -= CC[3+bs*1] * tmp;
17768 	tmp = E[3+bs*2];
17769 	CC[0+bs*3] -= CC[0+bs*2] * tmp;
17770 	CC[1+bs*3] -= CC[1+bs*2] * tmp;
17771 	CC[2+bs*3] -= CC[2+bs*2] * tmp;
17772 	CC[3+bs*3] -= CC[3+bs*2] * tmp;
17773 
17774 	D[0+ldd*0] = CC[0+bs*0];
17775 	D[1+ldd*0] = CC[1+bs*0];
17776 	D[2+ldd*0] = CC[2+bs*0];
17777 	D[3+ldd*0] = CC[3+bs*0];
17778 
17779 	D[0+ldd*1] = CC[0+bs*1];
17780 	D[1+ldd*1] = CC[1+bs*1];
17781 	D[2+ldd*1] = CC[2+bs*1];
17782 	D[3+ldd*1] = CC[3+bs*1];
17783 
17784 	D[0+ldd*2] = CC[0+bs*2];
17785 	D[1+ldd*2] = CC[1+bs*2];
17786 	D[2+ldd*2] = CC[2+bs*2];
17787 	D[3+ldd*2] = CC[3+bs*2];
17788 
17789 	D[0+ldd*3] = CC[0+bs*3];
17790 	D[1+ldd*3] = CC[1+bs*3];
17791 	D[2+ldd*3] = CC[2+bs*3];
17792 	D[3+ldd*3] = CC[3+bs*3];
17793 
17794 	return;
17795 
17796 	}
17797 #endif
17798 
17799 
17800 
17801 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9)
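// Variable-size variant of kernel_dtrsm_nt_rl_one_4x4_lib44cc4: only the first m1 rows and n1 columns
// of C are read and of D are written (1 <= m1, n1 <= 4), and the solve skips columns beyond n1.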
17802 void kernel_dtrsm_nt_rl_one_4x4_vs_lib44cc4(int kmax, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, double *E, int m1, int n1)
17803 	{
17804 
17805 	const int bs = 4;
17806 
17807 	double tmp;
17808 
17809 #if defined(TARGET_GENERIC)
17810 	double CC[16] = {0};
17811 #else
17812 	ALIGNED( double CC[16], 64 ) = {0};
17813 #endif
17814 
17815 	double alpha1 = -1.0;
17816 
17817 	if(m1>=4)
17818 		{
17819 		CC[0+bs*0] = C[0+ldc*0];
17820 		CC[1+bs*0] = C[1+ldc*0];
17821 		CC[2+bs*0] = C[2+ldc*0];
17822 		CC[3+bs*0] = C[3+ldc*0];
17823 
17824 		if(n1==1)
17825 			goto kernel;
17826 
17827 		CC[0+bs*1] = C[0+ldc*1];
17828 		CC[1+bs*1] = C[1+ldc*1];
17829 		CC[2+bs*1] = C[2+ldc*1];
17830 		CC[3+bs*1] = C[3+ldc*1];
17831 
17832 		if(n1==2)
17833 			goto kernel;
17834 
17835 		CC[0+bs*2] = C[0+ldc*2];
17836 		CC[1+bs*2] = C[1+ldc*2];
17837 		CC[2+bs*2] = C[2+ldc*2];
17838 		CC[3+bs*2] = C[3+ldc*2];
17839 
17840 		if(n1==3)
17841 			goto kernel;
17842 
17843 		CC[0+bs*3] = C[0+ldc*3];
17844 		CC[1+bs*3] = C[1+ldc*3];
17845 		CC[2+bs*3] = C[2+ldc*3];
17846 		CC[3+bs*3] = C[3+ldc*3];
17847 		}
17848 	else if(m1>=3)
17849 		{
17850 		CC[0+bs*0] = C[0+ldc*0];
17851 		CC[1+bs*0] = C[1+ldc*0];
17852 		CC[2+bs*0] = C[2+ldc*0];
17853 
17854 		if(n1==1)
17855 			goto kernel;
17856 
17857 		CC[0+bs*1] = C[0+ldc*1];
17858 		CC[1+bs*1] = C[1+ldc*1];
17859 		CC[2+bs*1] = C[2+ldc*1];
17860 
17861 		if(n1==2)
17862 			goto kernel;
17863 
17864 		CC[0+bs*2] = C[0+ldc*2];
17865 		CC[1+bs*2] = C[1+ldc*2];
17866 		CC[2+bs*2] = C[2+ldc*2];
17867 
17868 		if(n1==3)
17869 			goto kernel;
17870 
17871 		CC[0+bs*3] = C[0+ldc*3];
17872 		CC[1+bs*3] = C[1+ldc*3];
17873 		CC[2+bs*3] = C[2+ldc*3];
17874 		}
17875 	else if(m1>=2)
17876 		{
17877 		CC[0+bs*0] = C[0+ldc*0];
17878 		CC[1+bs*0] = C[1+ldc*0];
17879 
17880 		if(n1==1)
17881 			goto kernel;
17882 
17883 		CC[0+bs*1] = C[0+ldc*1];
17884 		CC[1+bs*1] = C[1+ldc*1];
17885 
17886 		if(n1==2)
17887 			goto kernel;
17888 
17889 		CC[0+bs*2] = C[0+ldc*2];
17890 		CC[1+bs*2] = C[1+ldc*2];
17891 
17892 		if(n1==3)
17893 			goto kernel;
17894 
17895 		CC[0+bs*3] = C[0+ldc*3];
17896 		CC[1+bs*3] = C[1+ldc*3];
17897 		}
17898 	else //if(m1>=1)
17899 		{
17900 		CC[0+bs*0] = C[0+ldc*0];
17901 
17902 		if(n1==1)
17903 			goto kernel;
17904 
17905 		CC[0+bs*1] = C[0+ldc*1];
17906 
17907 		if(n1==2)
17908 			goto kernel;
17909 
17910 		CC[0+bs*2] = C[0+ldc*2];
17911 
17912 		if(n1==3)
17913 			goto kernel;
17914 
17915 		CC[0+bs*3] = C[0+ldc*3];
17916 		}
17917 
17918 kernel:
17919 	kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, CC, CC);
17920 
17921 	if(n1==1)
17922 		goto store;
17923 
17924 	tmp = E[1+bs*0];
17925 	CC[0+bs*1] -= CC[0+bs*0] * tmp;
17926 	CC[1+bs*1] -= CC[1+bs*0] * tmp;
17927 	CC[2+bs*1] -= CC[2+bs*0] * tmp;
17928 	CC[3+bs*1] -= CC[3+bs*0] * tmp;
17929 
17930 	if(n1==2)
17931 		goto store;
17932 
17933 	tmp = E[2+bs*0];
17934 	CC[0+bs*2] -= CC[0+bs*0] * tmp;
17935 	CC[1+bs*2] -= CC[1+bs*0] * tmp;
17936 	CC[2+bs*2] -= CC[2+bs*0] * tmp;
17937 	CC[3+bs*2] -= CC[3+bs*0] * tmp;
17938 	tmp = E[2+bs*1];
17939 	CC[0+bs*2] -= CC[0+bs*1] * tmp;
17940 	CC[1+bs*2] -= CC[1+bs*1] * tmp;
17941 	CC[2+bs*2] -= CC[2+bs*1] * tmp;
17942 	CC[3+bs*2] -= CC[3+bs*1] * tmp;
17943 
17944 	if(n1==3)
17945 		goto store;
17946 
17947 	tmp = E[3+bs*0];
17948 	CC[0+bs*3] -= CC[0+bs*0] * tmp;
17949 	CC[1+bs*3] -= CC[1+bs*0] * tmp;
17950 	CC[2+bs*3] -= CC[2+bs*0] * tmp;
17951 	CC[3+bs*3] -= CC[3+bs*0] * tmp;
17952 	tmp = E[3+bs*1];
17953 	CC[0+bs*3] -= CC[0+bs*1] * tmp;
17954 	CC[1+bs*3] -= CC[1+bs*1] * tmp;
17955 	CC[2+bs*3] -= CC[2+bs*1] * tmp;
17956 	CC[3+bs*3] -= CC[3+bs*1] * tmp;
17957 	tmp = E[3+bs*2];
17958 	CC[0+bs*3] -= CC[0+bs*2] * tmp;
17959 	CC[1+bs*3] -= CC[1+bs*2] * tmp;
17960 	CC[2+bs*3] -= CC[2+bs*2] * tmp;
17961 	CC[3+bs*3] -= CC[3+bs*2] * tmp;
17962 
17963 	store:
17964 
17965 	if(m1>=4)
17966 		{
17967 		D[0+ldd*0] = CC[0+bs*0];
17968 		D[1+ldd*0] = CC[1+bs*0];
17969 		D[2+ldd*0] = CC[2+bs*0];
17970 		D[3+ldd*0] = CC[3+bs*0];
17971 
17972 		if(n1==1)
17973 			return;
17974 
17975 		D[0+ldd*1] = CC[0+bs*1];
17976 		D[1+ldd*1] = CC[1+bs*1];
17977 		D[2+ldd*1] = CC[2+bs*1];
17978 		D[3+ldd*1] = CC[3+bs*1];
17979 
17980 		if(n1==2)
17981 			return;
17982 
17983 		D[0+ldd*2] = CC[0+bs*2];
17984 		D[1+ldd*2] = CC[1+bs*2];
17985 		D[2+ldd*2] = CC[2+bs*2];
17986 		D[3+ldd*2] = CC[3+bs*2];
17987 
17988 		if(n1==3)
17989 			return;
17990 
17991 		D[0+ldd*3] = CC[0+bs*3];
17992 		D[1+ldd*3] = CC[1+bs*3];
17993 		D[2+ldd*3] = CC[2+bs*3];
17994 		D[3+ldd*3] = CC[3+bs*3];
17995 		}
17996 	else if(m1>=3)
17997 		{
17998 		D[0+ldd*0] = CC[0+bs*0];
17999 		D[1+ldd*0] = CC[1+bs*0];
18000 		D[2+ldd*0] = CC[2+bs*0];
18001 
18002 		if(n1==1)
18003 			return;
18004 
18005 		D[0+ldd*1] = CC[0+bs*1];
18006 		D[1+ldd*1] = CC[1+bs*1];
18007 		D[2+ldd*1] = CC[2+bs*1];
18008 
18009 		if(n1==2)
18010 			return;
18011 
18012 		D[0+ldd*2] = CC[0+bs*2];
18013 		D[1+ldd*2] = CC[1+bs*2];
18014 		D[2+ldd*2] = CC[2+bs*2];
18015 
18016 		if(n1==3)
18017 			return;
18018 
18019 		D[0+ldd*3] = CC[0+bs*3];
18020 		D[1+ldd*3] = CC[1+bs*3];
18021 		D[2+ldd*3] = CC[2+bs*3];
18022 		}
18023 	else if(m1>=2)
18024 		{
18025 		D[0+ldd*0] = CC[0+bs*0];
18026 		D[1+ldd*0] = CC[1+bs*0];
18027 
18028 		if(n1==1)
18029 			return;
18030 
18031 		D[0+ldd*1] = CC[0+bs*1];
18032 		D[1+ldd*1] = CC[1+bs*1];
18033 
18034 		if(n1==2)
18035 			return;
18036 
18037 		D[0+ldd*2] = CC[0+bs*2];
18038 		D[1+ldd*2] = CC[1+bs*2];
18039 
18040 		if(n1==3)
18041 			return;
18042 
18043 		D[0+ldd*3] = CC[0+bs*3];
18044 		D[1+ldd*3] = CC[1+bs*3];
18045 		}
18046 	else //if(m1>=1)
18047 		{
18048 		D[0+ldd*0] = CC[0+bs*0];
18049 
18050 		if(n1==1)
18051 			return;
18052 
18053 		D[0+ldd*1] = CC[0+bs*1];
18054 
18055 		if(n1==2)
18056 			return;
18057 
18058 		D[0+ldd*2] = CC[0+bs*2];
18059 
18060 		if(n1==3)
18061 			return;
18062 
18063 		D[0+ldd*3] = CC[0+bs*3];
18064 		}
18065 
18066 	return;
18067 
18068 	}
18069 #endif
18070 
18071 
18072 
18073 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
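// dtrsm nt_rl_one: D = (beta*C - A*B^T) * E^{-T}, with E lower triangular and unit diagonal.
// A is packed in 4-wide panels (bs=4); B and E are column-major (ldb, lde); C and D are 4x4 blocks in panel format.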
18074 void kernel_dtrsm_nt_rl_one_4x4_lib4c44c(int kmax, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde)
18075 	{
18076 
18077 	const int bs = 4;
18078 
18079 	double tmp;
18080 
18081 #if defined(TARGET_GENERIC)
18082 	double CC[16] = {0};
18083 #else
18084 	ALIGNED( double CC[16], 64 ) = {0};
18085 #endif
18086 
18087 	double alpha1 = -1.0;
18088 
18089 	kernel_dgemm_nt_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, bs, CC, bs);
18090 
18091 	tmp = E[1+lde*0];
18092 	CC[0+bs*1] -= CC[0+bs*0] * tmp;
18093 	CC[1+bs*1] -= CC[1+bs*0] * tmp;
18094 	CC[2+bs*1] -= CC[2+bs*0] * tmp;
18095 	CC[3+bs*1] -= CC[3+bs*0] * tmp;
18096 
18097 	tmp = E[2+lde*0];
18098 	CC[0+bs*2] -= CC[0+bs*0] * tmp;
18099 	CC[1+bs*2] -= CC[1+bs*0] * tmp;
18100 	CC[2+bs*2] -= CC[2+bs*0] * tmp;
18101 	CC[3+bs*2] -= CC[3+bs*0] * tmp;
18102 	tmp = E[2+lde*1];
18103 	CC[0+bs*2] -= CC[0+bs*1] * tmp;
18104 	CC[1+bs*2] -= CC[1+bs*1] * tmp;
18105 	CC[2+bs*2] -= CC[2+bs*1] * tmp;
18106 	CC[3+bs*2] -= CC[3+bs*1] * tmp;
18107 
18108 	tmp = E[3+lde*0];
18109 	CC[0+bs*3] -= CC[0+bs*0] * tmp;
18110 	CC[1+bs*3] -= CC[1+bs*0] * tmp;
18111 	CC[2+bs*3] -= CC[2+bs*0] * tmp;
18112 	CC[3+bs*3] -= CC[3+bs*0] * tmp;
18113 	tmp = E[3+lde*1];
18114 	CC[0+bs*3] -= CC[0+bs*1] * tmp;
18115 	CC[1+bs*3] -= CC[1+bs*1] * tmp;
18116 	CC[2+bs*3] -= CC[2+bs*1] * tmp;
18117 	CC[3+bs*3] -= CC[3+bs*1] * tmp;
18118 	tmp = E[3+lde*2];
18119 	CC[0+bs*3] -= CC[0+bs*2] * tmp;
18120 	CC[1+bs*3] -= CC[1+bs*2] * tmp;
18121 	CC[2+bs*3] -= CC[2+bs*2] * tmp;
18122 	CC[3+bs*3] -= CC[3+bs*2] * tmp;
18123 
18124 
18125 	D[0+bs*0] = CC[0+bs*0];
18126 	D[1+bs*0] = CC[1+bs*0];
18127 	D[2+bs*0] = CC[2+bs*0];
18128 	D[3+bs*0] = CC[3+bs*0];
18129 
18130 	D[0+bs*1] = CC[0+bs*1];
18131 	D[1+bs*1] = CC[1+bs*1];
18132 	D[2+bs*1] = CC[2+bs*1];
18133 	D[3+bs*1] = CC[3+bs*1];
18134 
18135 	D[0+bs*2] = CC[0+bs*2];
18136 	D[1+bs*2] = CC[1+bs*2];
18137 	D[2+bs*2] = CC[2+bs*2];
18138 	D[3+bs*2] = CC[3+bs*2];
18139 
18140 	D[0+bs*3] = CC[0+bs*3];
18141 	D[1+bs*3] = CC[1+bs*3];
18142 	D[2+bs*3] = CC[2+bs*3];
18143 	D[3+bs*3] = CC[3+bs*3];
18144 
18145 	return;
18146 
18147 	}
18148 #endif
18149 
18150 
18151 
18152 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
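// Variable-size variant of kernel_dtrsm_nt_rl_one_4x4_lib4c44c: the solve stops after n1 columns and
// only the first m1 rows and n1 columns of D are written (1 <= m1, n1 <= 4).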
18153 void kernel_dtrsm_nt_rl_one_4x4_vs_lib4c44c(int kmax, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde, int m1, int n1)
18154 	{
18155 
18156 	const int bs = 4;
18157 
18158 	double tmp;
18159 
18160 #if defined(TARGET_GENERIC)
18161 	double CC[16] = {0};
18162 #else
18163 	ALIGNED( double CC[16], 64 ) = {0};
18164 #endif
18165 
18166 	double alpha1 = -1.0;
18167 
18168 	kernel_dgemm_nt_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, bs, CC, bs);
18169 
18170 	if(n1==1)
18171 		goto store;
18172 
18173 	tmp = E[1+lde*0];
18174 	CC[0+bs*1] -= CC[0+bs*0] * tmp;
18175 	CC[1+bs*1] -= CC[1+bs*0] * tmp;
18176 	CC[2+bs*1] -= CC[2+bs*0] * tmp;
18177 	CC[3+bs*1] -= CC[3+bs*0] * tmp;
18178 
18179 	if(n1==2)
18180 		goto store;
18181 
18182 	tmp = E[2+lde*0];
18183 	CC[0+bs*2] -= CC[0+bs*0] * tmp;
18184 	CC[1+bs*2] -= CC[1+bs*0] * tmp;
18185 	CC[2+bs*2] -= CC[2+bs*0] * tmp;
18186 	CC[3+bs*2] -= CC[3+bs*0] * tmp;
18187 	tmp = E[2+lde*1];
18188 	CC[0+bs*2] -= CC[0+bs*1] * tmp;
18189 	CC[1+bs*2] -= CC[1+bs*1] * tmp;
18190 	CC[2+bs*2] -= CC[2+bs*1] * tmp;
18191 	CC[3+bs*2] -= CC[3+bs*1] * tmp;
18192 
18193 	if(n1==3)
18194 		goto store;
18195 
18196 	tmp = E[3+lde*0];
18197 	CC[0+bs*3] -= CC[0+bs*0] * tmp;
18198 	CC[1+bs*3] -= CC[1+bs*0] * tmp;
18199 	CC[2+bs*3] -= CC[2+bs*0] * tmp;
18200 	CC[3+bs*3] -= CC[3+bs*0] * tmp;
18201 	tmp = E[3+lde*1];
18202 	CC[0+bs*3] -= CC[0+bs*1] * tmp;
18203 	CC[1+bs*3] -= CC[1+bs*1] * tmp;
18204 	CC[2+bs*3] -= CC[2+bs*1] * tmp;
18205 	CC[3+bs*3] -= CC[3+bs*1] * tmp;
18206 	tmp = E[3+lde*2];
18207 	CC[0+bs*3] -= CC[0+bs*2] * tmp;
18208 	CC[1+bs*3] -= CC[1+bs*2] * tmp;
18209 	CC[2+bs*3] -= CC[2+bs*2] * tmp;
18210 	CC[3+bs*3] -= CC[3+bs*2] * tmp;
18211 
18212 	store:
18213 
18214 	if(m1>=4)
18215 		{
18216 		D[0+bs*0] = CC[0+bs*0];
18217 		D[1+bs*0] = CC[1+bs*0];
18218 		D[2+bs*0] = CC[2+bs*0];
18219 		D[3+bs*0] = CC[3+bs*0];
18220 
18221 		if(n1==1)
18222 			return;
18223 
18224 		D[0+bs*1] = CC[0+bs*1];
18225 		D[1+bs*1] = CC[1+bs*1];
18226 		D[2+bs*1] = CC[2+bs*1];
18227 		D[3+bs*1] = CC[3+bs*1];
18228 
18229 		if(n1==2)
18230 			return;
18231 
18232 		D[0+bs*2] = CC[0+bs*2];
18233 		D[1+bs*2] = CC[1+bs*2];
18234 		D[2+bs*2] = CC[2+bs*2];
18235 		D[3+bs*2] = CC[3+bs*2];
18236 
18237 		if(n1==3)
18238 			return;
18239 
18240 		D[0+bs*3] = CC[0+bs*3];
18241 		D[1+bs*3] = CC[1+bs*3];
18242 		D[2+bs*3] = CC[2+bs*3];
18243 		D[3+bs*3] = CC[3+bs*3];
18244 		}
18245 	else if(m1>=3)
18246 		{
18247 		D[0+bs*0] = CC[0+bs*0];
18248 		D[1+bs*0] = CC[1+bs*0];
18249 		D[2+bs*0] = CC[2+bs*0];
18250 
18251 		if(n1==1)
18252 			return;
18253 
18254 		D[0+bs*1] = CC[0+bs*1];
18255 		D[1+bs*1] = CC[1+bs*1];
18256 		D[2+bs*1] = CC[2+bs*1];
18257 
18258 		if(n1==2)
18259 			return;
18260 
18261 		D[0+bs*2] = CC[0+bs*2];
18262 		D[1+bs*2] = CC[1+bs*2];
18263 		D[2+bs*2] = CC[2+bs*2];
18264 
18265 		if(n1==3)
18266 			return;
18267 
18268 		D[0+bs*3] = CC[0+bs*3];
18269 		D[1+bs*3] = CC[1+bs*3];
18270 		D[2+bs*3] = CC[2+bs*3];
18271 		}
18272 	else if(m1>=2)
18273 		{
18274 		D[0+bs*0] = CC[0+bs*0];
18275 		D[1+bs*0] = CC[1+bs*0];
18276 
18277 		if(n1==1)
18278 			return;
18279 
18280 		D[0+bs*1] = CC[0+bs*1];
18281 		D[1+bs*1] = CC[1+bs*1];
18282 
18283 		if(n1==2)
18284 			return;
18285 
18286 		D[0+bs*2] = CC[0+bs*2];
18287 		D[1+bs*2] = CC[1+bs*2];
18288 
18289 		if(n1==3)
18290 			return;
18291 
18292 		D[0+bs*3] = CC[0+bs*3];
18293 		D[1+bs*3] = CC[1+bs*3];
18294 		}
18295 	else //if(m1>=1)
18296 		{
18297 		D[0+bs*0] = CC[0+bs*0];
18298 
18299 		if(n1==1)
18300 			return;
18301 
18302 		D[0+bs*1] = CC[0+bs*1];
18303 
18304 		if(n1==2)
18305 			return;
18306 
18307 		D[0+bs*2] = CC[0+bs*2];
18308 
18309 		if(n1==3)
18310 			return;
18311 
18312 		D[0+bs*3] = CC[0+bs*3];
18313 		}
18314 
18315 	return;
18316 
18317 	}
18318 #endif
18319 
18320 
18321 
18322 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
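// dtrsm nt_rl_one: D = (beta*C - A*B^T) * E^{-T}, with E lower triangular and unit diagonal.
// A is packed in 4-wide panels (bs=4); B, C, D and E are column-major (ldb, ldc, ldd, lde).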
18323 void kernel_dtrsm_nt_rl_one_4x4_lib4cccc(int kmax, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde)
18324 	{
18325 
18326 	const int bs = 4;
18327 
18328 	double tmp;
18329 
18330 #if defined(TARGET_GENERIC)
18331 	double CC[16] = {0};
18332 #else
18333 	ALIGNED( double CC[16], 64 ) = {0};
18334 #endif
18335 
18336 	double alpha1 = -1.0;
18337 
18338 	kernel_dgemm_nt_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, ldc, CC, bs);
18339 
18340 	tmp = E[1+lde*0];
18341 	CC[0+bs*1] -= CC[0+bs*0] * tmp;
18342 	CC[1+bs*1] -= CC[1+bs*0] * tmp;
18343 	CC[2+bs*1] -= CC[2+bs*0] * tmp;
18344 	CC[3+bs*1] -= CC[3+bs*0] * tmp;
18345 
18346 	tmp = E[2+lde*0];
18347 	CC[0+bs*2] -= CC[0+bs*0] * tmp;
18348 	CC[1+bs*2] -= CC[1+bs*0] * tmp;
18349 	CC[2+bs*2] -= CC[2+bs*0] * tmp;
18350 	CC[3+bs*2] -= CC[3+bs*0] * tmp;
18351 	tmp = E[2+lde*1];
18352 	CC[0+bs*2] -= CC[0+bs*1] * tmp;
18353 	CC[1+bs*2] -= CC[1+bs*1] * tmp;
18354 	CC[2+bs*2] -= CC[2+bs*1] * tmp;
18355 	CC[3+bs*2] -= CC[3+bs*1] * tmp;
18356 
18357 	tmp = E[3+lde*0];
18358 	CC[0+bs*3] -= CC[0+bs*0] * tmp;
18359 	CC[1+bs*3] -= CC[1+bs*0] * tmp;
18360 	CC[2+bs*3] -= CC[2+bs*0] * tmp;
18361 	CC[3+bs*3] -= CC[3+bs*0] * tmp;
18362 	tmp = E[3+lde*1];
18363 	CC[0+bs*3] -= CC[0+bs*1] * tmp;
18364 	CC[1+bs*3] -= CC[1+bs*1] * tmp;
18365 	CC[2+bs*3] -= CC[2+bs*1] * tmp;
18366 	CC[3+bs*3] -= CC[3+bs*1] * tmp;
18367 	tmp = E[3+lde*2];
18368 	CC[0+bs*3] -= CC[0+bs*2] * tmp;
18369 	CC[1+bs*3] -= CC[1+bs*2] * tmp;
18370 	CC[2+bs*3] -= CC[2+bs*2] * tmp;
18371 	CC[3+bs*3] -= CC[3+bs*2] * tmp;
18372 
18373 	D[0+ldd*0] = CC[0+bs*0];
18374 	D[1+ldd*0] = CC[1+bs*0];
18375 	D[2+ldd*0] = CC[2+bs*0];
18376 	D[3+ldd*0] = CC[3+bs*0];
18377 
18378 	D[0+ldd*1] = CC[0+bs*1];
18379 	D[1+ldd*1] = CC[1+bs*1];
18380 	D[2+ldd*1] = CC[2+bs*1];
18381 	D[3+ldd*1] = CC[3+bs*1];
18382 
18383 	D[0+ldd*2] = CC[0+bs*2];
18384 	D[1+ldd*2] = CC[1+bs*2];
18385 	D[2+ldd*2] = CC[2+bs*2];
18386 	D[3+ldd*2] = CC[3+bs*2];
18387 
18388 	D[0+ldd*3] = CC[0+bs*3];
18389 	D[1+ldd*3] = CC[1+bs*3];
18390 	D[2+ldd*3] = CC[2+bs*3];
18391 	D[3+ldd*3] = CC[3+bs*3];
18392 
18393 	return;
18394 
18395 	}
18396 #endif
18397 
18398 
18399 
18400 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
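// Variable-size variant of kernel_dtrsm_nt_rl_one_4x4_lib4cccc: the solve stops after n1 columns and
// only the first m1 rows and n1 columns of D are written (1 <= m1, n1 <= 4).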
18401 void kernel_dtrsm_nt_rl_one_4x4_vs_lib4cccc(int kmax, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, int m1, int n1)
18402 	{
18403 
18404 	const int bs = 4;
18405 
18406 	double tmp;
18407 
18408 #if defined(TARGET_GENERIC)
18409 	double CC[16] = {0};
18410 #else
18411 	ALIGNED( double CC[16], 64 ) = {0};
18412 #endif
18413 
18414 	double alpha1 = -1.0;
18415 
18416 	kernel_dgemm_nt_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, ldc, CC, bs);
18417 
18418 	if(n1==1)
18419 		goto store;
18420 
18421 	tmp = E[1+lde*0];
18422 	CC[0+bs*1] -= CC[0+bs*0] * tmp;
18423 	CC[1+bs*1] -= CC[1+bs*0] * tmp;
18424 	CC[2+bs*1] -= CC[2+bs*0] * tmp;
18425 	CC[3+bs*1] -= CC[3+bs*0] * tmp;
18426 
18427 	if(n1==2)
18428 		goto store;
18429 
18430 	tmp = E[2+lde*0];
18431 	CC[0+bs*2] -= CC[0+bs*0] * tmp;
18432 	CC[1+bs*2] -= CC[1+bs*0] * tmp;
18433 	CC[2+bs*2] -= CC[2+bs*0] * tmp;
18434 	CC[3+bs*2] -= CC[3+bs*0] * tmp;
18435 	tmp = E[2+lde*1];
18436 	CC[0+bs*2] -= CC[0+bs*1] * tmp;
18437 	CC[1+bs*2] -= CC[1+bs*1] * tmp;
18438 	CC[2+bs*2] -= CC[2+bs*1] * tmp;
18439 	CC[3+bs*2] -= CC[3+bs*1] * tmp;
18440 
18441 	if(n1==3)
18442 		goto store;
18443 
18444 	tmp = E[3+lde*0];
18445 	CC[0+bs*3] -= CC[0+bs*0] * tmp;
18446 	CC[1+bs*3] -= CC[1+bs*0] * tmp;
18447 	CC[2+bs*3] -= CC[2+bs*0] * tmp;
18448 	CC[3+bs*3] -= CC[3+bs*0] * tmp;
18449 	tmp = E[3+lde*1];
18450 	CC[0+bs*3] -= CC[0+bs*1] * tmp;
18451 	CC[1+bs*3] -= CC[1+bs*1] * tmp;
18452 	CC[2+bs*3] -= CC[2+bs*1] * tmp;
18453 	CC[3+bs*3] -= CC[3+bs*1] * tmp;
18454 	tmp = E[3+lde*2];
18455 	CC[0+bs*3] -= CC[0+bs*2] * tmp;
18456 	CC[1+bs*3] -= CC[1+bs*2] * tmp;
18457 	CC[2+bs*3] -= CC[2+bs*2] * tmp;
18458 	CC[3+bs*3] -= CC[3+bs*2] * tmp;
18459 
18460 	store:
18461 
18462 	if(m1>=4)
18463 		{
18464 		D[0+ldd*0] = CC[0+bs*0];
18465 		D[1+ldd*0] = CC[1+bs*0];
18466 		D[2+ldd*0] = CC[2+bs*0];
18467 		D[3+ldd*0] = CC[3+bs*0];
18468 
18469 		if(n1==1)
18470 			return;
18471 
18472 		D[0+ldd*1] = CC[0+bs*1];
18473 		D[1+ldd*1] = CC[1+bs*1];
18474 		D[2+ldd*1] = CC[2+bs*1];
18475 		D[3+ldd*1] = CC[3+bs*1];
18476 
18477 		if(n1==2)
18478 			return;
18479 
18480 		D[0+ldd*2] = CC[0+bs*2];
18481 		D[1+ldd*2] = CC[1+bs*2];
18482 		D[2+ldd*2] = CC[2+bs*2];
18483 		D[3+ldd*2] = CC[3+bs*2];
18484 
18485 		if(n1==3)
18486 			return;
18487 
18488 		D[0+ldd*3] = CC[0+bs*3];
18489 		D[1+ldd*3] = CC[1+bs*3];
18490 		D[2+ldd*3] = CC[2+bs*3];
18491 		D[3+ldd*3] = CC[3+bs*3];
18492 		}
18493 	else if(m1>=3)
18494 		{
18495 		D[0+ldd*0] = CC[0+bs*0];
18496 		D[1+ldd*0] = CC[1+bs*0];
18497 		D[2+ldd*0] = CC[2+bs*0];
18498 
18499 		if(n1==1)
18500 			return;
18501 
18502 		D[0+ldd*1] = CC[0+bs*1];
18503 		D[1+ldd*1] = CC[1+bs*1];
18504 		D[2+ldd*1] = CC[2+bs*1];
18505 
18506 		if(n1==2)
18507 			return;
18508 
18509 		D[0+ldd*2] = CC[0+bs*2];
18510 		D[1+ldd*2] = CC[1+bs*2];
18511 		D[2+ldd*2] = CC[2+bs*2];
18512 
18513 		if(n1==3)
18514 			return;
18515 
18516 		D[0+ldd*3] = CC[0+bs*3];
18517 		D[1+ldd*3] = CC[1+bs*3];
18518 		D[2+ldd*3] = CC[2+bs*3];
18519 		}
18520 	else if(m1>=2)
18521 		{
18522 		D[0+ldd*0] = CC[0+bs*0];
18523 		D[1+ldd*0] = CC[1+bs*0];
18524 
18525 		if(n1==1)
18526 			return;
18527 
18528 		D[0+ldd*1] = CC[0+bs*1];
18529 		D[1+ldd*1] = CC[1+bs*1];
18530 
18531 		if(n1==2)
18532 			return;
18533 
18534 		D[0+ldd*2] = CC[0+bs*2];
18535 		D[1+ldd*2] = CC[1+bs*2];
18536 
18537 		if(n1==3)
18538 			return;
18539 
18540 		D[0+ldd*3] = CC[0+bs*3];
18541 		D[1+ldd*3] = CC[1+bs*3];
18542 		}
18543 	else //if(m1>=1)
18544 		{
18545 		D[0+ldd*0] = CC[0+bs*0];
18546 
18547 		if(n1==1)
18548 			return;
18549 
18550 		D[0+ldd*1] = CC[0+bs*1];
18551 
18552 		if(n1==2)
18553 			return;
18554 
18555 		D[0+ldd*2] = CC[0+bs*2];
18556 
18557 		if(n1==3)
18558 			return;
18559 
18560 		D[0+ldd*3] = CC[0+bs*3];
18561 		}
18562 
18563 	return;
18564 
18565 	}
18566 #endif
18567 
18568 
18569 
18570 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
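// dtrsm nn_ru_inv: D = (beta*C - A*B) * E^{-1}, with E upper triangular; the reciprocals of its diagonal
// entries are passed in inv_diag_E. A is packed in 4-wide panels (bs=4); B and E are column-major (ldb, lde);
// C and D are 4x4 blocks in panel format.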
18571 void kernel_dtrsm_nn_ru_inv_4x4_lib4c44c(int kmax, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde, double *inv_diag_E)
18572 	{
18573 
18574 	const int bs = 4;
18575 
18576 	double tmp;
18577 
18578 #if defined(TARGET_GENERIC)
18579 	double CC[16] = {0};
18580 #else
18581 	ALIGNED( double CC[16], 64 ) = {0};
18582 #endif
18583 
18584 	double alpha1 = -1.0;
18585 
18586 	kernel_dgemm_nn_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, bs, CC, bs);
18587 
18588 	tmp = inv_diag_E[0];
18589 	CC[0+bs*0] *= tmp;
18590 	CC[1+bs*0] *= tmp;
18591 	CC[2+bs*0] *= tmp;
18592 	CC[3+bs*0] *= tmp;
18593 
18594 	tmp = E[0+lde*1];
18595 	CC[0+bs*1] -= CC[0+bs*0] * tmp;
18596 	CC[1+bs*1] -= CC[1+bs*0] * tmp;
18597 	CC[2+bs*1] -= CC[2+bs*0] * tmp;
18598 	CC[3+bs*1] -= CC[3+bs*0] * tmp;
18599 	tmp = inv_diag_E[1];
18600 	CC[0+bs*1] *= tmp;
18601 	CC[1+bs*1] *= tmp;
18602 	CC[2+bs*1] *= tmp;
18603 	CC[3+bs*1] *= tmp;
18604 
18605 	tmp = E[0+lde*2];
18606 	CC[0+bs*2] -= CC[0+bs*0] * tmp;
18607 	CC[1+bs*2] -= CC[1+bs*0] * tmp;
18608 	CC[2+bs*2] -= CC[2+bs*0] * tmp;
18609 	CC[3+bs*2] -= CC[3+bs*0] * tmp;
18610 	tmp = E[1+lde*2];
18611 	CC[0+bs*2] -= CC[0+bs*1] * tmp;
18612 	CC[1+bs*2] -= CC[1+bs*1] * tmp;
18613 	CC[2+bs*2] -= CC[2+bs*1] * tmp;
18614 	CC[3+bs*2] -= CC[3+bs*1] * tmp;
18615 	tmp = inv_diag_E[2];
18616 	CC[0+bs*2] *= tmp;
18617 	CC[1+bs*2] *= tmp;
18618 	CC[2+bs*2] *= tmp;
18619 	CC[3+bs*2] *= tmp;
18620 
18621 	tmp = E[0+lde*3];
18622 	CC[0+bs*3] -= CC[0+bs*0] * tmp;
18623 	CC[1+bs*3] -= CC[1+bs*0] * tmp;
18624 	CC[2+bs*3] -= CC[2+bs*0] * tmp;
18625 	CC[3+bs*3] -= CC[3+bs*0] * tmp;
18626 	tmp = E[1+lde*3];
18627 	CC[0+bs*3] -= CC[0+bs*1] * tmp;
18628 	CC[1+bs*3] -= CC[1+bs*1] * tmp;
18629 	CC[2+bs*3] -= CC[2+bs*1] * tmp;
18630 	CC[3+bs*3] -= CC[3+bs*1] * tmp;
18631 	tmp = E[2+lde*3];
18632 	CC[0+bs*3] -= CC[0+bs*2] * tmp;
18633 	CC[1+bs*3] -= CC[1+bs*2] * tmp;
18634 	CC[2+bs*3] -= CC[2+bs*2] * tmp;
18635 	CC[3+bs*3] -= CC[3+bs*2] * tmp;
18636 	tmp = inv_diag_E[3];
18637 	CC[0+bs*3] *= tmp;
18638 	CC[1+bs*3] *= tmp;
18639 	CC[2+bs*3] *= tmp;
18640 	CC[3+bs*3] *= tmp;
18641 
18642 	D[0+bs*0] = CC[0+bs*0];
18643 	D[1+bs*0] = CC[1+bs*0];
18644 	D[2+bs*0] = CC[2+bs*0];
18645 	D[3+bs*0] = CC[3+bs*0];
18646 
18647 	D[0+bs*1] = CC[0+bs*1];
18648 	D[1+bs*1] = CC[1+bs*1];
18649 	D[2+bs*1] = CC[2+bs*1];
18650 	D[3+bs*1] = CC[3+bs*1];
18651 
18652 	D[0+bs*2] = CC[0+bs*2];
18653 	D[1+bs*2] = CC[1+bs*2];
18654 	D[2+bs*2] = CC[2+bs*2];
18655 	D[3+bs*2] = CC[3+bs*2];
18656 
18657 	D[0+bs*3] = CC[0+bs*3];
18658 	D[1+bs*3] = CC[1+bs*3];
18659 	D[2+bs*3] = CC[2+bs*3];
18660 	D[3+bs*3] = CC[3+bs*3];
18661 
18662 	return;
18663 
18664 	}
18665 #endif
18666 
18667 
18668 
18669 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
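// Variable-size variant of kernel_dtrsm_nn_ru_inv_4x4_lib4c44c: the solve stops after n1 columns and
// only the first m1 rows and n1 columns of D are written (1 <= m1, n1 <= 4).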
18670 void kernel_dtrsm_nn_ru_inv_4x4_vs_lib4c44c(int kmax, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde, double *inv_diag_E, int m1, int n1)
18671 	{
18672 
18673 	const int bs = 4;
18674 
18675 	double tmp;
18676 
18677 #if defined(TARGET_GENERIC)
18678 	double CC[16] = {0};
18679 #else
18680 	ALIGNED( double CC[16], 64 ) = {0};
18681 #endif
18682 
18683 	double alpha1 = -1.0;
18684 
18685 	kernel_dgemm_nn_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, bs, CC, bs);
18686 
18687 	tmp = inv_diag_E[0];
18688 	CC[0+bs*0] *= tmp;
18689 	CC[1+bs*0] *= tmp;
18690 	CC[2+bs*0] *= tmp;
18691 	CC[3+bs*0] *= tmp;
18692 
18693 	if(n1==1)
18694 		goto store;
18695 
18696 	tmp = E[0+lde*1];
18697 	CC[0+bs*1] -= CC[0+bs*0] * tmp;
18698 	CC[1+bs*1] -= CC[1+bs*0] * tmp;
18699 	CC[2+bs*1] -= CC[2+bs*0] * tmp;
18700 	CC[3+bs*1] -= CC[3+bs*0] * tmp;
18701 	tmp = inv_diag_E[1];
18702 	CC[0+bs*1] *= tmp;
18703 	CC[1+bs*1] *= tmp;
18704 	CC[2+bs*1] *= tmp;
18705 	CC[3+bs*1] *= tmp;
18706 
18707 	if(n1==2)
18708 		goto store;
18709 
18710 	tmp = E[0+lde*2];
18711 	CC[0+bs*2] -= CC[0+bs*0] * tmp;
18712 	CC[1+bs*2] -= CC[1+bs*0] * tmp;
18713 	CC[2+bs*2] -= CC[2+bs*0] * tmp;
18714 	CC[3+bs*2] -= CC[3+bs*0] * tmp;
18715 	tmp = E[1+lde*2];
18716 	CC[0+bs*2] -= CC[0+bs*1] * tmp;
18717 	CC[1+bs*2] -= CC[1+bs*1] * tmp;
18718 	CC[2+bs*2] -= CC[2+bs*1] * tmp;
18719 	CC[3+bs*2] -= CC[3+bs*1] * tmp;
18720 	tmp = inv_diag_E[2];
18721 	CC[0+bs*2] *= tmp;
18722 	CC[1+bs*2] *= tmp;
18723 	CC[2+bs*2] *= tmp;
18724 	CC[3+bs*2] *= tmp;
18725 
18726 	if(n1==3)
18727 		goto store;
18728 
18729 	tmp = E[0+lde*3];
18730 	CC[0+bs*3] -= CC[0+bs*0] * tmp;
18731 	CC[1+bs*3] -= CC[1+bs*0] * tmp;
18732 	CC[2+bs*3] -= CC[2+bs*0] * tmp;
18733 	CC[3+bs*3] -= CC[3+bs*0] * tmp;
18734 	tmp = E[1+lde*3];
18735 	CC[0+bs*3] -= CC[0+bs*1] * tmp;
18736 	CC[1+bs*3] -= CC[1+bs*1] * tmp;
18737 	CC[2+bs*3] -= CC[2+bs*1] * tmp;
18738 	CC[3+bs*3] -= CC[3+bs*1] * tmp;
18739 	tmp = E[2+lde*3];
18740 	CC[0+bs*3] -= CC[0+bs*2] * tmp;
18741 	CC[1+bs*3] -= CC[1+bs*2] * tmp;
18742 	CC[2+bs*3] -= CC[2+bs*2] * tmp;
18743 	CC[3+bs*3] -= CC[3+bs*2] * tmp;
18744 	tmp = inv_diag_E[3];
18745 	CC[0+bs*3] *= tmp;
18746 	CC[1+bs*3] *= tmp;
18747 	CC[2+bs*3] *= tmp;
18748 	CC[3+bs*3] *= tmp;
18749 
18750 	store:
18751 
18752 	if(m1>=4)
18753 		{
18754 		D[0+bs*0] = CC[0+bs*0];
18755 		D[1+bs*0] = CC[1+bs*0];
18756 		D[2+bs*0] = CC[2+bs*0];
18757 		D[3+bs*0] = CC[3+bs*0];
18758 
18759 		if(n1==1)
18760 			return;
18761 
18762 		D[0+bs*1] = CC[0+bs*1];
18763 		D[1+bs*1] = CC[1+bs*1];
18764 		D[2+bs*1] = CC[2+bs*1];
18765 		D[3+bs*1] = CC[3+bs*1];
18766 
18767 		if(n1==2)
18768 			return;
18769 
18770 		D[0+bs*2] = CC[0+bs*2];
18771 		D[1+bs*2] = CC[1+bs*2];
18772 		D[2+bs*2] = CC[2+bs*2];
18773 		D[3+bs*2] = CC[3+bs*2];
18774 
18775 		if(n1==3)
18776 			return;
18777 
18778 		D[0+bs*3] = CC[0+bs*3];
18779 		D[1+bs*3] = CC[1+bs*3];
18780 		D[2+bs*3] = CC[2+bs*3];
18781 		D[3+bs*3] = CC[3+bs*3];
18782 		}
18783 	else if(m1>=3)
18784 		{
18785 		D[0+bs*0] = CC[0+bs*0];
18786 		D[1+bs*0] = CC[1+bs*0];
18787 		D[2+bs*0] = CC[2+bs*0];
18788 
18789 		if(n1==1)
18790 			return;
18791 
18792 		D[0+bs*1] = CC[0+bs*1];
18793 		D[1+bs*1] = CC[1+bs*1];
18794 		D[2+bs*1] = CC[2+bs*1];
18795 
18796 		if(n1==2)
18797 			return;
18798 
18799 		D[0+bs*2] = CC[0+bs*2];
18800 		D[1+bs*2] = CC[1+bs*2];
18801 		D[2+bs*2] = CC[2+bs*2];
18802 
18803 		if(n1==3)
18804 			return;
18805 
18806 		D[0+bs*3] = CC[0+bs*3];
18807 		D[1+bs*3] = CC[1+bs*3];
18808 		D[2+bs*3] = CC[2+bs*3];
18809 		}
18810 	else if(m1>=2)
18811 		{
18812 		D[0+bs*0] = CC[0+bs*0];
18813 		D[1+bs*0] = CC[1+bs*0];
18814 
18815 		if(n1==1)
18816 			return;
18817 
18818 		D[0+bs*1] = CC[0+bs*1];
18819 		D[1+bs*1] = CC[1+bs*1];
18820 
18821 		if(n1==2)
18822 			return;
18823 
18824 		D[0+bs*2] = CC[0+bs*2];
18825 		D[1+bs*2] = CC[1+bs*2];
18826 
18827 		if(n1==3)
18828 			return;
18829 
18830 		D[0+bs*3] = CC[0+bs*3];
18831 		D[1+bs*3] = CC[1+bs*3];
18832 		}
18833 	else //if(m1>=1)
18834 		{
18835 		D[0+bs*0] = CC[0+bs*0];
18836 
18837 		if(n1==1)
18838 			return;
18839 
18840 		D[0+bs*1] = CC[0+bs*1];
18841 
18842 		if(n1==2)
18843 			return;
18844 
18845 		D[0+bs*2] = CC[0+bs*2];
18846 
18847 		if(n1==3)
18848 			return;
18849 
18850 		D[0+bs*3] = CC[0+bs*3];
18851 		}
18852 
18853 	return;
18854 
18855 	}
18856 #endif
18857 
18858 
18859 
18860 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
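// dtrsm nn_ru_inv: D = (beta*C - A*B) * E^{-1}, with E upper triangular and its inverted diagonal in inv_diag_E.
// A is packed in 4-wide panels (bs=4); B, C, D and E are column-major (ldb, ldc, ldd, lde).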
18861 void kernel_dtrsm_nn_ru_inv_4x4_lib4cccc(int kmax, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, double *inv_diag_E)
18862 	{
18863 
18864 	const int bs = 4;
18865 
18866 	double tmp;
18867 
18868 #if defined(TARGET_GENERIC)
18869 	double CC[16] = {0};
18870 #else
18871 	ALIGNED( double CC[16], 64 ) = {0};
18872 #endif
18873 
18874 	double alpha1 = -1.0;
18875 
18876 	kernel_dgemm_nn_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, ldc, CC, bs);
18877 
18878 	tmp = inv_diag_E[0];
18879 	CC[0+bs*0] *= tmp;
18880 	CC[1+bs*0] *= tmp;
18881 	CC[2+bs*0] *= tmp;
18882 	CC[3+bs*0] *= tmp;
18883 
18884 	tmp = E[0+lde*1];
18885 	CC[0+bs*1] -= CC[0+bs*0] * tmp;
18886 	CC[1+bs*1] -= CC[1+bs*0] * tmp;
18887 	CC[2+bs*1] -= CC[2+bs*0] * tmp;
18888 	CC[3+bs*1] -= CC[3+bs*0] * tmp;
18889 	tmp = inv_diag_E[1];
18890 	CC[0+bs*1] *= tmp;
18891 	CC[1+bs*1] *= tmp;
18892 	CC[2+bs*1] *= tmp;
18893 	CC[3+bs*1] *= tmp;
18894 
18895 	tmp = E[0+lde*2];
18896 	CC[0+bs*2] -= CC[0+bs*0] * tmp;
18897 	CC[1+bs*2] -= CC[1+bs*0] * tmp;
18898 	CC[2+bs*2] -= CC[2+bs*0] * tmp;
18899 	CC[3+bs*2] -= CC[3+bs*0] * tmp;
18900 	tmp = E[1+lde*2];
18901 	CC[0+bs*2] -= CC[0+bs*1] * tmp;
18902 	CC[1+bs*2] -= CC[1+bs*1] * tmp;
18903 	CC[2+bs*2] -= CC[2+bs*1] * tmp;
18904 	CC[3+bs*2] -= CC[3+bs*1] * tmp;
18905 	tmp = inv_diag_E[2];
18906 	CC[0+bs*2] *= tmp;
18907 	CC[1+bs*2] *= tmp;
18908 	CC[2+bs*2] *= tmp;
18909 	CC[3+bs*2] *= tmp;
18910 
18911 	tmp = E[0+lde*3];
18912 	CC[0+bs*3] -= CC[0+bs*0] * tmp;
18913 	CC[1+bs*3] -= CC[1+bs*0] * tmp;
18914 	CC[2+bs*3] -= CC[2+bs*0] * tmp;
18915 	CC[3+bs*3] -= CC[3+bs*0] * tmp;
18916 	tmp = E[1+lde*3];
18917 	CC[0+bs*3] -= CC[0+bs*1] * tmp;
18918 	CC[1+bs*3] -= CC[1+bs*1] * tmp;
18919 	CC[2+bs*3] -= CC[2+bs*1] * tmp;
18920 	CC[3+bs*3] -= CC[3+bs*1] * tmp;
18921 	tmp = E[2+lde*3];
18922 	CC[0+bs*3] -= CC[0+bs*2] * tmp;
18923 	CC[1+bs*3] -= CC[1+bs*2] * tmp;
18924 	CC[2+bs*3] -= CC[2+bs*2] * tmp;
18925 	CC[3+bs*3] -= CC[3+bs*2] * tmp;
18926 	tmp = inv_diag_E[3];
18927 	CC[0+bs*3] *= tmp;
18928 	CC[1+bs*3] *= tmp;
18929 	CC[2+bs*3] *= tmp;
18930 	CC[3+bs*3] *= tmp;
18931 
18932 
18933 	D[0+ldd*0] = CC[0+bs*0];
18934 	D[1+ldd*0] = CC[1+bs*0];
18935 	D[2+ldd*0] = CC[2+bs*0];
18936 	D[3+ldd*0] = CC[3+bs*0];
18937 
18938 	D[0+ldd*1] = CC[0+bs*1];
18939 	D[1+ldd*1] = CC[1+bs*1];
18940 	D[2+ldd*1] = CC[2+bs*1];
18941 	D[3+ldd*1] = CC[3+bs*1];
18942 
18943 	D[0+ldd*2] = CC[0+bs*2];
18944 	D[1+ldd*2] = CC[1+bs*2];
18945 	D[2+ldd*2] = CC[2+bs*2];
18946 	D[3+ldd*2] = CC[3+bs*2];
18947 
18948 	D[0+ldd*3] = CC[0+bs*3];
18949 	D[1+ldd*3] = CC[1+bs*3];
18950 	D[2+ldd*3] = CC[2+bs*3];
18951 	D[3+ldd*3] = CC[3+bs*3];
18952 
18953 	return;
18954 
18955 	}
18956 #endif
18957 
18958 
18959 
18960 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
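// Variable-size variant of kernel_dtrsm_nn_ru_inv_4x4_lib4cccc: the solve stops after n1 columns and
// only the first m1 rows and n1 columns of D are written (1 <= m1, n1 <= 4).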
18961 void kernel_dtrsm_nn_ru_inv_4x4_vs_lib4cccc(int kmax, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, double *inv_diag_E, int m1, int n1)
18962 	{
18963 
18964 	const int bs = 4;
18965 
18966 	double tmp;
18967 
18968 #if defined(TARGET_GENERIC)
18969 	double CC[16] = {0};
18970 #else
18971 	ALIGNED( double CC[16], 64 ) = {0};
18972 #endif
18973 
18974 	double alpha1 = -1.0;
18975 
18976 	kernel_dgemm_nn_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, ldc, CC, bs);
18977 
18978 	tmp = inv_diag_E[0];
18979 	CC[0+bs*0] *= tmp;
18980 	CC[1+bs*0] *= tmp;
18981 	CC[2+bs*0] *= tmp;
18982 	CC[3+bs*0] *= tmp;
18983 
18984 	if(n1==1)
18985 		goto store;
18986 
18987 	tmp = E[0+lde*1];
18988 	CC[0+bs*1] -= CC[0+bs*0] * tmp;
18989 	CC[1+bs*1] -= CC[1+bs*0] * tmp;
18990 	CC[2+bs*1] -= CC[2+bs*0] * tmp;
18991 	CC[3+bs*1] -= CC[3+bs*0] * tmp;
18992 	tmp = inv_diag_E[1];
18993 	CC[0+bs*1] *= tmp;
18994 	CC[1+bs*1] *= tmp;
18995 	CC[2+bs*1] *= tmp;
18996 	CC[3+bs*1] *= tmp;
18997 
18998 	if(n1==2)
18999 		goto store;
19000 
19001 	tmp = E[0+lde*2];
19002 	CC[0+bs*2] -= CC[0+bs*0] * tmp;
19003 	CC[1+bs*2] -= CC[1+bs*0] * tmp;
19004 	CC[2+bs*2] -= CC[2+bs*0] * tmp;
19005 	CC[3+bs*2] -= CC[3+bs*0] * tmp;
19006 	tmp = E[1+lde*2];
19007 	CC[0+bs*2] -= CC[0+bs*1] * tmp;
19008 	CC[1+bs*2] -= CC[1+bs*1] * tmp;
19009 	CC[2+bs*2] -= CC[2+bs*1] * tmp;
19010 	CC[3+bs*2] -= CC[3+bs*1] * tmp;
19011 	tmp = inv_diag_E[2];
19012 	CC[0+bs*2] *= tmp;
19013 	CC[1+bs*2] *= tmp;
19014 	CC[2+bs*2] *= tmp;
19015 	CC[3+bs*2] *= tmp;
19016 
19017 	if(n1==3)
19018 		goto store;
19019 
19020 	tmp = E[0+lde*3];
19021 	CC[0+bs*3] -= CC[0+bs*0] * tmp;
19022 	CC[1+bs*3] -= CC[1+bs*0] * tmp;
19023 	CC[2+bs*3] -= CC[2+bs*0] * tmp;
19024 	CC[3+bs*3] -= CC[3+bs*0] * tmp;
19025 	tmp = E[1+lde*3];
19026 	CC[0+bs*3] -= CC[0+bs*1] * tmp;
19027 	CC[1+bs*3] -= CC[1+bs*1] * tmp;
19028 	CC[2+bs*3] -= CC[2+bs*1] * tmp;
19029 	CC[3+bs*3] -= CC[3+bs*1] * tmp;
19030 	tmp = E[2+lde*3];
19031 	CC[0+bs*3] -= CC[0+bs*2] * tmp;
19032 	CC[1+bs*3] -= CC[1+bs*2] * tmp;
19033 	CC[2+bs*3] -= CC[2+bs*2] * tmp;
19034 	CC[3+bs*3] -= CC[3+bs*2] * tmp;
19035 	tmp = inv_diag_E[3];
19036 	CC[0+bs*3] *= tmp;
19037 	CC[1+bs*3] *= tmp;
19038 	CC[2+bs*3] *= tmp;
19039 	CC[3+bs*3] *= tmp;
19040 
19041 	store:
19042 
19043 	if(m1>=4)
19044 		{
19045 		D[0+ldd*0] = CC[0+bs*0];
19046 		D[1+ldd*0] = CC[1+bs*0];
19047 		D[2+ldd*0] = CC[2+bs*0];
19048 		D[3+ldd*0] = CC[3+bs*0];
19049 
19050 		if(n1==1)
19051 			return;
19052 
19053 		D[0+ldd*1] = CC[0+bs*1];
19054 		D[1+ldd*1] = CC[1+bs*1];
19055 		D[2+ldd*1] = CC[2+bs*1];
19056 		D[3+ldd*1] = CC[3+bs*1];
19057 
19058 		if(n1==2)
19059 			return;
19060 
19061 		D[0+ldd*2] = CC[0+bs*2];
19062 		D[1+ldd*2] = CC[1+bs*2];
19063 		D[2+ldd*2] = CC[2+bs*2];
19064 		D[3+ldd*2] = CC[3+bs*2];
19065 
19066 		if(n1==3)
19067 			return;
19068 
19069 		D[0+ldd*3] = CC[0+bs*3];
19070 		D[1+ldd*3] = CC[1+bs*3];
19071 		D[2+ldd*3] = CC[2+bs*3];
19072 		D[3+ldd*3] = CC[3+bs*3];
19073 		}
19074 	else if(m1>=3)
19075 		{
19076 		D[0+ldd*0] = CC[0+bs*0];
19077 		D[1+ldd*0] = CC[1+bs*0];
19078 		D[2+ldd*0] = CC[2+bs*0];
19079 
19080 		if(n1==1)
19081 			return;
19082 
19083 		D[0+ldd*1] = CC[0+bs*1];
19084 		D[1+ldd*1] = CC[1+bs*1];
19085 		D[2+ldd*1] = CC[2+bs*1];
19086 
19087 		if(n1==2)
19088 			return;
19089 
19090 		D[0+ldd*2] = CC[0+bs*2];
19091 		D[1+ldd*2] = CC[1+bs*2];
19092 		D[2+ldd*2] = CC[2+bs*2];
19093 
19094 		if(n1==3)
19095 			return;
19096 
19097 		D[0+ldd*3] = CC[0+bs*3];
19098 		D[1+ldd*3] = CC[1+bs*3];
19099 		D[2+ldd*3] = CC[2+bs*3];
19100 		}
19101 	else if(m1>=2)
19102 		{
19103 		D[0+ldd*0] = CC[0+bs*0];
19104 		D[1+ldd*0] = CC[1+bs*0];
19105 
19106 		if(n1==1)
19107 			return;
19108 
19109 		D[0+ldd*1] = CC[0+bs*1];
19110 		D[1+ldd*1] = CC[1+bs*1];
19111 
19112 		if(n1==2)
19113 			return;
19114 
19115 		D[0+ldd*2] = CC[0+bs*2];
19116 		D[1+ldd*2] = CC[1+bs*2];
19117 
19118 		if(n1==3)
19119 			return;
19120 
19121 		D[0+ldd*3] = CC[0+bs*3];
19122 		D[1+ldd*3] = CC[1+bs*3];
19123 		}
19124 	else //if(m1>=1)
19125 		{
19126 		D[0+ldd*0] = CC[0+bs*0];
19127 
19128 		if(n1==1)
19129 			return;
19130 
19131 		D[0+ldd*1] = CC[0+bs*1];
19132 
19133 		if(n1==2)
19134 			return;
19135 
19136 		D[0+ldd*2] = CC[0+bs*2];
19137 
19138 		if(n1==3)
19139 			return;
19140 
19141 		D[0+ldd*3] = CC[0+bs*3];
19142 		}
19143 
19144 	return;
19145 
19146 	}
19147 #endif
19148 
19149 
19150 
19151 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
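// dtrsm nn_ru_one: D = (beta*C - A*B) * E^{-1}, with E upper triangular and unit diagonal.
// A is packed in 4-wide panels (bs=4); B and E are column-major (ldb, lde); C and D are 4x4 blocks in panel format.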
19152 void kernel_dtrsm_nn_ru_one_4x4_lib4c44c(int kmax, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde)
19153 	{
19154 
19155 	const int bs = 4;
19156 
19157 	double tmp;
19158 
19159 #if defined(TARGET_GENERIC)
19160 	double CC[16] = {0};
19161 #else
19162 	ALIGNED( double CC[16], 64 ) = {0};
19163 #endif
19164 
19165 	double alpha1 = -1.0;
19166 
19167 	kernel_dgemm_nn_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, bs, CC, bs);
19168 
19169 	tmp = E[0+lde*1];
19170 	CC[0+bs*1] -= CC[0+bs*0] * tmp;
19171 	CC[1+bs*1] -= CC[1+bs*0] * tmp;
19172 	CC[2+bs*1] -= CC[2+bs*0] * tmp;
19173 	CC[3+bs*1] -= CC[3+bs*0] * tmp;
19174 
19175 	tmp = E[0+lde*2];
19176 	CC[0+bs*2] -= CC[0+bs*0] * tmp;
19177 	CC[1+bs*2] -= CC[1+bs*0] * tmp;
19178 	CC[2+bs*2] -= CC[2+bs*0] * tmp;
19179 	CC[3+bs*2] -= CC[3+bs*0] * tmp;
19180 	tmp = E[1+lde*2];
19181 	CC[0+bs*2] -= CC[0+bs*1] * tmp;
19182 	CC[1+bs*2] -= CC[1+bs*1] * tmp;
19183 	CC[2+bs*2] -= CC[2+bs*1] * tmp;
19184 	CC[3+bs*2] -= CC[3+bs*1] * tmp;
19185 
19186 	tmp = E[0+lde*3];
19187 	CC[0+bs*3] -= CC[0+bs*0] * tmp;
19188 	CC[1+bs*3] -= CC[1+bs*0] * tmp;
19189 	CC[2+bs*3] -= CC[2+bs*0] * tmp;
19190 	CC[3+bs*3] -= CC[3+bs*0] * tmp;
19191 	tmp = E[1+lde*3];
19192 	CC[0+bs*3] -= CC[0+bs*1] * tmp;
19193 	CC[1+bs*3] -= CC[1+bs*1] * tmp;
19194 	CC[2+bs*3] -= CC[2+bs*1] * tmp;
19195 	CC[3+bs*3] -= CC[3+bs*1] * tmp;
19196 	tmp = E[2+lde*3];
19197 	CC[0+bs*3] -= CC[0+bs*2] * tmp;
19198 	CC[1+bs*3] -= CC[1+bs*2] * tmp;
19199 	CC[2+bs*3] -= CC[2+bs*2] * tmp;
19200 	CC[3+bs*3] -= CC[3+bs*2] * tmp;
19201 
19202 	D[0+bs*0] = CC[0+bs*0];
19203 	D[1+bs*0] = CC[1+bs*0];
19204 	D[2+bs*0] = CC[2+bs*0];
19205 	D[3+bs*0] = CC[3+bs*0];
19206 
19207 	D[0+bs*1] = CC[0+bs*1];
19208 	D[1+bs*1] = CC[1+bs*1];
19209 	D[2+bs*1] = CC[2+bs*1];
19210 	D[3+bs*1] = CC[3+bs*1];
19211 
19212 	D[0+bs*2] = CC[0+bs*2];
19213 	D[1+bs*2] = CC[1+bs*2];
19214 	D[2+bs*2] = CC[2+bs*2];
19215 	D[3+bs*2] = CC[3+bs*2];
19216 
19217 	D[0+bs*3] = CC[0+bs*3];
19218 	D[1+bs*3] = CC[1+bs*3];
19219 	D[2+bs*3] = CC[2+bs*3];
19220 	D[3+bs*3] = CC[3+bs*3];
19221 
19222 	return;
19223 
19224 	}
19225 #endif
19226 
19227 
19228 
19229 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
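// Variable-size variant of kernel_dtrsm_nn_ru_one_4x4_lib4c44c: the solve stops after n1 columns and
// only the first m1 rows and n1 columns of D are written (1 <= m1, n1 <= 4).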
19230 void kernel_dtrsm_nn_ru_one_4x4_vs_lib4c44c(int kmax, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde, int m1, int n1)
19231 	{
19232 
19233 	const int bs = 4;
19234 
19235 	double tmp;
19236 
19237 #if defined(TARGET_GENERIC)
19238 	double CC[16] = {0};
19239 #else
19240 	ALIGNED( double CC[16], 64 ) = {0};
19241 #endif
19242 
19243 	double alpha1 = -1.0;
19244 
19245 	kernel_dgemm_nn_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, bs, CC, bs);
19246 
19247 	if(n1==1)
19248 		goto store;
19249 
19250 	tmp = E[0+lde*1];
19251 	CC[0+bs*1] -= CC[0+bs*0] * tmp;
19252 	CC[1+bs*1] -= CC[1+bs*0] * tmp;
19253 	CC[2+bs*1] -= CC[2+bs*0] * tmp;
19254 	CC[3+bs*1] -= CC[3+bs*0] * tmp;
19255 
19256 	if(n1==2)
19257 		goto store;
19258 
19259 	tmp = E[0+lde*2];
19260 	CC[0+bs*2] -= CC[0+bs*0] * tmp;
19261 	CC[1+bs*2] -= CC[1+bs*0] * tmp;
19262 	CC[2+bs*2] -= CC[2+bs*0] * tmp;
19263 	CC[3+bs*2] -= CC[3+bs*0] * tmp;
19264 	tmp = E[1+lde*2];
19265 	CC[0+bs*2] -= CC[0+bs*1] * tmp;
19266 	CC[1+bs*2] -= CC[1+bs*1] * tmp;
19267 	CC[2+bs*2] -= CC[2+bs*1] * tmp;
19268 	CC[3+bs*2] -= CC[3+bs*1] * tmp;
19269 
19270 	if(n1==3)
19271 		goto store;
19272 
19273 	tmp = E[0+lde*3];
19274 	CC[0+bs*3] -= CC[0+bs*0] * tmp;
19275 	CC[1+bs*3] -= CC[1+bs*0] * tmp;
19276 	CC[2+bs*3] -= CC[2+bs*0] * tmp;
19277 	CC[3+bs*3] -= CC[3+bs*0] * tmp;
19278 	tmp = E[1+lde*3];
19279 	CC[0+bs*3] -= CC[0+bs*1] * tmp;
19280 	CC[1+bs*3] -= CC[1+bs*1] * tmp;
19281 	CC[2+bs*3] -= CC[2+bs*1] * tmp;
19282 	CC[3+bs*3] -= CC[3+bs*1] * tmp;
19283 	tmp = E[2+lde*3];
19284 	CC[0+bs*3] -= CC[0+bs*2] * tmp;
19285 	CC[1+bs*3] -= CC[1+bs*2] * tmp;
19286 	CC[2+bs*3] -= CC[2+bs*2] * tmp;
19287 	CC[3+bs*3] -= CC[3+bs*2] * tmp;
19288 
19289 	store:
19290 
19291 	if(m1>=4)
19292 		{
19293 		D[0+bs*0] = CC[0+bs*0];
19294 		D[1+bs*0] = CC[1+bs*0];
19295 		D[2+bs*0] = CC[2+bs*0];
19296 		D[3+bs*0] = CC[3+bs*0];
19297 
19298 		if(n1==1)
19299 			return;
19300 
19301 		D[0+bs*1] = CC[0+bs*1];
19302 		D[1+bs*1] = CC[1+bs*1];
19303 		D[2+bs*1] = CC[2+bs*1];
19304 		D[3+bs*1] = CC[3+bs*1];
19305 
19306 		if(n1==2)
19307 			return;
19308 
19309 		D[0+bs*2] = CC[0+bs*2];
19310 		D[1+bs*2] = CC[1+bs*2];
19311 		D[2+bs*2] = CC[2+bs*2];
19312 		D[3+bs*2] = CC[3+bs*2];
19313 
19314 		if(n1==3)
19315 			return;
19316 
19317 		D[0+bs*3] = CC[0+bs*3];
19318 		D[1+bs*3] = CC[1+bs*3];
19319 		D[2+bs*3] = CC[2+bs*3];
19320 		D[3+bs*3] = CC[3+bs*3];
19321 		}
19322 	else if(m1>=3)
19323 		{
19324 		D[0+bs*0] = CC[0+bs*0];
19325 		D[1+bs*0] = CC[1+bs*0];
19326 		D[2+bs*0] = CC[2+bs*0];
19327 
19328 		if(n1==1)
19329 			return;
19330 
19331 		D[0+bs*1] = CC[0+bs*1];
19332 		D[1+bs*1] = CC[1+bs*1];
19333 		D[2+bs*1] = CC[2+bs*1];
19334 
19335 		if(n1==2)
19336 			return;
19337 
19338 		D[0+bs*2] = CC[0+bs*2];
19339 		D[1+bs*2] = CC[1+bs*2];
19340 		D[2+bs*2] = CC[2+bs*2];
19341 
19342 		if(n1==3)
19343 			return;
19344 
19345 		D[0+bs*3] = CC[0+bs*3];
19346 		D[1+bs*3] = CC[1+bs*3];
19347 		D[2+bs*3] = CC[2+bs*3];
19348 		}
19349 	else if(m1>=2)
19350 		{
19351 		D[0+bs*0] = CC[0+bs*0];
19352 		D[1+bs*0] = CC[1+bs*0];
19353 
19354 		if(n1==1)
19355 			return;
19356 
19357 		D[0+bs*1] = CC[0+bs*1];
19358 		D[1+bs*1] = CC[1+bs*1];
19359 
19360 		if(n1==2)
19361 			return;
19362 
19363 		D[0+bs*2] = CC[0+bs*2];
19364 		D[1+bs*2] = CC[1+bs*2];
19365 
19366 		if(n1==3)
19367 			return;
19368 
19369 		D[0+bs*3] = CC[0+bs*3];
19370 		D[1+bs*3] = CC[1+bs*3];
19371 		}
19372 	else //if(m1>=1)
19373 		{
19374 		D[0+bs*0] = CC[0+bs*0];
19375 
19376 		if(n1==1)
19377 			return;
19378 
19379 		D[0+bs*1] = CC[0+bs*1];
19380 
19381 		if(n1==2)
19382 			return;
19383 
19384 		D[0+bs*2] = CC[0+bs*2];
19385 
19386 		if(n1==3)
19387 			return;
19388 
19389 		D[0+bs*3] = CC[0+bs*3];
19390 		}
19391 
19392 	return;
19393 
19394 	}
19395 #endif
19396 
19397 
19398 
19399 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
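// dtrsm nn_ru_one: D = (beta*C - A*B) * E^{-1}, with E upper triangular and unit diagonal.
// A is packed in 4-wide panels (bs=4); B, C, D and E are column-major (ldb, ldc, ldd, lde).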
19400 void kernel_dtrsm_nn_ru_one_4x4_lib4cccc(int kmax, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde)
19401 	{
19402 
19403 	const int bs = 4;
19404 
19405 	double tmp;
19406 
19407 #if defined(TARGET_GENERIC)
19408 	double CC[16] = {0};
19409 #else
19410 	ALIGNED( double CC[16], 64 ) = {0};
19411 #endif
19412 
19413 	double alpha1 = -1.0;
19414 
19415 	kernel_dgemm_nn_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, ldc, CC, bs);
19416 
19417 	tmp = E[0+lde*1];
19418 	CC[0+bs*1] -= CC[0+bs*0] * tmp;
19419 	CC[1+bs*1] -= CC[1+bs*0] * tmp;
19420 	CC[2+bs*1] -= CC[2+bs*0] * tmp;
19421 	CC[3+bs*1] -= CC[3+bs*0] * tmp;
19422 
19423 	tmp = E[0+lde*2];
19424 	CC[0+bs*2] -= CC[0+bs*0] * tmp;
19425 	CC[1+bs*2] -= CC[1+bs*0] * tmp;
19426 	CC[2+bs*2] -= CC[2+bs*0] * tmp;
19427 	CC[3+bs*2] -= CC[3+bs*0] * tmp;
19428 	tmp = E[1+lde*2];
19429 	CC[0+bs*2] -= CC[0+bs*1] * tmp;
19430 	CC[1+bs*2] -= CC[1+bs*1] * tmp;
19431 	CC[2+bs*2] -= CC[2+bs*1] * tmp;
19432 	CC[3+bs*2] -= CC[3+bs*1] * tmp;
19433 
19434 	tmp = E[0+lde*3];
19435 	CC[0+bs*3] -= CC[0+bs*0] * tmp;
19436 	CC[1+bs*3] -= CC[1+bs*0] * tmp;
19437 	CC[2+bs*3] -= CC[2+bs*0] * tmp;
19438 	CC[3+bs*3] -= CC[3+bs*0] * tmp;
19439 	tmp = E[1+lde*3];
19440 	CC[0+bs*3] -= CC[0+bs*1] * tmp;
19441 	CC[1+bs*3] -= CC[1+bs*1] * tmp;
19442 	CC[2+bs*3] -= CC[2+bs*1] * tmp;
19443 	CC[3+bs*3] -= CC[3+bs*1] * tmp;
19444 	tmp = E[2+lde*3];
19445 	CC[0+bs*3] -= CC[0+bs*2] * tmp;
19446 	CC[1+bs*3] -= CC[1+bs*2] * tmp;
19447 	CC[2+bs*3] -= CC[2+bs*2] * tmp;
19448 	CC[3+bs*3] -= CC[3+bs*2] * tmp;
19449 
19450 
19451 	D[0+ldd*0] = CC[0+bs*0];
19452 	D[1+ldd*0] = CC[1+bs*0];
19453 	D[2+ldd*0] = CC[2+bs*0];
19454 	D[3+ldd*0] = CC[3+bs*0];
19455 
19456 	D[0+ldd*1] = CC[0+bs*1];
19457 	D[1+ldd*1] = CC[1+bs*1];
19458 	D[2+ldd*1] = CC[2+bs*1];
19459 	D[3+ldd*1] = CC[3+bs*1];
19460 
19461 	D[0+ldd*2] = CC[0+bs*2];
19462 	D[1+ldd*2] = CC[1+bs*2];
19463 	D[2+ldd*2] = CC[2+bs*2];
19464 	D[3+ldd*2] = CC[3+bs*2];
19465 
19466 	D[0+ldd*3] = CC[0+bs*3];
19467 	D[1+ldd*3] = CC[1+bs*3];
19468 	D[2+ldd*3] = CC[2+bs*3];
19469 	D[3+ldd*3] = CC[3+bs*3];
19470 
19471 	return;
19472 
19473 	}
19474 #endif
19475 
19476 
19477 
19478 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
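// Variable-size variant of kernel_dtrsm_nn_ru_one_4x4_lib4cccc: the solve stops after n1 columns and
// only the first m1 rows and n1 columns of D are written (1 <= m1, n1 <= 4).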
19479 void kernel_dtrsm_nn_ru_one_4x4_vs_lib4cccc(int kmax, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, int m1, int n1)
19480 	{
19481 
19482 	const int bs = 4;
19483 
19484 	double tmp;
19485 
19486 #if defined(TARGET_GENERIC)
19487 	double CC[16] = {0};
19488 #else
19489 	ALIGNED( double CC[16], 64 ) = {0};
19490 #endif
19491 
19492 	double alpha1 = -1.0;
19493 
19494 	kernel_dgemm_nn_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, ldc, CC, bs);
19495 
19496 	if(n1==1)
19497 		goto store;
19498 
19499 	tmp = E[0+lde*1];
19500 	CC[0+bs*1] -= CC[0+bs*0] * tmp;
19501 	CC[1+bs*1] -= CC[1+bs*0] * tmp;
19502 	CC[2+bs*1] -= CC[2+bs*0] * tmp;
19503 	CC[3+bs*1] -= CC[3+bs*0] * tmp;
19504 
19505 	if(n1==2)
19506 		goto store;
19507 
19508 	tmp = E[0+lde*2];
19509 	CC[0+bs*2] -= CC[0+bs*0] * tmp;
19510 	CC[1+bs*2] -= CC[1+bs*0] * tmp;
19511 	CC[2+bs*2] -= CC[2+bs*0] * tmp;
19512 	CC[3+bs*2] -= CC[3+bs*0] * tmp;
19513 	tmp = E[1+lde*2];
19514 	CC[0+bs*2] -= CC[0+bs*1] * tmp;
19515 	CC[1+bs*2] -= CC[1+bs*1] * tmp;
19516 	CC[2+bs*2] -= CC[2+bs*1] * tmp;
19517 	CC[3+bs*2] -= CC[3+bs*1] * tmp;
19518 
19519 	if(n1==3)
19520 		goto store;
19521 
19522 	tmp = E[0+lde*3];
19523 	CC[0+bs*3] -= CC[0+bs*0] * tmp;
19524 	CC[1+bs*3] -= CC[1+bs*0] * tmp;
19525 	CC[2+bs*3] -= CC[2+bs*0] * tmp;
19526 	CC[3+bs*3] -= CC[3+bs*0] * tmp;
19527 	tmp = E[1+lde*3];
19528 	CC[0+bs*3] -= CC[0+bs*1] * tmp;
19529 	CC[1+bs*3] -= CC[1+bs*1] * tmp;
19530 	CC[2+bs*3] -= CC[2+bs*1] * tmp;
19531 	CC[3+bs*3] -= CC[3+bs*1] * tmp;
19532 	tmp = E[2+lde*3];
19533 	CC[0+bs*3] -= CC[0+bs*2] * tmp;
19534 	CC[1+bs*3] -= CC[1+bs*2] * tmp;
19535 	CC[2+bs*3] -= CC[2+bs*2] * tmp;
19536 	CC[3+bs*3] -= CC[3+bs*2] * tmp;
19537 
19538 	store:
19539 
19540 	if(m1>=4)
19541 		{
19542 		D[0+ldd*0] = CC[0+bs*0];
19543 		D[1+ldd*0] = CC[1+bs*0];
19544 		D[2+ldd*0] = CC[2+bs*0];
19545 		D[3+ldd*0] = CC[3+bs*0];
19546 
19547 		if(n1==1)
19548 			return;
19549 
19550 		D[0+ldd*1] = CC[0+bs*1];
19551 		D[1+ldd*1] = CC[1+bs*1];
19552 		D[2+ldd*1] = CC[2+bs*1];
19553 		D[3+ldd*1] = CC[3+bs*1];
19554 
19555 		if(n1==2)
19556 			return;
19557 
19558 		D[0+ldd*2] = CC[0+bs*2];
19559 		D[1+ldd*2] = CC[1+bs*2];
19560 		D[2+ldd*2] = CC[2+bs*2];
19561 		D[3+ldd*2] = CC[3+bs*2];
19562 
19563 		if(n1==3)
19564 			return;
19565 
19566 		D[0+ldd*3] = CC[0+bs*3];
19567 		D[1+ldd*3] = CC[1+bs*3];
19568 		D[2+ldd*3] = CC[2+bs*3];
19569 		D[3+ldd*3] = CC[3+bs*3];
19570 		}
19571 	else if(m1>=3)
19572 		{
19573 		D[0+ldd*0] = CC[0+bs*0];
19574 		D[1+ldd*0] = CC[1+bs*0];
19575 		D[2+ldd*0] = CC[2+bs*0];
19576 
19577 		if(n1==1)
19578 			return;
19579 
19580 		D[0+ldd*1] = CC[0+bs*1];
19581 		D[1+ldd*1] = CC[1+bs*1];
19582 		D[2+ldd*1] = CC[2+bs*1];
19583 
19584 		if(n1==2)
19585 			return;
19586 
19587 		D[0+ldd*2] = CC[0+bs*2];
19588 		D[1+ldd*2] = CC[1+bs*2];
19589 		D[2+ldd*2] = CC[2+bs*2];
19590 
19591 		if(n1==3)
19592 			return;
19593 
19594 		D[0+ldd*3] = CC[0+bs*3];
19595 		D[1+ldd*3] = CC[1+bs*3];
19596 		D[2+ldd*3] = CC[2+bs*3];
19597 		}
19598 	else if(m1>=2)
19599 		{
19600 		D[0+ldd*0] = CC[0+bs*0];
19601 		D[1+ldd*0] = CC[1+bs*0];
19602 
19603 		if(n1==1)
19604 			return;
19605 
19606 		D[0+ldd*1] = CC[0+bs*1];
19607 		D[1+ldd*1] = CC[1+bs*1];
19608 
19609 		if(n1==2)
19610 			return;
19611 
19612 		D[0+ldd*2] = CC[0+bs*2];
19613 		D[1+ldd*2] = CC[1+bs*2];
19614 
19615 		if(n1==3)
19616 			return;
19617 
19618 		D[0+ldd*3] = CC[0+bs*3];
19619 		D[1+ldd*3] = CC[1+bs*3];
19620 		}
19621 	else //if(m1>=1)
19622 		{
19623 		D[0+ldd*0] = CC[0+bs*0];
19624 
19625 		if(n1==1)
19626 			return;
19627 
19628 		D[0+ldd*1] = CC[0+bs*1];
19629 
19630 		if(n1==2)
19631 			return;
19632 
19633 		D[0+ldd*2] = CC[0+bs*2];
19634 
19635 		if(n1==3)
19636 			return;
19637 
19638 		D[0+ldd*3] = CC[0+bs*3];
19639 		}
19640 
19641 	return;
19642 
19643 	}
19644 #endif
19645 
19646 
19647 
19648 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
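// dtrsm nt_ru_inv: D = (beta*C - A*B^T) * E^{-T}, with E upper triangular; the reciprocals of its diagonal
// entries are passed in inv_diag_E, and the substitution runs backward from the last column.
// A, B and E are packed in 4-wide panels (bs=4); C and D are column-major (ldc, ldd).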
19649 void kernel_dtrsm_nt_ru_inv_4x4_lib44cc4(int kmax, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, double *E, double *inv_diag_E)
19650 	{
19651 
19652 	const int bs = 4;
19653 
19654 	double tmp;
19655 
19656 #if defined(TARGET_GENERIC)
19657 	double CC[16] = {0};
19658 #else
19659 	ALIGNED( double CC[16], 64 ) = {0};
19660 #endif
19661 
19662 	double alpha1 = -1.0;
19663 
19664 	CC[0+bs*0] = C[0+ldc*0];
19665 	CC[1+bs*0] = C[1+ldc*0];
19666 	CC[2+bs*0] = C[2+ldc*0];
19667 	CC[3+bs*0] = C[3+ldc*0];
19668 
19669 	CC[0+bs*1] = C[0+ldc*1];
19670 	CC[1+bs*1] = C[1+ldc*1];
19671 	CC[2+bs*1] = C[2+ldc*1];
19672 	CC[3+bs*1] = C[3+ldc*1];
19673 
19674 	CC[0+bs*2] = C[0+ldc*2];
19675 	CC[1+bs*2] = C[1+ldc*2];
19676 	CC[2+bs*2] = C[2+ldc*2];
19677 	CC[3+bs*2] = C[3+ldc*2];
19678 
19679 	CC[0+bs*3] = C[0+ldc*3];
19680 	CC[1+bs*3] = C[1+ldc*3];
19681 	CC[2+bs*3] = C[2+ldc*3];
19682 	CC[3+bs*3] = C[3+ldc*3];
19683 
19684 	kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, CC, CC);
19685 
19686 	tmp = inv_diag_E[3];
19687 	CC[0+bs*3] *= tmp;
19688 	CC[1+bs*3] *= tmp;
19689 	CC[2+bs*3] *= tmp;
19690 	CC[3+bs*3] *= tmp;
19691 	tmp = E[0+bs*3];
19692 	CC[0+bs*0] -= CC[0+bs*3] * tmp;
19693 	CC[1+bs*0] -= CC[1+bs*3] * tmp;
19694 	CC[2+bs*0] -= CC[2+bs*3] * tmp;
19695 	CC[3+bs*0] -= CC[3+bs*3] * tmp;
19696 	tmp = E[1+bs*3];
19697 	CC[0+bs*1] -= CC[0+bs*3] * tmp;
19698 	CC[1+bs*1] -= CC[1+bs*3] * tmp;
19699 	CC[2+bs*1] -= CC[2+bs*3] * tmp;
19700 	CC[3+bs*1] -= CC[3+bs*3] * tmp;
19701 	tmp = E[2+bs*3];
19702 	CC[0+bs*2] -= CC[0+bs*3] * tmp;
19703 	CC[1+bs*2] -= CC[1+bs*3] * tmp;
19704 	CC[2+bs*2] -= CC[2+bs*3] * tmp;
19705 	CC[3+bs*2] -= CC[3+bs*3] * tmp;
19706 
19707 	tmp = inv_diag_E[2];
19708 	CC[0+bs*2] *= tmp;
19709 	CC[1+bs*2] *= tmp;
19710 	CC[2+bs*2] *= tmp;
19711 	CC[3+bs*2] *= tmp;
19712 	tmp = E[0+bs*2];
19713 	CC[0+bs*0] -= CC[0+bs*2] * tmp;
19714 	CC[1+bs*0] -= CC[1+bs*2] * tmp;
19715 	CC[2+bs*0] -= CC[2+bs*2] * tmp;
19716 	CC[3+bs*0] -= CC[3+bs*2] * tmp;
19717 	tmp = E[1+bs*2];
19718 	CC[0+bs*1] -= CC[0+bs*2] * tmp;
19719 	CC[1+bs*1] -= CC[1+bs*2] * tmp;
19720 	CC[2+bs*1] -= CC[2+bs*2] * tmp;
19721 	CC[3+bs*1] -= CC[3+bs*2] * tmp;
19722 
19723 	tmp = inv_diag_E[1];
19724 	CC[0+bs*1] *= tmp;
19725 	CC[1+bs*1] *= tmp;
19726 	CC[2+bs*1] *= tmp;
19727 	CC[3+bs*1] *= tmp;
19728 	tmp = E[0+bs*1];
19729 	CC[0+bs*0] -= CC[0+bs*1] * tmp;
19730 	CC[1+bs*0] -= CC[1+bs*1] * tmp;
19731 	CC[2+bs*0] -= CC[2+bs*1] * tmp;
19732 	CC[3+bs*0] -= CC[3+bs*1] * tmp;
19733 
19734 	tmp = inv_diag_E[0];
19735 	CC[0+bs*0] *= tmp;
19736 	CC[1+bs*0] *= tmp;
19737 	CC[2+bs*0] *= tmp;
19738 	CC[3+bs*0] *= tmp;
19739 
19740 
19741 	D[0+ldd*0] = CC[0+bs*0];
19742 	D[1+ldd*0] = CC[1+bs*0];
19743 	D[2+ldd*0] = CC[2+bs*0];
19744 	D[3+ldd*0] = CC[3+bs*0];
19745 
19746 	D[0+ldd*1] = CC[0+bs*1];
19747 	D[1+ldd*1] = CC[1+bs*1];
19748 	D[2+ldd*1] = CC[2+bs*1];
19749 	D[3+ldd*1] = CC[3+bs*1];
19750 
19751 	D[0+ldd*2] = CC[0+bs*2];
19752 	D[1+ldd*2] = CC[1+bs*2];
19753 	D[2+ldd*2] = CC[2+bs*2];
19754 	D[3+ldd*2] = CC[3+bs*2];
19755 
19756 	D[0+ldd*3] = CC[0+bs*3];
19757 	D[1+ldd*3] = CC[1+bs*3];
19758 	D[2+ldd*3] = CC[2+bs*3];
19759 	D[3+ldd*3] = CC[3+bs*3];
19760 
19761 	return;
19762 
19763 	}
19764 #endif
19765 
19766 
19767 
19768 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
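// Variable-size variant of kernel_dtrsm_nt_ru_inv_4x4_lib44cc4: the backward substitution starts from
// column n1-1, and only the first m1 rows and n1 columns of D are written (1 <= m1, n1 <= 4).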
19769 void kernel_dtrsm_nt_ru_inv_4x4_vs_lib44cc4(int kmax, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, double *E, double *inv_diag_E, int m1, int n1)
19770 	{
19771 
19772 	const int bs = 4;
19773 
19774 	double tmp;
19775 
19776 #if defined(TARGET_GENERIC)
19777 	double CC[16] = {0};
19778 #else
19779 	ALIGNED( double CC[16], 64 ) = {0};
19780 #endif
19781 
19782 	double alpha1 = -1.0;
19783 
19784 	CC[0+bs*0] = C[0+ldc*0];
19785 	CC[1+bs*0] = C[1+ldc*0];
19786 	CC[2+bs*0] = C[2+ldc*0];
19787 	CC[3+bs*0] = C[3+ldc*0];
19788 
19789 	CC[0+bs*1] = C[0+ldc*1];
19790 	CC[1+bs*1] = C[1+ldc*1];
19791 	CC[2+bs*1] = C[2+ldc*1];
19792 	CC[3+bs*1] = C[3+ldc*1];
19793 
19794 	CC[0+bs*2] = C[0+ldc*2];
19795 	CC[1+bs*2] = C[1+ldc*2];
19796 	CC[2+bs*2] = C[2+ldc*2];
19797 	CC[3+bs*2] = C[3+ldc*2];
19798 
19799 	CC[0+bs*3] = C[0+ldc*3];
19800 	CC[1+bs*3] = C[1+ldc*3];
19801 	CC[2+bs*3] = C[2+ldc*3];
19802 	CC[3+bs*3] = C[3+ldc*3];
19803 
19804 	kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, CC, CC);
19805 
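	// backward substitution against the upper triangular factor E: columns are processed from
	// index 3 down to 0, each scaled by its entry of inv_diag_E and then eliminated from the
	// lower-indexed columns; columns with index >= n1 are skipped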
19806 	if(n1<=3)
19807 		goto n3;
19808 
19809 	tmp = inv_diag_E[3];
19810 	CC[0+bs*3] *= tmp;
19811 	CC[1+bs*3] *= tmp;
19812 	CC[2+bs*3] *= tmp;
19813 	CC[3+bs*3] *= tmp;
19814 	tmp = E[0+bs*3];
19815 	CC[0+bs*0] -= CC[0+bs*3] * tmp;
19816 	CC[1+bs*0] -= CC[1+bs*3] * tmp;
19817 	CC[2+bs*0] -= CC[2+bs*3] * tmp;
19818 	CC[3+bs*0] -= CC[3+bs*3] * tmp;
19819 	tmp = E[1+bs*3];
19820 	CC[0+bs*1] -= CC[0+bs*3] * tmp;
19821 	CC[1+bs*1] -= CC[1+bs*3] * tmp;
19822 	CC[2+bs*1] -= CC[2+bs*3] * tmp;
19823 	CC[3+bs*1] -= CC[3+bs*3] * tmp;
19824 	tmp = E[2+bs*3];
19825 	CC[0+bs*2] -= CC[0+bs*3] * tmp;
19826 	CC[1+bs*2] -= CC[1+bs*3] * tmp;
19827 	CC[2+bs*2] -= CC[2+bs*3] * tmp;
19828 	CC[3+bs*2] -= CC[3+bs*3] * tmp;
19829 
19830 n3:
19831 	if(n1<=2)
19832 		goto n2;
19833 
19834 	tmp = inv_diag_E[2];
19835 	CC[0+bs*2] *= tmp;
19836 	CC[1+bs*2] *= tmp;
19837 	CC[2+bs*2] *= tmp;
19838 	CC[3+bs*2] *= tmp;
19839 	tmp = E[0+bs*2];
19840 	CC[0+bs*0] -= CC[0+bs*2] * tmp;
19841 	CC[1+bs*0] -= CC[1+bs*2] * tmp;
19842 	CC[2+bs*0] -= CC[2+bs*2] * tmp;
19843 	CC[3+bs*0] -= CC[3+bs*2] * tmp;
19844 	tmp = E[1+bs*2];
19845 	CC[0+bs*1] -= CC[0+bs*2] * tmp;
19846 	CC[1+bs*1] -= CC[1+bs*2] * tmp;
19847 	CC[2+bs*1] -= CC[2+bs*2] * tmp;
19848 	CC[3+bs*1] -= CC[3+bs*2] * tmp;
19849 
19850 n2:
19851 	if(n1<=1)
19852 		goto n1;
19853 
19854 	tmp = inv_diag_E[1];
19855 	CC[0+bs*1] *= tmp;
19856 	CC[1+bs*1] *= tmp;
19857 	CC[2+bs*1] *= tmp;
19858 	CC[3+bs*1] *= tmp;
19859 	tmp = E[0+bs*1];
19860 	CC[0+bs*0] -= CC[0+bs*1] * tmp;
19861 	CC[1+bs*0] -= CC[1+bs*1] * tmp;
19862 	CC[2+bs*0] -= CC[2+bs*1] * tmp;
19863 	CC[3+bs*0] -= CC[3+bs*1] * tmp;
19864 
19865 n1:
19866 
19867 	tmp = inv_diag_E[0];
19868 	CC[0+bs*0] *= tmp;
19869 	CC[1+bs*0] *= tmp;
19870 	CC[2+bs*0] *= tmp;
19871 	CC[3+bs*0] *= tmp;
19872 
19873 	store:
19874 
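	// variable-size store: write at most m1 rows and n1 columns of the result to D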
19875 	if(m1>=4)
19876 		{
19877 		D[0+ldd*0] = CC[0+bs*0];
19878 		D[1+ldd*0] = CC[1+bs*0];
19879 		D[2+ldd*0] = CC[2+bs*0];
19880 		D[3+ldd*0] = CC[3+bs*0];
19881 
19882 		if(n1==1)
19883 			return;
19884 
19885 		D[0+ldd*1] = CC[0+bs*1];
19886 		D[1+ldd*1] = CC[1+bs*1];
19887 		D[2+ldd*1] = CC[2+bs*1];
19888 		D[3+ldd*1] = CC[3+bs*1];
19889 
19890 		if(n1==2)
19891 			return;
19892 
19893 		D[0+ldd*2] = CC[0+bs*2];
19894 		D[1+ldd*2] = CC[1+bs*2];
19895 		D[2+ldd*2] = CC[2+bs*2];
19896 		D[3+ldd*2] = CC[3+bs*2];
19897 
19898 		if(n1==3)
19899 			return;
19900 
19901 		D[0+ldd*3] = CC[0+bs*3];
19902 		D[1+ldd*3] = CC[1+bs*3];
19903 		D[2+ldd*3] = CC[2+bs*3];
19904 		D[3+ldd*3] = CC[3+bs*3];
19905 		}
19906 	else if(m1>=3)
19907 		{
19908 		D[0+ldd*0] = CC[0+bs*0];
19909 		D[1+ldd*0] = CC[1+bs*0];
19910 		D[2+ldd*0] = CC[2+bs*0];
19911 
19912 		if(n1==1)
19913 			return;
19914 
19915 		D[0+ldd*1] = CC[0+bs*1];
19916 		D[1+ldd*1] = CC[1+bs*1];
19917 		D[2+ldd*1] = CC[2+bs*1];
19918 
19919 		if(n1==2)
19920 			return;
19921 
19922 		D[0+ldd*2] = CC[0+bs*2];
19923 		D[1+ldd*2] = CC[1+bs*2];
19924 		D[2+ldd*2] = CC[2+bs*2];
19925 
19926 		if(n1==3)
19927 			return;
19928 
19929 		D[0+ldd*3] = CC[0+bs*3];
19930 		D[1+ldd*3] = CC[1+bs*3];
19931 		D[2+ldd*3] = CC[2+bs*3];
19932 		}
19933 	else if(m1>=2)
19934 		{
19935 		D[0+ldd*0] = CC[0+bs*0];
19936 		D[1+ldd*0] = CC[1+bs*0];
19937 
19938 		if(n1==1)
19939 			return;
19940 
19941 		D[0+ldd*1] = CC[0+bs*1];
19942 		D[1+ldd*1] = CC[1+bs*1];
19943 
19944 		if(n1==2)
19945 			return;
19946 
19947 		D[0+ldd*2] = CC[0+bs*2];
19948 		D[1+ldd*2] = CC[1+bs*2];
19949 
19950 		if(n1==3)
19951 			return;
19952 
19953 		D[0+ldd*3] = CC[0+bs*3];
19954 		D[1+ldd*3] = CC[1+bs*3];
19955 		}
19956 	else //if(m1>=1)
19957 		{
19958 		D[0+ldd*0] = CC[0+bs*0];
19959 
19960 		if(n1==1)
19961 			return;
19962 
19963 		D[0+ldd*1] = CC[0+bs*1];
19964 
19965 		if(n1==2)
19966 			return;
19967 
19968 		D[0+ldd*2] = CC[0+bs*2];
19969 
19970 		if(n1==3)
19971 			return;
19972 
19973 		D[0+ldd*3] = CC[0+bs*3];
19974 		}
19975 
19976 	return;
19977 
19978 	}
19979 #endif
19980 
19981 
19982 
19983 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
19984 void kernel_dtrsm_nt_ru_inv_4x4_lib4c44c(int kmax, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde, double *inv_diag_E)
19985 	{
19986 
19987 	const int bs = 4;
19988 
19989 	double tmp;
19990 
19991 #if defined(TARGET_GENERIC)
19992 	double CC[16] = {0};
19993 #else
19994 	ALIGNED( double CC[16], 64 ) = {0};
19995 #endif
19996 
19997 	double alpha1 = -1.0;
19998 
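	// gemm update scaled by alpha1 = -1.0; in this variant both C and the local buffer CC
	// are addressed with leading dimension bs (contiguous 4x4 panel)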
19999 	kernel_dgemm_nt_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, bs, CC, bs);
20000 
20001 	tmp = inv_diag_E[3];
20002 	CC[0+bs*3] *= tmp;
20003 	CC[1+bs*3] *= tmp;
20004 	CC[2+bs*3] *= tmp;
20005 	CC[3+bs*3] *= tmp;
20006 	tmp = E[0+lde*3];
20007 	CC[0+bs*0] -= CC[0+bs*3] * tmp;
20008 	CC[1+bs*0] -= CC[1+bs*3] * tmp;
20009 	CC[2+bs*0] -= CC[2+bs*3] * tmp;
20010 	CC[3+bs*0] -= CC[3+bs*3] * tmp;
20011 	tmp = E[1+lde*3];
20012 	CC[0+bs*1] -= CC[0+bs*3] * tmp;
20013 	CC[1+bs*1] -= CC[1+bs*3] * tmp;
20014 	CC[2+bs*1] -= CC[2+bs*3] * tmp;
20015 	CC[3+bs*1] -= CC[3+bs*3] * tmp;
20016 	tmp = E[2+lde*3];
20017 	CC[0+bs*2] -= CC[0+bs*3] * tmp;
20018 	CC[1+bs*2] -= CC[1+bs*3] * tmp;
20019 	CC[2+bs*2] -= CC[2+bs*3] * tmp;
20020 	CC[3+bs*2] -= CC[3+bs*3] * tmp;
20021 
20022 	tmp = inv_diag_E[2];
20023 	CC[0+bs*2] *= tmp;
20024 	CC[1+bs*2] *= tmp;
20025 	CC[2+bs*2] *= tmp;
20026 	CC[3+bs*2] *= tmp;
20027 	tmp = E[0+lde*2];
20028 	CC[0+bs*0] -= CC[0+bs*2] * tmp;
20029 	CC[1+bs*0] -= CC[1+bs*2] * tmp;
20030 	CC[2+bs*0] -= CC[2+bs*2] * tmp;
20031 	CC[3+bs*0] -= CC[3+bs*2] * tmp;
20032 	tmp = E[1+lde*2];
20033 	CC[0+bs*1] -= CC[0+bs*2] * tmp;
20034 	CC[1+bs*1] -= CC[1+bs*2] * tmp;
20035 	CC[2+bs*1] -= CC[2+bs*2] * tmp;
20036 	CC[3+bs*1] -= CC[3+bs*2] * tmp;
20037 
20038 	tmp = inv_diag_E[1];
20039 	CC[0+bs*1] *= tmp;
20040 	CC[1+bs*1] *= tmp;
20041 	CC[2+bs*1] *= tmp;
20042 	CC[3+bs*1] *= tmp;
20043 	tmp = E[0+lde*1];
20044 	CC[0+bs*0] -= CC[0+bs*1] * tmp;
20045 	CC[1+bs*0] -= CC[1+bs*1] * tmp;
20046 	CC[2+bs*0] -= CC[2+bs*1] * tmp;
20047 	CC[3+bs*0] -= CC[3+bs*1] * tmp;
20048 
20049 	tmp = inv_diag_E[0];
20050 	CC[0+bs*0] *= tmp;
20051 	CC[1+bs*0] *= tmp;
20052 	CC[2+bs*0] *= tmp;
20053 	CC[3+bs*0] *= tmp;
20054 
20055 
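	// D is written back as a contiguous 4x4 panel (leading dimension bs), unlike the *cccc
	// variants below, which store to a general column-major D with leading dimension ldd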
20056 	D[0+bs*0] = CC[0+bs*0];
20057 	D[1+bs*0] = CC[1+bs*0];
20058 	D[2+bs*0] = CC[2+bs*0];
20059 	D[3+bs*0] = CC[3+bs*0];
20060 
20061 	D[0+bs*1] = CC[0+bs*1];
20062 	D[1+bs*1] = CC[1+bs*1];
20063 	D[2+bs*1] = CC[2+bs*1];
20064 	D[3+bs*1] = CC[3+bs*1];
20065 
20066 	D[0+bs*2] = CC[0+bs*2];
20067 	D[1+bs*2] = CC[1+bs*2];
20068 	D[2+bs*2] = CC[2+bs*2];
20069 	D[3+bs*2] = CC[3+bs*2];
20070 
20071 	D[0+bs*3] = CC[0+bs*3];
20072 	D[1+bs*3] = CC[1+bs*3];
20073 	D[2+bs*3] = CC[2+bs*3];
20074 	D[3+bs*3] = CC[3+bs*3];
20075 
20076 	return;
20077 
20078 	}
20079 #endif
20080 
20081 
20082 
20083 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
20084 void kernel_dtrsm_nt_ru_inv_4x4_vs_lib4c44c(int kmax, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde, double *inv_diag_E, int m1, int n1)
20085 	{
20086 
20087 	const int bs = 4;
20088 
20089 	double tmp;
20090 
20091 #if defined(TARGET_GENERIC)
20092 	double CC[16] = {0};
20093 #else
20094 	ALIGNED( double CC[16], 64 ) = {0};
20095 #endif
20096 
20097 	double alpha1 = -1.0;
20098 
20099 	kernel_dgemm_nt_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, bs, CC, bs);
20100 
20101 	if(n1<=3)
20102 		goto n3;
20103 
20104 	tmp = inv_diag_E[3];
20105 	CC[0+bs*3] *= tmp;
20106 	CC[1+bs*3] *= tmp;
20107 	CC[2+bs*3] *= tmp;
20108 	CC[3+bs*3] *= tmp;
20109 	tmp = E[0+lde*3];
20110 	CC[0+bs*0] -= CC[0+bs*3] * tmp;
20111 	CC[1+bs*0] -= CC[1+bs*3] * tmp;
20112 	CC[2+bs*0] -= CC[2+bs*3] * tmp;
20113 	CC[3+bs*0] -= CC[3+bs*3] * tmp;
20114 	tmp = E[1+lde*3];
20115 	CC[0+bs*1] -= CC[0+bs*3] * tmp;
20116 	CC[1+bs*1] -= CC[1+bs*3] * tmp;
20117 	CC[2+bs*1] -= CC[2+bs*3] * tmp;
20118 	CC[3+bs*1] -= CC[3+bs*3] * tmp;
20119 	tmp = E[2+lde*3];
20120 	CC[0+bs*2] -= CC[0+bs*3] * tmp;
20121 	CC[1+bs*2] -= CC[1+bs*3] * tmp;
20122 	CC[2+bs*2] -= CC[2+bs*3] * tmp;
20123 	CC[3+bs*2] -= CC[3+bs*3] * tmp;
20124 
20125 n3:
20126 	if(n1<=2)
20127 		goto n2;
20128 
20129 	tmp = inv_diag_E[2];
20130 	CC[0+bs*2] *= tmp;
20131 	CC[1+bs*2] *= tmp;
20132 	CC[2+bs*2] *= tmp;
20133 	CC[3+bs*2] *= tmp;
20134 	tmp = E[0+lde*2];
20135 	CC[0+bs*0] -= CC[0+bs*2] * tmp;
20136 	CC[1+bs*0] -= CC[1+bs*2] * tmp;
20137 	CC[2+bs*0] -= CC[2+bs*2] * tmp;
20138 	CC[3+bs*0] -= CC[3+bs*2] * tmp;
20139 	tmp = E[1+lde*2];
20140 	CC[0+bs*1] -= CC[0+bs*2] * tmp;
20141 	CC[1+bs*1] -= CC[1+bs*2] * tmp;
20142 	CC[2+bs*1] -= CC[2+bs*2] * tmp;
20143 	CC[3+bs*1] -= CC[3+bs*2] * tmp;
20144 
20145 n2:
20146 	if(n1<=1)
20147 		goto n1;
20148 
20149 	tmp = inv_diag_E[1];
20150 	CC[0+bs*1] *= tmp;
20151 	CC[1+bs*1] *= tmp;
20152 	CC[2+bs*1] *= tmp;
20153 	CC[3+bs*1] *= tmp;
20154 	tmp = E[0+lde*1];
20155 	CC[0+bs*0] -= CC[0+bs*1] * tmp;
20156 	CC[1+bs*0] -= CC[1+bs*1] * tmp;
20157 	CC[2+bs*0] -= CC[2+bs*1] * tmp;
20158 	CC[3+bs*0] -= CC[3+bs*1] * tmp;
20159 
20160 n1:
20161 
20162 	tmp = inv_diag_E[0];
20163 	CC[0+bs*0] *= tmp;
20164 	CC[1+bs*0] *= tmp;
20165 	CC[2+bs*0] *= tmp;
20166 	CC[3+bs*0] *= tmp;
20167 
20168 	store:
20169 
20170 	if(m1>=4)
20171 		{
20172 		D[0+bs*0] = CC[0+bs*0];
20173 		D[1+bs*0] = CC[1+bs*0];
20174 		D[2+bs*0] = CC[2+bs*0];
20175 		D[3+bs*0] = CC[3+bs*0];
20176 
20177 		if(n1==1)
20178 			return;
20179 
20180 		D[0+bs*1] = CC[0+bs*1];
20181 		D[1+bs*1] = CC[1+bs*1];
20182 		D[2+bs*1] = CC[2+bs*1];
20183 		D[3+bs*1] = CC[3+bs*1];
20184 
20185 		if(n1==2)
20186 			return;
20187 
20188 		D[0+bs*2] = CC[0+bs*2];
20189 		D[1+bs*2] = CC[1+bs*2];
20190 		D[2+bs*2] = CC[2+bs*2];
20191 		D[3+bs*2] = CC[3+bs*2];
20192 
20193 		if(n1==3)
20194 			return;
20195 
20196 		D[0+bs*3] = CC[0+bs*3];
20197 		D[1+bs*3] = CC[1+bs*3];
20198 		D[2+bs*3] = CC[2+bs*3];
20199 		D[3+bs*3] = CC[3+bs*3];
20200 		}
20201 	else if(m1>=3)
20202 		{
20203 		D[0+bs*0] = CC[0+bs*0];
20204 		D[1+bs*0] = CC[1+bs*0];
20205 		D[2+bs*0] = CC[2+bs*0];
20206 
20207 		if(n1==1)
20208 			return;
20209 
20210 		D[0+bs*1] = CC[0+bs*1];
20211 		D[1+bs*1] = CC[1+bs*1];
20212 		D[2+bs*1] = CC[2+bs*1];
20213 
20214 		if(n1==2)
20215 			return;
20216 
20217 		D[0+bs*2] = CC[0+bs*2];
20218 		D[1+bs*2] = CC[1+bs*2];
20219 		D[2+bs*2] = CC[2+bs*2];
20220 
20221 		if(n1==3)
20222 			return;
20223 
20224 		D[0+bs*3] = CC[0+bs*3];
20225 		D[1+bs*3] = CC[1+bs*3];
20226 		D[2+bs*3] = CC[2+bs*3];
20227 		}
20228 	else if(m1>=2)
20229 		{
20230 		D[0+bs*0] = CC[0+bs*0];
20231 		D[1+bs*0] = CC[1+bs*0];
20232 
20233 		if(n1==1)
20234 			return;
20235 
20236 		D[0+bs*1] = CC[0+bs*1];
20237 		D[1+bs*1] = CC[1+bs*1];
20238 
20239 		if(n1==2)
20240 			return;
20241 
20242 		D[0+bs*2] = CC[0+bs*2];
20243 		D[1+bs*2] = CC[1+bs*2];
20244 
20245 		if(n1==3)
20246 			return;
20247 
20248 		D[0+bs*3] = CC[0+bs*3];
20249 		D[1+bs*3] = CC[1+bs*3];
20250 		}
20251 	else //if(m1>=1)
20252 		{
20253 		D[0+bs*0] = CC[0+bs*0];
20254 
20255 		if(n1==1)
20256 			return;
20257 
20258 		D[0+bs*1] = CC[0+bs*1];
20259 
20260 		if(n1==2)
20261 			return;
20262 
20263 		D[0+bs*2] = CC[0+bs*2];
20264 
20265 		if(n1==3)
20266 			return;
20267 
20268 		D[0+bs*3] = CC[0+bs*3];
20269 		}
20270 
20271 	return;
20272 
20273 	}
20274 #endif
20275 
20276 
20277 
20278 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
20279 void kernel_dtrsm_nt_ru_inv_4x4_lib4cccc(int kmax, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, double *inv_diag_E)
20280 	{
20281 
20282 	const int bs = 4;
20283 
20284 	double tmp;
20285 
20286 #if defined(TARGET_GENERIC)
20287 	double CC[16] = {0};
20288 #else
20289 	ALIGNED( double CC[16], 64 ) = {0};
20290 #endif
20291 
20292 	double alpha1 = -1.0;
20293 
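	// same solve as the lib4c44c variant above, but C and D are general column-major
	// matrices with leading dimensions ldc and ldd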
20294 	kernel_dgemm_nt_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, ldc, CC, bs);
20295 
20296 	tmp = inv_diag_E[3];
20297 	CC[0+bs*3] *= tmp;
20298 	CC[1+bs*3] *= tmp;
20299 	CC[2+bs*3] *= tmp;
20300 	CC[3+bs*3] *= tmp;
20301 	tmp = E[0+lde*3];
20302 	CC[0+bs*0] -= CC[0+bs*3] * tmp;
20303 	CC[1+bs*0] -= CC[1+bs*3] * tmp;
20304 	CC[2+bs*0] -= CC[2+bs*3] * tmp;
20305 	CC[3+bs*0] -= CC[3+bs*3] * tmp;
20306 	tmp = E[1+lde*3];
20307 	CC[0+bs*1] -= CC[0+bs*3] * tmp;
20308 	CC[1+bs*1] -= CC[1+bs*3] * tmp;
20309 	CC[2+bs*1] -= CC[2+bs*3] * tmp;
20310 	CC[3+bs*1] -= CC[3+bs*3] * tmp;
20311 	tmp = E[2+lde*3];
20312 	CC[0+bs*2] -= CC[0+bs*3] * tmp;
20313 	CC[1+bs*2] -= CC[1+bs*3] * tmp;
20314 	CC[2+bs*2] -= CC[2+bs*3] * tmp;
20315 	CC[3+bs*2] -= CC[3+bs*3] * tmp;
20316 
20317 	tmp = inv_diag_E[2];
20318 	CC[0+bs*2] *= tmp;
20319 	CC[1+bs*2] *= tmp;
20320 	CC[2+bs*2] *= tmp;
20321 	CC[3+bs*2] *= tmp;
20322 	tmp = E[0+lde*2];
20323 	CC[0+bs*0] -= CC[0+bs*2] * tmp;
20324 	CC[1+bs*0] -= CC[1+bs*2] * tmp;
20325 	CC[2+bs*0] -= CC[2+bs*2] * tmp;
20326 	CC[3+bs*0] -= CC[3+bs*2] * tmp;
20327 	tmp = E[1+lde*2];
20328 	CC[0+bs*1] -= CC[0+bs*2] * tmp;
20329 	CC[1+bs*1] -= CC[1+bs*2] * tmp;
20330 	CC[2+bs*1] -= CC[2+bs*2] * tmp;
20331 	CC[3+bs*1] -= CC[3+bs*2] * tmp;
20332 
20333 	tmp = inv_diag_E[1];
20334 	CC[0+bs*1] *= tmp;
20335 	CC[1+bs*1] *= tmp;
20336 	CC[2+bs*1] *= tmp;
20337 	CC[3+bs*1] *= tmp;
20338 	tmp = E[0+lde*1];
20339 	CC[0+bs*0] -= CC[0+bs*1] * tmp;
20340 	CC[1+bs*0] -= CC[1+bs*1] * tmp;
20341 	CC[2+bs*0] -= CC[2+bs*1] * tmp;
20342 	CC[3+bs*0] -= CC[3+bs*1] * tmp;
20343 
20344 	tmp = inv_diag_E[0];
20345 	CC[0+bs*0] *= tmp;
20346 	CC[1+bs*0] *= tmp;
20347 	CC[2+bs*0] *= tmp;
20348 	CC[3+bs*0] *= tmp;
20349 
20350 
20351 	D[0+ldd*0] = CC[0+bs*0];
20352 	D[1+ldd*0] = CC[1+bs*0];
20353 	D[2+ldd*0] = CC[2+bs*0];
20354 	D[3+ldd*0] = CC[3+bs*0];
20355 
20356 	D[0+ldd*1] = CC[0+bs*1];
20357 	D[1+ldd*1] = CC[1+bs*1];
20358 	D[2+ldd*1] = CC[2+bs*1];
20359 	D[3+ldd*1] = CC[3+bs*1];
20360 
20361 	D[0+ldd*2] = CC[0+bs*2];
20362 	D[1+ldd*2] = CC[1+bs*2];
20363 	D[2+ldd*2] = CC[2+bs*2];
20364 	D[3+ldd*2] = CC[3+bs*2];
20365 
20366 	D[0+ldd*3] = CC[0+bs*3];
20367 	D[1+ldd*3] = CC[1+bs*3];
20368 	D[2+ldd*3] = CC[2+bs*3];
20369 	D[3+ldd*3] = CC[3+bs*3];
20370 
20371 	return;
20372 
20373 	}
20374 #endif
20375 
20376 
20377 
20378 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
20379 void kernel_dtrsm_nt_ru_inv_4x4_vs_lib4cccc(int kmax, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, double *inv_diag_E, int m1, int n1)
20380 	{
20381 
20382 	const int bs = 4;
20383 
20384 	double tmp;
20385 
20386 #if defined(TARGET_GENERIC)
20387 	double CC[16] = {0};
20388 #else
20389 	ALIGNED( double CC[16], 64 ) = {0};
20390 #endif
20391 
20392 	double alpha1 = -1.0;
20393 
20394 	kernel_dgemm_nt_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, ldc, CC, bs);
20395 
20396 	if(n1<=3)
20397 		goto n3;
20398 
20399 	tmp = inv_diag_E[3];
20400 	CC[0+bs*3] *= tmp;
20401 	CC[1+bs*3] *= tmp;
20402 	CC[2+bs*3] *= tmp;
20403 	CC[3+bs*3] *= tmp;
20404 	tmp = E[0+lde*3];
20405 	CC[0+bs*0] -= CC[0+bs*3] * tmp;
20406 	CC[1+bs*0] -= CC[1+bs*3] * tmp;
20407 	CC[2+bs*0] -= CC[2+bs*3] * tmp;
20408 	CC[3+bs*0] -= CC[3+bs*3] * tmp;
20409 	tmp = E[1+lde*3];
20410 	CC[0+bs*1] -= CC[0+bs*3] * tmp;
20411 	CC[1+bs*1] -= CC[1+bs*3] * tmp;
20412 	CC[2+bs*1] -= CC[2+bs*3] * tmp;
20413 	CC[3+bs*1] -= CC[3+bs*3] * tmp;
20414 	tmp = E[2+lde*3];
20415 	CC[0+bs*2] -= CC[0+bs*3] * tmp;
20416 	CC[1+bs*2] -= CC[1+bs*3] * tmp;
20417 	CC[2+bs*2] -= CC[2+bs*3] * tmp;
20418 	CC[3+bs*2] -= CC[3+bs*3] * tmp;
20419 
20420 n3:
20421 	if(n1<=2)
20422 		goto n2;
20423 
20424 	tmp = inv_diag_E[2];
20425 	CC[0+bs*2] *= tmp;
20426 	CC[1+bs*2] *= tmp;
20427 	CC[2+bs*2] *= tmp;
20428 	CC[3+bs*2] *= tmp;
20429 	tmp = E[0+lde*2];
20430 	CC[0+bs*0] -= CC[0+bs*2] * tmp;
20431 	CC[1+bs*0] -= CC[1+bs*2] * tmp;
20432 	CC[2+bs*0] -= CC[2+bs*2] * tmp;
20433 	CC[3+bs*0] -= CC[3+bs*2] * tmp;
20434 	tmp = E[1+lde*2];
20435 	CC[0+bs*1] -= CC[0+bs*2] * tmp;
20436 	CC[1+bs*1] -= CC[1+bs*2] * tmp;
20437 	CC[2+bs*1] -= CC[2+bs*2] * tmp;
20438 	CC[3+bs*1] -= CC[3+bs*2] * tmp;
20439 
20440 n2:
20441 	if(n1<=1)
20442 		goto n1;
20443 
20444 	tmp = inv_diag_E[1];
20445 	CC[0+bs*1] *= tmp;
20446 	CC[1+bs*1] *= tmp;
20447 	CC[2+bs*1] *= tmp;
20448 	CC[3+bs*1] *= tmp;
20449 	tmp = E[0+lde*1];
20450 	CC[0+bs*0] -= CC[0+bs*1] * tmp;
20451 	CC[1+bs*0] -= CC[1+bs*1] * tmp;
20452 	CC[2+bs*0] -= CC[2+bs*1] * tmp;
20453 	CC[3+bs*0] -= CC[3+bs*1] * tmp;
20454 
20455 n1:
20456 
20457 	tmp = inv_diag_E[0];
20458 	CC[0+bs*0] *= tmp;
20459 	CC[1+bs*0] *= tmp;
20460 	CC[2+bs*0] *= tmp;
20461 	CC[3+bs*0] *= tmp;
20462 
20463 	store:
20464 
20465 	if(m1>=4)
20466 		{
20467 		D[0+ldd*0] = CC[0+bs*0];
20468 		D[1+ldd*0] = CC[1+bs*0];
20469 		D[2+ldd*0] = CC[2+bs*0];
20470 		D[3+ldd*0] = CC[3+bs*0];
20471 
20472 		if(n1==1)
20473 			return;
20474 
20475 		D[0+ldd*1] = CC[0+bs*1];
20476 		D[1+ldd*1] = CC[1+bs*1];
20477 		D[2+ldd*1] = CC[2+bs*1];
20478 		D[3+ldd*1] = CC[3+bs*1];
20479 
20480 		if(n1==2)
20481 			return;
20482 
20483 		D[0+ldd*2] = CC[0+bs*2];
20484 		D[1+ldd*2] = CC[1+bs*2];
20485 		D[2+ldd*2] = CC[2+bs*2];
20486 		D[3+ldd*2] = CC[3+bs*2];
20487 
20488 		if(n1==3)
20489 			return;
20490 
20491 		D[0+ldd*3] = CC[0+bs*3];
20492 		D[1+ldd*3] = CC[1+bs*3];
20493 		D[2+ldd*3] = CC[2+bs*3];
20494 		D[3+ldd*3] = CC[3+bs*3];
20495 		}
20496 	else if(m1>=3)
20497 		{
20498 		D[0+ldd*0] = CC[0+bs*0];
20499 		D[1+ldd*0] = CC[1+bs*0];
20500 		D[2+ldd*0] = CC[2+bs*0];
20501 
20502 		if(n1==1)
20503 			return;
20504 
20505 		D[0+ldd*1] = CC[0+bs*1];
20506 		D[1+ldd*1] = CC[1+bs*1];
20507 		D[2+ldd*1] = CC[2+bs*1];
20508 
20509 		if(n1==2)
20510 			return;
20511 
20512 		D[0+ldd*2] = CC[0+bs*2];
20513 		D[1+ldd*2] = CC[1+bs*2];
20514 		D[2+ldd*2] = CC[2+bs*2];
20515 
20516 		if(n1==3)
20517 			return;
20518 
20519 		D[0+ldd*3] = CC[0+bs*3];
20520 		D[1+ldd*3] = CC[1+bs*3];
20521 		D[2+ldd*3] = CC[2+bs*3];
20522 		}
20523 	else if(m1>=2)
20524 		{
20525 		D[0+ldd*0] = CC[0+bs*0];
20526 		D[1+ldd*0] = CC[1+bs*0];
20527 
20528 		if(n1==1)
20529 			return;
20530 
20531 		D[0+ldd*1] = CC[0+bs*1];
20532 		D[1+ldd*1] = CC[1+bs*1];
20533 
20534 		if(n1==2)
20535 			return;
20536 
20537 		D[0+ldd*2] = CC[0+bs*2];
20538 		D[1+ldd*2] = CC[1+bs*2];
20539 
20540 		if(n1==3)
20541 			return;
20542 
20543 		D[0+ldd*3] = CC[0+bs*3];
20544 		D[1+ldd*3] = CC[1+bs*3];
20545 		}
20546 	else //if(m1>=1)
20547 		{
20548 		D[0+ldd*0] = CC[0+bs*0];
20549 
20550 		if(n1==1)
20551 			return;
20552 
20553 		D[0+ldd*1] = CC[0+bs*1];
20554 
20555 		if(n1==2)
20556 			return;
20557 
20558 		D[0+ldd*2] = CC[0+bs*2];
20559 
20560 		if(n1==3)
20561 			return;
20562 
20563 		D[0+ldd*3] = CC[0+bs*3];
20564 		}
20565 
20566 	return;
20567 
20568 	}
20569 #endif
20570 
20571 
20572 
20573 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
20574 void kernel_dtrsm_nt_ru_one_4x4_lib44cc4(int kmax, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, double *E)
20575 	{
20576 
20577 	const int bs = 4;
20578 
20579 	double tmp;
20580 
20581 #if defined(TARGET_GENERIC)
20582 	double CC[16] = {0};
20583 #else
20584 	ALIGNED( double CC[16], 64 ) = {0};
20585 #endif
20586 
20587 	double alpha1 = -1.0;
20588 
20589 	CC[0+bs*0] = C[0+ldc*0];
20590 	CC[1+bs*0] = C[1+ldc*0];
20591 	CC[2+bs*0] = C[2+ldc*0];
20592 	CC[3+bs*0] = C[3+ldc*0];
20593 
20594 	CC[0+bs*1] = C[0+ldc*1];
20595 	CC[1+bs*1] = C[1+ldc*1];
20596 	CC[2+bs*1] = C[2+ldc*1];
20597 	CC[3+bs*1] = C[3+ldc*1];
20598 
20599 	CC[0+bs*2] = C[0+ldc*2];
20600 	CC[1+bs*2] = C[1+ldc*2];
20601 	CC[2+bs*2] = C[2+ldc*2];
20602 	CC[3+bs*2] = C[3+ldc*2];
20603 
20604 	CC[0+bs*3] = C[0+ldc*3];
20605 	CC[1+bs*3] = C[1+ldc*3];
20606 	CC[2+bs*3] = C[2+ldc*3];
20607 	CC[3+bs*3] = C[3+ldc*3];
20608 
20609 	kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, CC, CC);
20610 
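	// no diagonal scaling here: the _one kernels treat the triangular factor E as having a
	// unit diagonal, so only the off-diagonal eliminations of the backward substitution remain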
20611 	tmp = E[0+bs*3];
20612 	CC[0+bs*0] -= CC[0+bs*3] * tmp;
20613 	CC[1+bs*0] -= CC[1+bs*3] * tmp;
20614 	CC[2+bs*0] -= CC[2+bs*3] * tmp;
20615 	CC[3+bs*0] -= CC[3+bs*3] * tmp;
20616 	tmp = E[1+bs*3];
20617 	CC[0+bs*1] -= CC[0+bs*3] * tmp;
20618 	CC[1+bs*1] -= CC[1+bs*3] * tmp;
20619 	CC[2+bs*1] -= CC[2+bs*3] * tmp;
20620 	CC[3+bs*1] -= CC[3+bs*3] * tmp;
20621 	tmp = E[2+bs*3];
20622 	CC[0+bs*2] -= CC[0+bs*3] * tmp;
20623 	CC[1+bs*2] -= CC[1+bs*3] * tmp;
20624 	CC[2+bs*2] -= CC[2+bs*3] * tmp;
20625 	CC[3+bs*2] -= CC[3+bs*3] * tmp;
20626 
20627 	tmp = E[0+bs*2];
20628 	CC[0+bs*0] -= CC[0+bs*2] * tmp;
20629 	CC[1+bs*0] -= CC[1+bs*2] * tmp;
20630 	CC[2+bs*0] -= CC[2+bs*2] * tmp;
20631 	CC[3+bs*0] -= CC[3+bs*2] * tmp;
20632 	tmp = E[1+bs*2];
20633 	CC[0+bs*1] -= CC[0+bs*2] * tmp;
20634 	CC[1+bs*1] -= CC[1+bs*2] * tmp;
20635 	CC[2+bs*1] -= CC[2+bs*2] * tmp;
20636 	CC[3+bs*1] -= CC[3+bs*2] * tmp;
20637 
20638 	tmp = E[0+bs*1];
20639 	CC[0+bs*0] -= CC[0+bs*1] * tmp;
20640 	CC[1+bs*0] -= CC[1+bs*1] * tmp;
20641 	CC[2+bs*0] -= CC[2+bs*1] * tmp;
20642 	CC[3+bs*0] -= CC[3+bs*1] * tmp;
20643 
20644 
20645 	D[0+ldd*0] = CC[0+bs*0];
20646 	D[1+ldd*0] = CC[1+bs*0];
20647 	D[2+ldd*0] = CC[2+bs*0];
20648 	D[3+ldd*0] = CC[3+bs*0];
20649 
20650 	D[0+ldd*1] = CC[0+bs*1];
20651 	D[1+ldd*1] = CC[1+bs*1];
20652 	D[2+ldd*1] = CC[2+bs*1];
20653 	D[3+ldd*1] = CC[3+bs*1];
20654 
20655 	D[0+ldd*2] = CC[0+bs*2];
20656 	D[1+ldd*2] = CC[1+bs*2];
20657 	D[2+ldd*2] = CC[2+bs*2];
20658 	D[3+ldd*2] = CC[3+bs*2];
20659 
20660 	D[0+ldd*3] = CC[0+bs*3];
20661 	D[1+ldd*3] = CC[1+bs*3];
20662 	D[2+ldd*3] = CC[2+bs*3];
20663 	D[3+ldd*3] = CC[3+bs*3];
20664 
20665 	return;
20666 
20667 	}
20668 #endif
20669 
20670 
20671 
20672 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
20673 void kernel_dtrsm_nt_ru_one_4x4_vs_lib44cc4(int kmax, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, double *E, int m1, int n1)
20674 	{
20675 
20676 	const int bs = 4;
20677 
20678 	double tmp;
20679 
20680 #if defined(TARGET_GENERIC)
20681 	double CC[16] = {0};
20682 #else
20683 	ALIGNED( double CC[16], 64 ) = {0};
20684 #endif
20685 
20686 	double alpha1 = -1.0;
20687 
20688 	CC[0+bs*0] = C[0+ldc*0];
20689 	CC[1+bs*0] = C[1+ldc*0];
20690 	CC[2+bs*0] = C[2+ldc*0];
20691 	CC[3+bs*0] = C[3+ldc*0];
20692 
20693 	CC[0+bs*1] = C[0+ldc*1];
20694 	CC[1+bs*1] = C[1+ldc*1];
20695 	CC[2+bs*1] = C[2+ldc*1];
20696 	CC[3+bs*1] = C[3+ldc*1];
20697 
20698 	CC[0+bs*2] = C[0+ldc*2];
20699 	CC[1+bs*2] = C[1+ldc*2];
20700 	CC[2+bs*2] = C[2+ldc*2];
20701 	CC[3+bs*2] = C[3+ldc*2];
20702 
20703 	CC[0+bs*3] = C[0+ldc*3];
20704 	CC[1+bs*3] = C[1+ldc*3];
20705 	CC[2+bs*3] = C[2+ldc*3];
20706 	CC[3+bs*3] = C[3+ldc*3];
20707 
20708 	kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, CC, CC);
20709 
20710 	if(n1<=3)
20711 		goto n3;
20712 
20713 	tmp = E[0+bs*3];
20714 	CC[0+bs*0] -= CC[0+bs*3] * tmp;
20715 	CC[1+bs*0] -= CC[1+bs*3] * tmp;
20716 	CC[2+bs*0] -= CC[2+bs*3] * tmp;
20717 	CC[3+bs*0] -= CC[3+bs*3] * tmp;
20718 	tmp = E[1+bs*3];
20719 	CC[0+bs*1] -= CC[0+bs*3] * tmp;
20720 	CC[1+bs*1] -= CC[1+bs*3] * tmp;
20721 	CC[2+bs*1] -= CC[2+bs*3] * tmp;
20722 	CC[3+bs*1] -= CC[3+bs*3] * tmp;
20723 	tmp = E[2+bs*3];
20724 	CC[0+bs*2] -= CC[0+bs*3] * tmp;
20725 	CC[1+bs*2] -= CC[1+bs*3] * tmp;
20726 	CC[2+bs*2] -= CC[2+bs*3] * tmp;
20727 	CC[3+bs*2] -= CC[3+bs*3] * tmp;
20728 
20729 n3:
20730 	if(n1<=2)
20731 		goto n2;
20732 
20733 	tmp = E[0+bs*2];
20734 	CC[0+bs*0] -= CC[0+bs*2] * tmp;
20735 	CC[1+bs*0] -= CC[1+bs*2] * tmp;
20736 	CC[2+bs*0] -= CC[2+bs*2] * tmp;
20737 	CC[3+bs*0] -= CC[3+bs*2] * tmp;
20738 	tmp = E[1+bs*2];
20739 	CC[0+bs*1] -= CC[0+bs*2] * tmp;
20740 	CC[1+bs*1] -= CC[1+bs*2] * tmp;
20741 	CC[2+bs*1] -= CC[2+bs*2] * tmp;
20742 	CC[3+bs*1] -= CC[3+bs*2] * tmp;
20743 
20744 n2:
20745 	if(n1<=1)
20746 		goto n1;
20747 
20748 	tmp = E[0+bs*1];
20749 	CC[0+bs*0] -= CC[0+bs*1] * tmp;
20750 	CC[1+bs*0] -= CC[1+bs*1] * tmp;
20751 	CC[2+bs*0] -= CC[2+bs*1] * tmp;
20752 	CC[3+bs*0] -= CC[3+bs*1] * tmp;
20753 
20754 n1:
20755 
20756 	store:
20757 
20758 	if(m1>=4)
20759 		{
20760 		D[0+ldd*0] = CC[0+bs*0];
20761 		D[1+ldd*0] = CC[1+bs*0];
20762 		D[2+ldd*0] = CC[2+bs*0];
20763 		D[3+ldd*0] = CC[3+bs*0];
20764 
20765 		if(n1==1)
20766 			return;
20767 
20768 		D[0+ldd*1] = CC[0+bs*1];
20769 		D[1+ldd*1] = CC[1+bs*1];
20770 		D[2+ldd*1] = CC[2+bs*1];
20771 		D[3+ldd*1] = CC[3+bs*1];
20772 
20773 		if(n1==2)
20774 			return;
20775 
20776 		D[0+ldd*2] = CC[0+bs*2];
20777 		D[1+ldd*2] = CC[1+bs*2];
20778 		D[2+ldd*2] = CC[2+bs*2];
20779 		D[3+ldd*2] = CC[3+bs*2];
20780 
20781 		if(n1==3)
20782 			return;
20783 
20784 		D[0+ldd*3] = CC[0+bs*3];
20785 		D[1+ldd*3] = CC[1+bs*3];
20786 		D[2+ldd*3] = CC[2+bs*3];
20787 		D[3+ldd*3] = CC[3+bs*3];
20788 		}
20789 	else if(m1>=3)
20790 		{
20791 		D[0+ldd*0] = CC[0+bs*0];
20792 		D[1+ldd*0] = CC[1+bs*0];
20793 		D[2+ldd*0] = CC[2+bs*0];
20794 
20795 		if(n1==1)
20796 			return;
20797 
20798 		D[0+ldd*1] = CC[0+bs*1];
20799 		D[1+ldd*1] = CC[1+bs*1];
20800 		D[2+ldd*1] = CC[2+bs*1];
20801 
20802 		if(n1==2)
20803 			return;
20804 
20805 		D[0+ldd*2] = CC[0+bs*2];
20806 		D[1+ldd*2] = CC[1+bs*2];
20807 		D[2+ldd*2] = CC[2+bs*2];
20808 
20809 		if(n1==3)
20810 			return;
20811 
20812 		D[0+ldd*3] = CC[0+bs*3];
20813 		D[1+ldd*3] = CC[1+bs*3];
20814 		D[2+ldd*3] = CC[2+bs*3];
20815 		}
20816 	else if(m1>=2)
20817 		{
20818 		D[0+ldd*0] = CC[0+bs*0];
20819 		D[1+ldd*0] = CC[1+bs*0];
20820 
20821 		if(n1==1)
20822 			return;
20823 
20824 		D[0+ldd*1] = CC[0+bs*1];
20825 		D[1+ldd*1] = CC[1+bs*1];
20826 
20827 		if(n1==2)
20828 			return;
20829 
20830 		D[0+ldd*2] = CC[0+bs*2];
20831 		D[1+ldd*2] = CC[1+bs*2];
20832 
20833 		if(n1==3)
20834 			return;
20835 
20836 		D[0+ldd*3] = CC[0+bs*3];
20837 		D[1+ldd*3] = CC[1+bs*3];
20838 		}
20839 	else //if(m1>=1)
20840 		{
20841 		D[0+ldd*0] = CC[0+bs*0];
20842 
20843 		if(n1==1)
20844 			return;
20845 
20846 		D[0+ldd*1] = CC[0+bs*1];
20847 
20848 		if(n1==2)
20849 			return;
20850 
20851 		D[0+ldd*2] = CC[0+bs*2];
20852 
20853 		if(n1==3)
20854 			return;
20855 
20856 		D[0+ldd*3] = CC[0+bs*3];
20857 		}
20858 
20859 	return;
20860 
20861 	}
20862 #endif
20863 
20864 
20865 
20866 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
20867 void kernel_dtrsm_nt_ru_one_4x4_lib4c44c(int kmax, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde)
20868 	{
20869 
20870 	const int bs = 4;
20871 
20872 	double tmp;
20873 
20874 #if defined(TARGET_GENERIC)
20875 	double CC[16] = {0};
20876 #else
20877 	ALIGNED( double CC[16], 64 ) = {0};
20878 #endif
20879 
20880 	double alpha1 = -1.0;
20881 
20882 	kernel_dgemm_nt_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, bs, CC, bs);
20883 
20884 	tmp = E[0+lde*3];
20885 	CC[0+bs*0] -= CC[0+bs*3] * tmp;
20886 	CC[1+bs*0] -= CC[1+bs*3] * tmp;
20887 	CC[2+bs*0] -= CC[2+bs*3] * tmp;
20888 	CC[3+bs*0] -= CC[3+bs*3] * tmp;
20889 	tmp = E[1+lde*3];
20890 	CC[0+bs*1] -= CC[0+bs*3] * tmp;
20891 	CC[1+bs*1] -= CC[1+bs*3] * tmp;
20892 	CC[2+bs*1] -= CC[2+bs*3] * tmp;
20893 	CC[3+bs*1] -= CC[3+bs*3] * tmp;
20894 	tmp = E[2+lde*3];
20895 	CC[0+bs*2] -= CC[0+bs*3] * tmp;
20896 	CC[1+bs*2] -= CC[1+bs*3] * tmp;
20897 	CC[2+bs*2] -= CC[2+bs*3] * tmp;
20898 	CC[3+bs*2] -= CC[3+bs*3] * tmp;
20899 
20900 	tmp = E[0+lde*2];
20901 	CC[0+bs*0] -= CC[0+bs*2] * tmp;
20902 	CC[1+bs*0] -= CC[1+bs*2] * tmp;
20903 	CC[2+bs*0] -= CC[2+bs*2] * tmp;
20904 	CC[3+bs*0] -= CC[3+bs*2] * tmp;
20905 	tmp = E[1+lde*2];
20906 	CC[0+bs*1] -= CC[0+bs*2] * tmp;
20907 	CC[1+bs*1] -= CC[1+bs*2] * tmp;
20908 	CC[2+bs*1] -= CC[2+bs*2] * tmp;
20909 	CC[3+bs*1] -= CC[3+bs*2] * tmp;
20910 
20911 	tmp = E[0+lde*1];
20912 	CC[0+bs*0] -= CC[0+bs*1] * tmp;
20913 	CC[1+bs*0] -= CC[1+bs*1] * tmp;
20914 	CC[2+bs*0] -= CC[2+bs*1] * tmp;
20915 	CC[3+bs*0] -= CC[3+bs*1] * tmp;
20916 
20917 
20918 	D[0+bs*0] = CC[0+bs*0];
20919 	D[1+bs*0] = CC[1+bs*0];
20920 	D[2+bs*0] = CC[2+bs*0];
20921 	D[3+bs*0] = CC[3+bs*0];
20922 
20923 	D[0+bs*1] = CC[0+bs*1];
20924 	D[1+bs*1] = CC[1+bs*1];
20925 	D[2+bs*1] = CC[2+bs*1];
20926 	D[3+bs*1] = CC[3+bs*1];
20927 
20928 	D[0+bs*2] = CC[0+bs*2];
20929 	D[1+bs*2] = CC[1+bs*2];
20930 	D[2+bs*2] = CC[2+bs*2];
20931 	D[3+bs*2] = CC[3+bs*2];
20932 
20933 	D[0+bs*3] = CC[0+bs*3];
20934 	D[1+bs*3] = CC[1+bs*3];
20935 	D[2+bs*3] = CC[2+bs*3];
20936 	D[3+bs*3] = CC[3+bs*3];
20937 
20938 	return;
20939 
20940 	}
20941 #endif
20942 
20943 
20944 
20945 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
20946 void kernel_dtrsm_nt_ru_one_4x4_vs_lib4c44c(int kmax, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde, int m1, int n1)
20947 	{
20948 
20949 	const int bs = 4;
20950 
20951 	double tmp;
20952 
20953 #if defined(TARGET_GENERIC)
20954 	double CC[16] = {0};
20955 #else
20956 	ALIGNED( double CC[16], 64 ) = {0};
20957 #endif
20958 
20959 	double alpha1 = -1.0;
20960 
20961 	kernel_dgemm_nt_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, bs, CC, bs);
20962 
20963 	if(n1<=3)
20964 		goto n3;
20965 
20966 	tmp = E[0+lde*3];
20967 	CC[0+bs*0] -= CC[0+bs*3] * tmp;
20968 	CC[1+bs*0] -= CC[1+bs*3] * tmp;
20969 	CC[2+bs*0] -= CC[2+bs*3] * tmp;
20970 	CC[3+bs*0] -= CC[3+bs*3] * tmp;
20971 	tmp = E[1+lde*3];
20972 	CC[0+bs*1] -= CC[0+bs*3] * tmp;
20973 	CC[1+bs*1] -= CC[1+bs*3] * tmp;
20974 	CC[2+bs*1] -= CC[2+bs*3] * tmp;
20975 	CC[3+bs*1] -= CC[3+bs*3] * tmp;
20976 	tmp = E[2+lde*3];
20977 	CC[0+bs*2] -= CC[0+bs*3] * tmp;
20978 	CC[1+bs*2] -= CC[1+bs*3] * tmp;
20979 	CC[2+bs*2] -= CC[2+bs*3] * tmp;
20980 	CC[3+bs*2] -= CC[3+bs*3] * tmp;
20981 
20982 n3:
20983 	if(n1<=2)
20984 		goto n2;
20985 
20986 	tmp = E[0+lde*2];
20987 	CC[0+bs*0] -= CC[0+bs*2] * tmp;
20988 	CC[1+bs*0] -= CC[1+bs*2] * tmp;
20989 	CC[2+bs*0] -= CC[2+bs*2] * tmp;
20990 	CC[3+bs*0] -= CC[3+bs*2] * tmp;
20991 	tmp = E[1+lde*2];
20992 	CC[0+bs*1] -= CC[0+bs*2] * tmp;
20993 	CC[1+bs*1] -= CC[1+bs*2] * tmp;
20994 	CC[2+bs*1] -= CC[2+bs*2] * tmp;
20995 	CC[3+bs*1] -= CC[3+bs*2] * tmp;
20996 
20997 n2:
20998 	if(n1<=1)
20999 		goto n1;
21000 
21001 	tmp = E[0+lde*1];
21002 	CC[0+bs*0] -= CC[0+bs*1] * tmp;
21003 	CC[1+bs*0] -= CC[1+bs*1] * tmp;
21004 	CC[2+bs*0] -= CC[2+bs*1] * tmp;
21005 	CC[3+bs*0] -= CC[3+bs*1] * tmp;
21006 
21007 n1:
21008 
21009 	store:
21010 
21011 	if(m1>=4)
21012 		{
21013 		D[0+bs*0] = CC[0+bs*0];
21014 		D[1+bs*0] = CC[1+bs*0];
21015 		D[2+bs*0] = CC[2+bs*0];
21016 		D[3+bs*0] = CC[3+bs*0];
21017 
21018 		if(n1==1)
21019 			return;
21020 
21021 		D[0+bs*1] = CC[0+bs*1];
21022 		D[1+bs*1] = CC[1+bs*1];
21023 		D[2+bs*1] = CC[2+bs*1];
21024 		D[3+bs*1] = CC[3+bs*1];
21025 
21026 		if(n1==2)
21027 			return;
21028 
21029 		D[0+bs*2] = CC[0+bs*2];
21030 		D[1+bs*2] = CC[1+bs*2];
21031 		D[2+bs*2] = CC[2+bs*2];
21032 		D[3+bs*2] = CC[3+bs*2];
21033 
21034 		if(n1==3)
21035 			return;
21036 
21037 		D[0+bs*3] = CC[0+bs*3];
21038 		D[1+bs*3] = CC[1+bs*3];
21039 		D[2+bs*3] = CC[2+bs*3];
21040 		D[3+bs*3] = CC[3+bs*3];
21041 		}
21042 	else if(m1>=3)
21043 		{
21044 		D[0+bs*0] = CC[0+bs*0];
21045 		D[1+bs*0] = CC[1+bs*0];
21046 		D[2+bs*0] = CC[2+bs*0];
21047 
21048 		if(n1==1)
21049 			return;
21050 
21051 		D[0+bs*1] = CC[0+bs*1];
21052 		D[1+bs*1] = CC[1+bs*1];
21053 		D[2+bs*1] = CC[2+bs*1];
21054 
21055 		if(n1==2)
21056 			return;
21057 
21058 		D[0+bs*2] = CC[0+bs*2];
21059 		D[1+bs*2] = CC[1+bs*2];
21060 		D[2+bs*2] = CC[2+bs*2];
21061 
21062 		if(n1==3)
21063 			return;
21064 
21065 		D[0+bs*3] = CC[0+bs*3];
21066 		D[1+bs*3] = CC[1+bs*3];
21067 		D[2+bs*3] = CC[2+bs*3];
21068 		}
21069 	else if(m1>=2)
21070 		{
21071 		D[0+bs*0] = CC[0+bs*0];
21072 		D[1+bs*0] = CC[1+bs*0];
21073 
21074 		if(n1==1)
21075 			return;
21076 
21077 		D[0+bs*1] = CC[0+bs*1];
21078 		D[1+bs*1] = CC[1+bs*1];
21079 
21080 		if(n1==2)
21081 			return;
21082 
21083 		D[0+bs*2] = CC[0+bs*2];
21084 		D[1+bs*2] = CC[1+bs*2];
21085 
21086 		if(n1==3)
21087 			return;
21088 
21089 		D[0+bs*3] = CC[0+bs*3];
21090 		D[1+bs*3] = CC[1+bs*3];
21091 		}
21092 	else //if(m1>=1)
21093 		{
21094 		D[0+bs*0] = CC[0+bs*0];
21095 
21096 		if(n1==1)
21097 			return;
21098 
21099 		D[0+bs*1] = CC[0+bs*1];
21100 
21101 		if(n1==2)
21102 			return;
21103 
21104 		D[0+bs*2] = CC[0+bs*2];
21105 
21106 		if(n1==3)
21107 			return;
21108 
21109 		D[0+bs*3] = CC[0+bs*3];
21110 		}
21111 
21112 	return;
21113 
21114 	}
21115 #endif
21116 
21117 
21118 
21119 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
21120 void kernel_dtrsm_nt_ru_one_4x4_lib4cccc(int kmax, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde)
21121 	{
21122 
21123 	const int bs = 4;
21124 
21125 	double tmp;
21126 
21127 #if defined(TARGET_GENERIC)
21128 	double CC[16] = {0};
21129 #else
21130 	ALIGNED( double CC[16], 64 ) = {0};
21131 #endif
21132 
21133 	double alpha1 = -1.0;
21134 
21135 	kernel_dgemm_nt_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, ldc, CC, bs);
21136 
21137 	tmp = E[0+lde*3];
21138 	CC[0+bs*0] -= CC[0+bs*3] * tmp;
21139 	CC[1+bs*0] -= CC[1+bs*3] * tmp;
21140 	CC[2+bs*0] -= CC[2+bs*3] * tmp;
21141 	CC[3+bs*0] -= CC[3+bs*3] * tmp;
21142 	tmp = E[1+lde*3];
21143 	CC[0+bs*1] -= CC[0+bs*3] * tmp;
21144 	CC[1+bs*1] -= CC[1+bs*3] * tmp;
21145 	CC[2+bs*1] -= CC[2+bs*3] * tmp;
21146 	CC[3+bs*1] -= CC[3+bs*3] * tmp;
21147 	tmp = E[2+lde*3];
21148 	CC[0+bs*2] -= CC[0+bs*3] * tmp;
21149 	CC[1+bs*2] -= CC[1+bs*3] * tmp;
21150 	CC[2+bs*2] -= CC[2+bs*3] * tmp;
21151 	CC[3+bs*2] -= CC[3+bs*3] * tmp;
21152 
21153 	tmp = E[0+lde*2];
21154 	CC[0+bs*0] -= CC[0+bs*2] * tmp;
21155 	CC[1+bs*0] -= CC[1+bs*2] * tmp;
21156 	CC[2+bs*0] -= CC[2+bs*2] * tmp;
21157 	CC[3+bs*0] -= CC[3+bs*2] * tmp;
21158 	tmp = E[1+lde*2];
21159 	CC[0+bs*1] -= CC[0+bs*2] * tmp;
21160 	CC[1+bs*1] -= CC[1+bs*2] * tmp;
21161 	CC[2+bs*1] -= CC[2+bs*2] * tmp;
21162 	CC[3+bs*1] -= CC[3+bs*2] * tmp;
21163 
21164 	tmp = E[0+lde*1];
21165 	CC[0+bs*0] -= CC[0+bs*1] * tmp;
21166 	CC[1+bs*0] -= CC[1+bs*1] * tmp;
21167 	CC[2+bs*0] -= CC[2+bs*1] * tmp;
21168 	CC[3+bs*0] -= CC[3+bs*1] * tmp;
21169 
21170 
21171 	D[0+ldd*0] = CC[0+bs*0];
21172 	D[1+ldd*0] = CC[1+bs*0];
21173 	D[2+ldd*0] = CC[2+bs*0];
21174 	D[3+ldd*0] = CC[3+bs*0];
21175 
21176 	D[0+ldd*1] = CC[0+bs*1];
21177 	D[1+ldd*1] = CC[1+bs*1];
21178 	D[2+ldd*1] = CC[2+bs*1];
21179 	D[3+ldd*1] = CC[3+bs*1];
21180 
21181 	D[0+ldd*2] = CC[0+bs*2];
21182 	D[1+ldd*2] = CC[1+bs*2];
21183 	D[2+ldd*2] = CC[2+bs*2];
21184 	D[3+ldd*2] = CC[3+bs*2];
21185 
21186 	D[0+ldd*3] = CC[0+bs*3];
21187 	D[1+ldd*3] = CC[1+bs*3];
21188 	D[2+ldd*3] = CC[2+bs*3];
21189 	D[3+ldd*3] = CC[3+bs*3];
21190 
21191 	return;
21192 
21193 	}
21194 #endif
21195 
21196 
21197 
21198 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
21199 void kernel_dtrsm_nt_ru_one_4x4_vs_lib4cccc(int kmax, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, int m1, int n1)
21200 	{
21201 
21202 	const int bs = 4;
21203 
21204 	double tmp;
21205 
21206 #if defined(TARGET_GENERIC)
21207 	double CC[16] = {0};
21208 #else
21209 	ALIGNED( double CC[16], 64 ) = {0};
21210 #endif
21211 
21212 	double alpha1 = -1.0;
21213 
21214 	kernel_dgemm_nt_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, ldc, CC, bs);
21215 
21216 	if(n1<=3)
21217 		goto n3;
21218 
21219 	tmp = E[0+lde*3];
21220 	CC[0+bs*0] -= CC[0+bs*3] * tmp;
21221 	CC[1+bs*0] -= CC[1+bs*3] * tmp;
21222 	CC[2+bs*0] -= CC[2+bs*3] * tmp;
21223 	CC[3+bs*0] -= CC[3+bs*3] * tmp;
21224 	tmp = E[1+lde*3];
21225 	CC[0+bs*1] -= CC[0+bs*3] * tmp;
21226 	CC[1+bs*1] -= CC[1+bs*3] * tmp;
21227 	CC[2+bs*1] -= CC[2+bs*3] * tmp;
21228 	CC[3+bs*1] -= CC[3+bs*3] * tmp;
21229 	tmp = E[2+lde*3];
21230 	CC[0+bs*2] -= CC[0+bs*3] * tmp;
21231 	CC[1+bs*2] -= CC[1+bs*3] * tmp;
21232 	CC[2+bs*2] -= CC[2+bs*3] * tmp;
21233 	CC[3+bs*2] -= CC[3+bs*3] * tmp;
21234 
21235 n3:
21236 	if(n1<=2)
21237 		goto n2;
21238 
21239 	tmp = E[0+lde*2];
21240 	CC[0+bs*0] -= CC[0+bs*2] * tmp;
21241 	CC[1+bs*0] -= CC[1+bs*2] * tmp;
21242 	CC[2+bs*0] -= CC[2+bs*2] * tmp;
21243 	CC[3+bs*0] -= CC[3+bs*2] * tmp;
21244 	tmp = E[1+lde*2];
21245 	CC[0+bs*1] -= CC[0+bs*2] * tmp;
21246 	CC[1+bs*1] -= CC[1+bs*2] * tmp;
21247 	CC[2+bs*1] -= CC[2+bs*2] * tmp;
21248 	CC[3+bs*1] -= CC[3+bs*2] * tmp;
21249 
21250 n2:
21251 	if(n1<=1)
21252 		goto n1;
21253 
21254 	tmp = E[0+lde*1];
21255 	CC[0+bs*0] -= CC[0+bs*1] * tmp;
21256 	CC[1+bs*0] -= CC[1+bs*1] * tmp;
21257 	CC[2+bs*0] -= CC[2+bs*1] * tmp;
21258 	CC[3+bs*0] -= CC[3+bs*1] * tmp;
21259 
21260 n1:
21261 
21262 	store:
21263 
21264 	if(m1>=4)
21265 		{
21266 		D[0+ldd*0] = CC[0+bs*0];
21267 		D[1+ldd*0] = CC[1+bs*0];
21268 		D[2+ldd*0] = CC[2+bs*0];
21269 		D[3+ldd*0] = CC[3+bs*0];
21270 
21271 		if(n1==1)
21272 			return;
21273 
21274 		D[0+ldd*1] = CC[0+bs*1];
21275 		D[1+ldd*1] = CC[1+bs*1];
21276 		D[2+ldd*1] = CC[2+bs*1];
21277 		D[3+ldd*1] = CC[3+bs*1];
21278 
21279 		if(n1==2)
21280 			return;
21281 
21282 		D[0+ldd*2] = CC[0+bs*2];
21283 		D[1+ldd*2] = CC[1+bs*2];
21284 		D[2+ldd*2] = CC[2+bs*2];
21285 		D[3+ldd*2] = CC[3+bs*2];
21286 
21287 		if(n1==3)
21288 			return;
21289 
21290 		D[0+ldd*3] = CC[0+bs*3];
21291 		D[1+ldd*3] = CC[1+bs*3];
21292 		D[2+ldd*3] = CC[2+bs*3];
21293 		D[3+ldd*3] = CC[3+bs*3];
21294 		}
21295 	else if(m1>=3)
21296 		{
21297 		D[0+ldd*0] = CC[0+bs*0];
21298 		D[1+ldd*0] = CC[1+bs*0];
21299 		D[2+ldd*0] = CC[2+bs*0];
21300 
21301 		if(n1==1)
21302 			return;
21303 
21304 		D[0+ldd*1] = CC[0+bs*1];
21305 		D[1+ldd*1] = CC[1+bs*1];
21306 		D[2+ldd*1] = CC[2+bs*1];
21307 
21308 		if(n1==2)
21309 			return;
21310 
21311 		D[0+ldd*2] = CC[0+bs*2];
21312 		D[1+ldd*2] = CC[1+bs*2];
21313 		D[2+ldd*2] = CC[2+bs*2];
21314 
21315 		if(n1==3)
21316 			return;
21317 
21318 		D[0+ldd*3] = CC[0+bs*3];
21319 		D[1+ldd*3] = CC[1+bs*3];
21320 		D[2+ldd*3] = CC[2+bs*3];
21321 		}
21322 	else if(m1>=2)
21323 		{
21324 		D[0+ldd*0] = CC[0+bs*0];
21325 		D[1+ldd*0] = CC[1+bs*0];
21326 
21327 		if(n1==1)
21328 			return;
21329 
21330 		D[0+ldd*1] = CC[0+bs*1];
21331 		D[1+ldd*1] = CC[1+bs*1];
21332 
21333 		if(n1==2)
21334 			return;
21335 
21336 		D[0+ldd*2] = CC[0+bs*2];
21337 		D[1+ldd*2] = CC[1+bs*2];
21338 
21339 		if(n1==3)
21340 			return;
21341 
21342 		D[0+ldd*3] = CC[0+bs*3];
21343 		D[1+ldd*3] = CC[1+bs*3];
21344 		}
21345 	else //if(m1>=1)
21346 		{
21347 		D[0+ldd*0] = CC[0+bs*0];
21348 
21349 		if(n1==1)
21350 			return;
21351 
21352 		D[0+ldd*1] = CC[0+bs*1];
21353 
21354 		if(n1==2)
21355 			return;
21356 
21357 		D[0+ldd*2] = CC[0+bs*2];
21358 
21359 		if(n1==3)
21360 			return;
21361 
21362 		D[0+ldd*3] = CC[0+bs*3];
21363 		}
21364 
21365 	return;
21366 
21367 	}
21368 #endif
21369 
21370 
21371 
21372 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9)
21373 void kernel_dtrsm_nn_ll_one_4x4_lib4cccc(int kmax, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde)
21374 	{
21375 
21376 	const int bs = 4;
21377 
21378 	double tmp,
21379 		e_1, e_2, e_3;
21380 
21381 #if defined(TARGET_GENERIC)
21382 	double CC[16] = {0};
21383 #else
21384 	ALIGNED( double CC[16], 64 ) = {0};
21385 #endif
21386 
21387 	double alpha1 = -1.0;
21388 
21389 	kernel_dgemm_nn_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, ldc, CC, bs);
21390 
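	// forward substitution with the unit-diagonal lower triangular factor E: row 0 is
	// eliminated from rows 1..3, then row 1 from rows 2..3, then row 2 from row 3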
21391 	e_1 = E[1+lde*0];
21392 	e_2 = E[2+lde*0];
21393 	e_3 = E[3+lde*0];
21394 	CC[1+bs*0] -= e_1 * CC[0+bs*0];
21395 	CC[2+bs*0] -= e_2 * CC[0+bs*0];
21396 	CC[3+bs*0] -= e_3 * CC[0+bs*0];
21397 	CC[1+bs*1] -= e_1 * CC[0+bs*1];
21398 	CC[2+bs*1] -= e_2 * CC[0+bs*1];
21399 	CC[3+bs*1] -= e_3 * CC[0+bs*1];
21400 	CC[1+bs*2] -= e_1 * CC[0+bs*2];
21401 	CC[2+bs*2] -= e_2 * CC[0+bs*2];
21402 	CC[3+bs*2] -= e_3 * CC[0+bs*2];
21403 	CC[1+bs*3] -= e_1 * CC[0+bs*3];
21404 	CC[2+bs*3] -= e_2 * CC[0+bs*3];
21405 	CC[3+bs*3] -= e_3 * CC[0+bs*3];
21406 
21407 	e_2 = E[2+lde*1];
21408 	e_3 = E[3+lde*1];
21409 	CC[2+bs*0] -= e_2 * CC[1+bs*0];
21410 	CC[3+bs*0] -= e_3 * CC[1+bs*0];
21411 	CC[2+bs*1] -= e_2 * CC[1+bs*1];
21412 	CC[3+bs*1] -= e_3 * CC[1+bs*1];
21413 	CC[2+bs*2] -= e_2 * CC[1+bs*2];
21414 	CC[3+bs*2] -= e_3 * CC[1+bs*2];
21415 	CC[2+bs*3] -= e_2 * CC[1+bs*3];
21416 	CC[3+bs*3] -= e_3 * CC[1+bs*3];
21417 
21418 	e_3 = E[3+lde*2];
21419 	CC[3+bs*0] -= e_3 * CC[2+bs*0];
21420 	CC[3+bs*1] -= e_3 * CC[2+bs*1];
21421 	CC[3+bs*2] -= e_3 * CC[2+bs*2];
21422 	CC[3+bs*3] -= e_3 * CC[2+bs*3];
21423 
21424 
21425 	D[0+ldd*0] = CC[0+bs*0];
21426 	D[1+ldd*0] = CC[1+bs*0];
21427 	D[2+ldd*0] = CC[2+bs*0];
21428 	D[3+ldd*0] = CC[3+bs*0];
21429 
21430 	D[0+ldd*1] = CC[0+bs*1];
21431 	D[1+ldd*1] = CC[1+bs*1];
21432 	D[2+ldd*1] = CC[2+bs*1];
21433 	D[3+ldd*1] = CC[3+bs*1];
21434 
21435 	D[0+ldd*2] = CC[0+bs*2];
21436 	D[1+ldd*2] = CC[1+bs*2];
21437 	D[2+ldd*2] = CC[2+bs*2];
21438 	D[3+ldd*2] = CC[3+bs*2];
21439 
21440 	D[0+ldd*3] = CC[0+bs*3];
21441 	D[1+ldd*3] = CC[1+bs*3];
21442 	D[2+ldd*3] = CC[2+bs*3];
21443 	D[3+ldd*3] = CC[3+bs*3];
21444 
21445 	return;
21446 
21447 	}
21448 #endif
21449 
21450 
21451 
21452 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9)
21453 void kernel_dtrsm_nn_ll_one_4x4_vs_lib4cccc(int kmax, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, int m1, int n1)
21454 	{
21455 
21456 	const int bs = 4;
21457 
21458 	double tmp,
21459 		e_1, e_2, e_3;
21460 
21461 #if defined(TARGET_GENERIC)
21462 	double CC[16] = {0};
21463 #else
21464 	ALIGNED( double CC[16], 64 ) = {0};
21465 #endif
21466 
21467 	double alpha1 = -1.0;
21468 
21469 	kernel_dgemm_nn_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, beta, C, ldc, CC, bs);
21470 
21471 	e_1 = E[1+lde*0];
21472 	e_2 = E[2+lde*0];
21473 	e_3 = E[3+lde*0];
21474 	CC[1+bs*0] -= e_1 * CC[0+bs*0];
21475 	CC[2+bs*0] -= e_2 * CC[0+bs*0];
21476 	CC[3+bs*0] -= e_3 * CC[0+bs*0];
21477 	CC[1+bs*1] -= e_1 * CC[0+bs*1];
21478 	CC[2+bs*1] -= e_2 * CC[0+bs*1];
21479 	CC[3+bs*1] -= e_3 * CC[0+bs*1];
21480 	CC[1+bs*2] -= e_1 * CC[0+bs*2];
21481 	CC[2+bs*2] -= e_2 * CC[0+bs*2];
21482 	CC[3+bs*2] -= e_3 * CC[0+bs*2];
21483 	CC[1+bs*3] -= e_1 * CC[0+bs*3];
21484 	CC[2+bs*3] -= e_2 * CC[0+bs*3];
21485 	CC[3+bs*3] -= e_3 * CC[0+bs*3];
21486 
21487 	e_2 = E[2+lde*1];
21488 	e_3 = E[3+lde*1];
21489 	CC[2+bs*0] -= e_2 * CC[1+bs*0];
21490 	CC[3+bs*0] -= e_3 * CC[1+bs*0];
21491 	CC[2+bs*1] -= e_2 * CC[1+bs*1];
21492 	CC[3+bs*1] -= e_3 * CC[1+bs*1];
21493 	CC[2+bs*2] -= e_2 * CC[1+bs*2];
21494 	CC[3+bs*2] -= e_3 * CC[1+bs*2];
21495 	CC[2+bs*3] -= e_2 * CC[1+bs*3];
21496 	CC[3+bs*3] -= e_3 * CC[1+bs*3];
21497 
21498 	e_3 = E[3+lde*2];
21499 	CC[3+bs*0] -= e_3 * CC[2+bs*0];
21500 	CC[3+bs*1] -= e_3 * CC[2+bs*1];
21501 	CC[3+bs*2] -= e_3 * CC[2+bs*2];
21502 	CC[3+bs*3] -= e_3 * CC[2+bs*3];
21503 
21504 	store:
21505 
21506 	if(m1>=4)
21507 		{
21508 		D[0+ldd*0] = CC[0+bs*0];
21509 		D[1+ldd*0] = CC[1+bs*0];
21510 		D[2+ldd*0] = CC[2+bs*0];
21511 		D[3+ldd*0] = CC[3+bs*0];
21512 
21513 		if(n1==1)
21514 			return;
21515 
21516 		D[0+ldd*1] = CC[0+bs*1];
21517 		D[1+ldd*1] = CC[1+bs*1];
21518 		D[2+ldd*1] = CC[2+bs*1];
21519 		D[3+ldd*1] = CC[3+bs*1];
21520 
21521 		if(n1==2)
21522 			return;
21523 
21524 		D[0+ldd*2] = CC[0+bs*2];
21525 		D[1+ldd*2] = CC[1+bs*2];
21526 		D[2+ldd*2] = CC[2+bs*2];
21527 		D[3+ldd*2] = CC[3+bs*2];
21528 
21529 		if(n1==3)
21530 			return;
21531 
21532 		D[0+ldd*3] = CC[0+bs*3];
21533 		D[1+ldd*3] = CC[1+bs*3];
21534 		D[2+ldd*3] = CC[2+bs*3];
21535 		D[3+ldd*3] = CC[3+bs*3];
21536 		}
21537 	else if(m1>=3)
21538 		{
21539 		D[0+ldd*0] = CC[0+bs*0];
21540 		D[1+ldd*0] = CC[1+bs*0];
21541 		D[2+ldd*0] = CC[2+bs*0];
21542 
21543 		if(n1==1)
21544 			return;
21545 
21546 		D[0+ldd*1] = CC[0+bs*1];
21547 		D[1+ldd*1] = CC[1+bs*1];
21548 		D[2+ldd*1] = CC[2+bs*1];
21549 
21550 		if(n1==2)
21551 			return;
21552 
21553 		D[0+ldd*2] = CC[0+bs*2];
21554 		D[1+ldd*2] = CC[1+bs*2];
21555 		D[2+ldd*2] = CC[2+bs*2];
21556 
21557 		if(n1==3)
21558 			return;
21559 
21560 		D[0+ldd*3] = CC[0+bs*3];
21561 		D[1+ldd*3] = CC[1+bs*3];
21562 		D[2+ldd*3] = CC[2+bs*3];
21563 		}
21564 	else if(m1>=2)
21565 		{
21566 		D[0+ldd*0] = CC[0+bs*0];
21567 		D[1+ldd*0] = CC[1+bs*0];
21568 
21569 		if(n1==1)
21570 			return;
21571 
21572 		D[0+ldd*1] = CC[0+bs*1];
21573 		D[1+ldd*1] = CC[1+bs*1];
21574 
21575 		if(n1==2)
21576 			return;
21577 
21578 		D[0+ldd*2] = CC[0+bs*2];
21579 		D[1+ldd*2] = CC[1+bs*2];
21580 
21581 		if(n1==3)
21582 			return;
21583 
21584 		D[0+ldd*3] = CC[0+bs*3];
21585 		D[1+ldd*3] = CC[1+bs*3];
21586 		}
21587 	else //if(m1>=1)
21588 		{
21589 		D[0+ldd*0] = CC[0+bs*0];
21590 
21591 		if(n1==1)
21592 			return;
21593 
21594 		D[0+ldd*1] = CC[0+bs*1];
21595 
21596 		if(n1==2)
21597 			return;
21598 
21599 		D[0+ldd*2] = CC[0+bs*2];
21600 
21601 		if(n1==3)
21602 			return;
21603 
21604 		D[0+ldd*3] = CC[0+bs*3];
21605 		}
21606 
21607 	return;
21608 
21609 	}
21610 #endif
21611 
21612 
21613 
21614 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
21615 void kernel_dgetrf_nn_4x4_lib4ccc(int kmax, double *A, double *B, int ldb, double *C, int ldc, double *D, int ldd, double *inv_diag_D)
21616 	{
21617 
21618 	const int bs = 4;
21619 
21620 	int k;
21621 
21622 	double tmp;
21623 
21624 #if defined(TARGET_GENERIC)
21625 	double CC[16] = {0};
21626 #else
21627 	ALIGNED( double CC[16], 64 ) = {0};
21628 #endif
21629 
21630 	double alpha1 = -1.0;
21631 	double beta1  = 1.0;
21632 
21633 	kernel_dgemm_nn_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, &beta1, C, ldc, CC, bs);
21634 
21635 	// factorization
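	// unpivoted LU of the updated 4x4 block: each column is first updated with the already
	// computed multipliers, then its sub-diagonal entries are divided by the pivot; the
	// reciprocals of the pivots are returned in inv_diag_D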
21636 
21637 	// first column
21638 	tmp = 1.0 / CC[0+bs*0];
21639 	CC[1+bs*0] *= tmp;
21640 	CC[2+bs*0] *= tmp;
21641 	CC[3+bs*0] *= tmp;
21642 
21643 	inv_diag_D[0] = tmp;
21644 
21645 	// second column
21646 	CC[1+bs*1] -= CC[1+bs*0] * CC[0+bs*1];
21647 	CC[2+bs*1] -= CC[2+bs*0] * CC[0+bs*1];
21648 	CC[3+bs*1] -= CC[3+bs*0] * CC[0+bs*1];
21649 
21650 	tmp = 1.0 / CC[1+bs*1];
21651 	CC[2+bs*1] *= tmp;
21652 	CC[3+bs*1] *= tmp;
21653 
21654 	inv_diag_D[1] = tmp;
21655 
21656 	// third column
21657 	CC[1+bs*2] -= CC[1+bs*0] * CC[0+bs*2];
21658 	CC[2+bs*2] -= CC[2+bs*0] * CC[0+bs*2];
21659 	CC[3+bs*2] -= CC[3+bs*0] * CC[0+bs*2];
21660 
21661 	CC[2+bs*2] -= CC[2+bs*1] * CC[1+bs*2];
21662 	CC[3+bs*2] -= CC[3+bs*1] * CC[1+bs*2];
21663 
21664 	tmp = 1.0 / CC[2+bs*2];
21665 	CC[3+bs*2] *= tmp;
21666 
21667 	inv_diag_D[2] = tmp;
21668 
21669 	// fourth column
21670 	CC[1+bs*3] -= CC[1+bs*0] * CC[0+bs*3];
21671 	CC[2+bs*3] -= CC[2+bs*0] * CC[0+bs*3];
21672 	CC[3+bs*3] -= CC[3+bs*0] * CC[0+bs*3];
21673 
21674 	CC[2+bs*3] -= CC[2+bs*1] * CC[1+bs*3];
21675 	CC[3+bs*3] -= CC[3+bs*1] * CC[1+bs*3];
21676 
21677 	CC[3+bs*3] -= CC[3+bs*2] * CC[2+bs*3];
21678 
21679 	tmp = 1.0 / CC[3+bs*3];
21680 
21681 	inv_diag_D[3] = tmp;
21682 
21683 	D[0+ldd*0] = CC[0+bs*0];
21684 	D[1+ldd*0] = CC[1+bs*0];
21685 	D[2+ldd*0] = CC[2+bs*0];
21686 	D[3+ldd*0] = CC[3+bs*0];
21687 
21688 	D[0+ldd*1] = CC[0+bs*1];
21689 	D[1+ldd*1] = CC[1+bs*1];
21690 	D[2+ldd*1] = CC[2+bs*1];
21691 	D[3+ldd*1] = CC[3+bs*1];
21692 
21693 	D[0+ldd*2] = CC[0+bs*2];
21694 	D[1+ldd*2] = CC[1+bs*2];
21695 	D[2+ldd*2] = CC[2+bs*2];
21696 	D[3+ldd*2] = CC[3+bs*2];
21697 
21698 	D[0+ldd*3] = CC[0+bs*3];
21699 	D[1+ldd*3] = CC[1+bs*3];
21700 	D[2+ldd*3] = CC[2+bs*3];
21701 	D[3+ldd*3] = CC[3+bs*3];
21702 
21703 	return;
21704 
21705 	}
21706 #endif
21707 
21708 
21709 
21710 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
21711 void kernel_dgetrf_nn_4x4_vs_lib4ccc(int kmax, double *A, double *B, int ldb, double *C, int ldc, double *D, int ldd, double *inv_diag_D, int m1, int n1)
21712 	{
21713 
21714 	const int bs = 4;
21715 
21716 	int k;
21717 
21718 	double tmp;
21719 
21720 #if defined(TARGET_GENERIC)
21721 	double CC[16] = {0};
21722 #else
21723 	ALIGNED( double CC[16], 64 ) = {0};
21724 #endif
21725 
21726 	double alpha1 = -1.0;
21727 	double beta1  = 1.0;
21728 
21729 	kernel_dgemm_nn_4x4_lib4ccc(kmax, &alpha1, A, B, ldb, &beta1, C, ldc, CC, bs);
21730 
21731 	// factorization
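	// variable-size version: the factorization stops after n1 columns, and the store below
	// writes at most m1 rows and n1 columns of D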
21732 
21733 	// first column
21734 	tmp = 1.0 / CC[0+bs*0];
21735 	CC[1+bs*0] *= tmp;
21736 	CC[2+bs*0] *= tmp;
21737 	CC[3+bs*0] *= tmp;
21738 
21739 	inv_diag_D[0] = tmp;
21740 
21741 	if(n1==1)
21742 		goto store;
21743 
21744 	// second column
21745 	CC[1+bs*1] -= CC[1+bs*0] * CC[0+bs*1];
21746 	CC[2+bs*1] -= CC[2+bs*0] * CC[0+bs*1];
21747 	CC[3+bs*1] -= CC[3+bs*0] * CC[0+bs*1];
21748 
21749 	tmp = 1.0 / CC[1+bs*1];
21750 	CC[2+bs*1] *= tmp;
21751 	CC[3+bs*1] *= tmp;
21752 
21753 	inv_diag_D[1] = tmp;
21754 
21755 	if(n1==2)
21756 		goto store;
21757 
21758 	// third column
21759 	CC[1+bs*2] -= CC[1+bs*0] * CC[0+bs*2];
21760 	CC[2+bs*2] -= CC[2+bs*0] * CC[0+bs*2];
21761 	CC[3+bs*2] -= CC[3+bs*0] * CC[0+bs*2];
21762 
21763 	CC[2+bs*2] -= CC[2+bs*1] * CC[1+bs*2];
21764 	CC[3+bs*2] -= CC[3+bs*1] * CC[1+bs*2];
21765 
21766 	tmp = 1.0 / CC[2+bs*2];
21767 	CC[3+bs*2] *= tmp;
21768 
21769 	inv_diag_D[2] = tmp;
21770 
21771 	if(n1==3)
21772 		goto store;
21773 
21774 	// fourth column
21775 	CC[1+bs*3] -= CC[1+bs*0] * CC[0+bs*3];
21776 	CC[2+bs*3] -= CC[2+bs*0] * CC[0+bs*3];
21777 	CC[3+bs*3] -= CC[3+bs*0] * CC[0+bs*3];
21778 
21779 	CC[2+bs*3] -= CC[2+bs*1] * CC[1+bs*3];
21780 	CC[3+bs*3] -= CC[3+bs*1] * CC[1+bs*3];
21781 
21782 	CC[3+bs*3] -= CC[3+bs*2] * CC[2+bs*3];
21783 
21784 	tmp = 1.0 / CC[3+bs*3];
21785 
21786 	inv_diag_D[3] = tmp;
21787 
21788 	store:
21789 
21790 	if(m1>=4)
21791 		{
21792 		D[0+ldd*0] = CC[0+bs*0];
21793 		D[1+ldd*0] = CC[1+bs*0];
21794 		D[2+ldd*0] = CC[2+bs*0];
21795 		D[3+ldd*0] = CC[3+bs*0];
21796 
21797 		if(n1==1)
21798 			return;
21799 
21800 		D[0+ldd*1] = CC[0+bs*1];
21801 		D[1+ldd*1] = CC[1+bs*1];
21802 		D[2+ldd*1] = CC[2+bs*1];
21803 		D[3+ldd*1] = CC[3+bs*1];
21804 
21805 		if(n1==2)
21806 			return;
21807 
21808 		D[0+ldd*2] = CC[0+bs*2];
21809 		D[1+ldd*2] = CC[1+bs*2];
21810 		D[2+ldd*2] = CC[2+bs*2];
21811 		D[3+ldd*2] = CC[3+bs*2];
21812 
21813 		if(n1==3)
21814 			return;
21815 
21816 		D[0+ldd*3] = CC[0+bs*3];
21817 		D[1+ldd*3] = CC[1+bs*3];
21818 		D[2+ldd*3] = CC[2+bs*3];
21819 		D[3+ldd*3] = CC[3+bs*3];
21820 		}
21821 	else if(m1>=3)
21822 		{
21823 		D[0+ldd*0] = CC[0+bs*0];
21824 		D[1+ldd*0] = CC[1+bs*0];
21825 		D[2+ldd*0] = CC[2+bs*0];
21826 
21827 		if(n1==1)
21828 			return;
21829 
21830 		D[0+ldd*1] = CC[0+bs*1];
21831 		D[1+ldd*1] = CC[1+bs*1];
21832 		D[2+ldd*1] = CC[2+bs*1];
21833 
21834 		if(n1==2)
21835 			return;
21836 
21837 		D[0+ldd*2] = CC[0+bs*2];
21838 		D[1+ldd*2] = CC[1+bs*2];
21839 		D[2+ldd*2] = CC[2+bs*2];
21840 
21841 		if(n1==3)
21842 			return;
21843 
21844 		D[0+ldd*3] = CC[0+bs*3];
21845 		D[1+ldd*3] = CC[1+bs*3];
21846 		D[2+ldd*3] = CC[2+bs*3];
21847 		}
21848 	else if(m1>=2)
21849 		{
21850 		D[0+ldd*0] = CC[0+bs*0];
21851 		D[1+ldd*0] = CC[1+bs*0];
21852 
21853 		if(n1==1)
21854 			return;
21855 
21856 		D[0+ldd*1] = CC[0+bs*1];
21857 		D[1+ldd*1] = CC[1+bs*1];
21858 
21859 		if(n1==2)
21860 			return;
21861 
21862 		D[0+ldd*2] = CC[0+bs*2];
21863 		D[1+ldd*2] = CC[1+bs*2];
21864 
21865 		if(n1==3)
21866 			return;
21867 
21868 		D[0+ldd*3] = CC[0+bs*3];
21869 		D[1+ldd*3] = CC[1+bs*3];
21870 		}
21871 	else //if(m1>=1)
21872 		{
21873 		D[0+ldd*0] = CC[0+bs*0];
21874 
21875 		if(n1==1)
21876 			return;
21877 
21878 		D[0+ldd*1] = CC[0+bs*1];
21879 
21880 		if(n1==2)
21881 			return;
21882 
21883 		D[0+ldd*2] = CC[0+bs*2];
21884 
21885 		if(n1==3)
21886 			return;
21887 
21888 		D[0+ldd*3] = CC[0+bs*3];
21889 		}
21890 
21891 	return;
21892 
21893 	}
21894 #endif
21895 
21896 
21897 
21898 
21899