1 /**************************************************************************************************
2 * *
3 * This file is part of BLASFEO. *
4 * *
5 * BLASFEO -- BLAS For Embedded Optimization. *
6 * Copyright (C) 2019 by Gianluca Frison. *
7 * Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
8 * All rights reserved. *
9 * *
10 * The 2-Clause BSD License *
11 * *
12 * Redistribution and use in source and binary forms, with or without *
13 * modification, are permitted provided that the following conditions are met: *
14 * *
15 * 1. Redistributions of source code must retain the above copyright notice, this *
16 * list of conditions and the following disclaimer. *
17 * 2. Redistributions in binary form must reproduce the above copyright notice, *
18 * this list of conditions and the following disclaimer in the documentation *
19 * and/or other materials provided with the distribution. *
20 * *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND *
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED *
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE *
24 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR *
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
26 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; *
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND *
28 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT *
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
31 * *
32 * Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de *
33 * *
34 **************************************************************************************************/
35
36
37
38 #include <math.h>
39
40 #include <blasfeo_common.h>
41 #include <blasfeo_d_kernel.h>
42
43
44
45 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA)
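// Compute one 4x4 block of D = beta*C + alpha * A * B^T ("nt": A not
// transposed, B transposed). A and B are read as 4-row panels (bs = 4);
// C and D are 4x4 blocks stored column-wise with leading dimension bs.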
void kernel_dgemm_nt_4x4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
47 {
48
49 const int bs = 4;
50
51 #if defined(TARGET_X86_AMD_BARCELONA)
52
53 kernel_dgemm_nt_4x2_lib4(kmax, alpha, A, B, beta, C, D);
54 kernel_dgemm_nt_4x2_lib4(kmax, alpha, A, B+2, beta, C+2*bs, D+2*bs);
55
56 return;
57
58 #endif
59
60 double
61 a_0, a_1, a_2, a_3,
62 b_0, b_1, b_2, b_3;
63
64 #if defined(TARGET_GENERIC)
65 double CC[16] = {0};
66 #else
67 ALIGNED( double CC[16], 64 ) = {0};
68 #endif
69
70 int k;
71
72 for(k=0; k<kmax-3; k+=4)
73 {
74
75 // k = 0
76
77 a_0 = A[0];
78 a_1 = A[1];
79 a_2 = A[2];
80 a_3 = A[3];
81
82 b_0 = B[0];
83 b_1 = B[1];
84 b_2 = B[2];
85 b_3 = B[3];
86
87 CC[0+bs*0] += a_0 * b_0;
88 CC[1+bs*0] += a_1 * b_0;
89 CC[2+bs*0] += a_2 * b_0;
90 CC[3+bs*0] += a_3 * b_0;
91
92 CC[0+bs*1] += a_0 * b_1;
93 CC[1+bs*1] += a_1 * b_1;
94 CC[2+bs*1] += a_2 * b_1;
95 CC[3+bs*1] += a_3 * b_1;
96
97 CC[0+bs*2] += a_0 * b_2;
98 CC[1+bs*2] += a_1 * b_2;
99 CC[2+bs*2] += a_2 * b_2;
100 CC[3+bs*2] += a_3 * b_2;
101
102 CC[0+bs*3] += a_0 * b_3;
103 CC[1+bs*3] += a_1 * b_3;
104 CC[2+bs*3] += a_2 * b_3;
105 CC[3+bs*3] += a_3 * b_3;
106
107
108 // k = 1
109
110 a_0 = A[4];
111 a_1 = A[5];
112 a_2 = A[6];
113 a_3 = A[7];
114
115 b_0 = B[4];
116 b_1 = B[5];
117 b_2 = B[6];
118 b_3 = B[7];
119
120 CC[0+bs*0] += a_0 * b_0;
121 CC[1+bs*0] += a_1 * b_0;
122 CC[2+bs*0] += a_2 * b_0;
123 CC[3+bs*0] += a_3 * b_0;
124
125 CC[0+bs*1] += a_0 * b_1;
126 CC[1+bs*1] += a_1 * b_1;
127 CC[2+bs*1] += a_2 * b_1;
128 CC[3+bs*1] += a_3 * b_1;
129
130 CC[0+bs*2] += a_0 * b_2;
131 CC[1+bs*2] += a_1 * b_2;
132 CC[2+bs*2] += a_2 * b_2;
133 CC[3+bs*2] += a_3 * b_2;
134
135 CC[0+bs*3] += a_0 * b_3;
136 CC[1+bs*3] += a_1 * b_3;
137 CC[2+bs*3] += a_2 * b_3;
138 CC[3+bs*3] += a_3 * b_3;
139
140
141 // k = 2
142
143 a_0 = A[8];
144 a_1 = A[9];
145 a_2 = A[10];
146 a_3 = A[11];
147
148 b_0 = B[8];
149 b_1 = B[9];
150 b_2 = B[10];
151 b_3 = B[11];
152
153 CC[0+bs*0] += a_0 * b_0;
154 CC[1+bs*0] += a_1 * b_0;
155 CC[2+bs*0] += a_2 * b_0;
156 CC[3+bs*0] += a_3 * b_0;
157
158 CC[0+bs*1] += a_0 * b_1;
159 CC[1+bs*1] += a_1 * b_1;
160 CC[2+bs*1] += a_2 * b_1;
161 CC[3+bs*1] += a_3 * b_1;
162
163 CC[0+bs*2] += a_0 * b_2;
164 CC[1+bs*2] += a_1 * b_2;
165 CC[2+bs*2] += a_2 * b_2;
166 CC[3+bs*2] += a_3 * b_2;
167
168 CC[0+bs*3] += a_0 * b_3;
169 CC[1+bs*3] += a_1 * b_3;
170 CC[2+bs*3] += a_2 * b_3;
171 CC[3+bs*3] += a_3 * b_3;
172
173
174 // k = 3
175
176 a_0 = A[12];
177 a_1 = A[13];
178 a_2 = A[14];
179 a_3 = A[15];
180
181 b_0 = B[12];
182 b_1 = B[13];
183 b_2 = B[14];
184 b_3 = B[15];
185
186 CC[0+bs*0] += a_0 * b_0;
187 CC[1+bs*0] += a_1 * b_0;
188 CC[2+bs*0] += a_2 * b_0;
189 CC[3+bs*0] += a_3 * b_0;
190
191 CC[0+bs*1] += a_0 * b_1;
192 CC[1+bs*1] += a_1 * b_1;
193 CC[2+bs*1] += a_2 * b_1;
194 CC[3+bs*1] += a_3 * b_1;
195
196 CC[0+bs*2] += a_0 * b_2;
197 CC[1+bs*2] += a_1 * b_2;
198 CC[2+bs*2] += a_2 * b_2;
199 CC[3+bs*2] += a_3 * b_2;
200
201 CC[0+bs*3] += a_0 * b_3;
202 CC[1+bs*3] += a_1 * b_3;
203 CC[2+bs*3] += a_2 * b_3;
204 CC[3+bs*3] += a_3 * b_3;
205
206 A += 16;
207 B += 16;
208
209 }
210
211 for(; k<kmax; k++)
212 {
213
214 // k = 0
215
216 a_0 = A[0];
217 a_1 = A[1];
218 a_2 = A[2];
219 a_3 = A[3];
220
221 b_0 = B[0];
222 b_1 = B[1];
223 b_2 = B[2];
224 b_3 = B[3];
225
226 CC[0+bs*0] += a_0 * b_0;
227 CC[1+bs*0] += a_1 * b_0;
228 CC[2+bs*0] += a_2 * b_0;
229 CC[3+bs*0] += a_3 * b_0;
230
231 CC[0+bs*1] += a_0 * b_1;
232 CC[1+bs*1] += a_1 * b_1;
233 CC[2+bs*1] += a_2 * b_1;
234 CC[3+bs*1] += a_3 * b_1;
235
236 CC[0+bs*2] += a_0 * b_2;
237 CC[1+bs*2] += a_1 * b_2;
238 CC[2+bs*2] += a_2 * b_2;
239 CC[3+bs*2] += a_3 * b_2;
240
241 CC[0+bs*3] += a_0 * b_3;
242 CC[1+bs*3] += a_1 * b_3;
243 CC[2+bs*3] += a_2 * b_3;
244 CC[3+bs*3] += a_3 * b_3;
245
246 A += 4;
247 B += 4;
248
249 }
250
251 D[0+bs*0] = beta[0]*C[0+bs*0] + alpha[0]*CC[0+bs*0];
252 D[1+bs*0] = beta[0]*C[1+bs*0] + alpha[0]*CC[1+bs*0];
253 D[2+bs*0] = beta[0]*C[2+bs*0] + alpha[0]*CC[2+bs*0];
254 D[3+bs*0] = beta[0]*C[3+bs*0] + alpha[0]*CC[3+bs*0];
255
256 D[0+bs*1] = beta[0]*C[0+bs*1] + alpha[0]*CC[0+bs*1];
257 D[1+bs*1] = beta[0]*C[1+bs*1] + alpha[0]*CC[1+bs*1];
258 D[2+bs*1] = beta[0]*C[2+bs*1] + alpha[0]*CC[2+bs*1];
259 D[3+bs*1] = beta[0]*C[3+bs*1] + alpha[0]*CC[3+bs*1];
260
261 D[0+bs*2] = beta[0]*C[0+bs*2] + alpha[0]*CC[0+bs*2];
262 D[1+bs*2] = beta[0]*C[1+bs*2] + alpha[0]*CC[1+bs*2];
263 D[2+bs*2] = beta[0]*C[2+bs*2] + alpha[0]*CC[2+bs*2];
264 D[3+bs*2] = beta[0]*C[3+bs*2] + alpha[0]*CC[3+bs*2];
265
266 D[0+bs*3] = beta[0]*C[0+bs*3] + alpha[0]*CC[0+bs*3];
267 D[1+bs*3] = beta[0]*C[1+bs*3] + alpha[0]*CC[1+bs*3];
268 D[2+bs*3] = beta[0]*C[2+bs*3] + alpha[0]*CC[2+bs*3];
269 D[3+bs*3] = beta[0]*C[3+bs*3] + alpha[0]*CC[3+bs*3];
270
271 return;
272
273 }
274 #endif
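/*
 * Usage sketch (illustrative only, not part of BLASFEO): how a caller could
 * tile a full D = alpha*A*B^T + beta*C product over the 4x4 kernel above.
 * It assumes m and n are multiples of 4 and that pA, pB, pC, pD point to
 * matrices packed in the 4-row panel-major (lib4) layout with panel strides
 * sda, sdb, sdc, sdd; the driver name and its arguments are hypothetical.
 *
 *	void dgemm_nt_blocked_sketch(int m, int n, int k, double alpha, double *pA, int sda, double *pB, int sdb, double beta, double *pC, int sdc, double *pD, int sdd)
 *		{
 *		int ii, jj;
 *		for(ii=0; ii<m; ii+=4)
 *			{
 *			for(jj=0; jj<n; jj+=4)
 *				{
 *				// each call writes one aligned 4x4 block of D
 *				kernel_dgemm_nt_4x4_lib4(k, &alpha, pA+ii*sda, pB+jj*sdb, &beta, pC+ii*sdc+jj*4, pD+ii*sdd+jj*4);
 *				}
 *			}
 *		}
 */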
275
276
277
278 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9)
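// Variable-size ("vs") variant: the full 4x4 product is accumulated into a
// scratch buffer by kernel_dgemm_nt_4x4_lib4, then only the top-left km x kn
// sub-block (1 <= km, kn <= 4) is stored to D.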
void kernel_dgemm_nt_4x4_vs_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn)
280 {
281
282 const int bs = 4;
283
284 #if defined(TARGET_GENERIC)
285 double CC[16] = {0};
286 #else
287 ALIGNED( double CC[16], 64 ) = {0};
288 #endif
289
290 kernel_dgemm_nt_4x4_lib4(kmax, alpha, A, B, beta, C, CC);
291
292 if(km>=4)
293 {
294 D[0+bs*0] = CC[0+bs*0];
295 D[1+bs*0] = CC[1+bs*0];
296 D[2+bs*0] = CC[2+bs*0];
297 D[3+bs*0] = CC[3+bs*0];
298
299 if(kn==1)
300 return;
301
302 D[0+bs*1] = CC[0+bs*1];
303 D[1+bs*1] = CC[1+bs*1];
304 D[2+bs*1] = CC[2+bs*1];
305 D[3+bs*1] = CC[3+bs*1];
306
307 if(kn==2)
308 return;
309
310 D[0+bs*2] = CC[0+bs*2];
311 D[1+bs*2] = CC[1+bs*2];
312 D[2+bs*2] = CC[2+bs*2];
313 D[3+bs*2] = CC[3+bs*2];
314
315 if(kn==3)
316 return;
317
318 D[0+bs*3] = CC[0+bs*3];
319 D[1+bs*3] = CC[1+bs*3];
320 D[2+bs*3] = CC[2+bs*3];
321 D[3+bs*3] = CC[3+bs*3];
322 }
323 else if(km>=3)
324 {
325 D[0+bs*0] = CC[0+bs*0];
326 D[1+bs*0] = CC[1+bs*0];
327 D[2+bs*0] = CC[2+bs*0];
328
329 if(kn==1)
330 return;
331
332 D[0+bs*1] = CC[0+bs*1];
333 D[1+bs*1] = CC[1+bs*1];
334 D[2+bs*1] = CC[2+bs*1];
335
336 if(kn==2)
337 return;
338
339 D[0+bs*2] = CC[0+bs*2];
340 D[1+bs*2] = CC[1+bs*2];
341 D[2+bs*2] = CC[2+bs*2];
342
343 if(kn==3)
344 return;
345
346 D[0+bs*3] = CC[0+bs*3];
347 D[1+bs*3] = CC[1+bs*3];
348 D[2+bs*3] = CC[2+bs*3];
349 }
350 else if(km>=2)
351 {
352 D[0+bs*0] = CC[0+bs*0];
353 D[1+bs*0] = CC[1+bs*0];
354
355 if(kn==1)
356 return;
357
358 D[0+bs*1] = CC[0+bs*1];
359 D[1+bs*1] = CC[1+bs*1];
360
361 if(kn==2)
362 return;
363
364 D[0+bs*2] = CC[0+bs*2];
365 D[1+bs*2] = CC[1+bs*2];
366
367 if(kn==3)
368 return;
369
370 D[0+bs*3] = CC[0+bs*3];
371 D[1+bs*3] = CC[1+bs*3];
372 }
373 else //if(km>=1)
374 {
375 D[0+bs*0] = CC[0+bs*0];
376
377 if(kn==1)
378 return;
379
380 D[0+bs*1] = CC[0+bs*1];
381
382 if(kn==2)
383 return;
384
385 D[0+bs*2] = CC[0+bs*2];
386
387 if(kn==3)
388 return;
389
390 D[0+bs*3] = CC[0+bs*3];
391 }
392
393 return;
394
395 }
396 #endif
397
398
399
400 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
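// Generalized ("gen") variant: C and D may start at any row offset within a
// 4-row panel (offsetC, offsetD), so accesses can span two consecutive panels
// (C0/C1, D0/D1 with panel strides sdc, sdd); rows outside [m0,m1) and
// columns outside [n0,n1) are masked out on store.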
void kernel_dgemm_nt_4x4_gen_lib4(int kmax, double *alpha, double *A, double *B, double *beta, int offsetC, double *C0, int sdc, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
402 {
403
404 const int bs = 4;
405
406 #if defined(TARGET_GENERIC)
407 double CC[16] = {0};
408 #else
409 ALIGNED( double CC[16], 64 ) = {0};
410 #endif
411
412 double
413 *C1, *D1;
414
415 if(offsetC==0)
416 {
417 CC[0+bs*0] = beta[0]*C0[0+bs*0];
418 CC[1+bs*0] = beta[0]*C0[1+bs*0];
419 CC[2+bs*0] = beta[0]*C0[2+bs*0];
420 CC[3+bs*0] = beta[0]*C0[3+bs*0];
421
422 CC[0+bs*1] = beta[0]*C0[0+bs*1];
423 CC[1+bs*1] = beta[0]*C0[1+bs*1];
424 CC[2+bs*1] = beta[0]*C0[2+bs*1];
425 CC[3+bs*1] = beta[0]*C0[3+bs*1];
426
427 CC[0+bs*2] = beta[0]*C0[0+bs*2];
428 CC[1+bs*2] = beta[0]*C0[1+bs*2];
429 CC[2+bs*2] = beta[0]*C0[2+bs*2];
430 CC[3+bs*2] = beta[0]*C0[3+bs*2];
431
432 CC[0+bs*3] = beta[0]*C0[0+bs*3];
433 CC[1+bs*3] = beta[0]*C0[1+bs*3];
434 CC[2+bs*3] = beta[0]*C0[2+bs*3];
435 CC[3+bs*3] = beta[0]*C0[3+bs*3];
436 }
437 else if(offsetC==1)
438 {
439 C1 = C0 + sdc*bs;
440
441 CC[0+bs*0] = beta[0]*C0[1+bs*0];
442 CC[1+bs*0] = beta[0]*C0[2+bs*0];
443 CC[2+bs*0] = beta[0]*C0[3+bs*0];
444 CC[3+bs*0] = beta[0]*C1[0+bs*0];
445
446 CC[0+bs*1] = beta[0]*C0[1+bs*1];
447 CC[1+bs*1] = beta[0]*C0[2+bs*1];
448 CC[2+bs*1] = beta[0]*C0[3+bs*1];
449 CC[3+bs*1] = beta[0]*C1[0+bs*1];
450
451 CC[0+bs*2] = beta[0]*C0[1+bs*2];
452 CC[1+bs*2] = beta[0]*C0[2+bs*2];
453 CC[2+bs*2] = beta[0]*C0[3+bs*2];
454 CC[3+bs*2] = beta[0]*C1[0+bs*2];
455
456 CC[0+bs*3] = beta[0]*C0[1+bs*3];
457 CC[1+bs*3] = beta[0]*C0[2+bs*3];
458 CC[2+bs*3] = beta[0]*C0[3+bs*3];
459 CC[3+bs*3] = beta[0]*C1[0+bs*3];
460 }
461 else if(offsetC==2)
462 {
463 C1 = C0 + sdc*bs;
464
465 CC[0+bs*0] = beta[0]*C0[2+bs*0];
466 CC[1+bs*0] = beta[0]*C0[3+bs*0];
467 CC[2+bs*0] = beta[0]*C1[0+bs*0];
468 CC[3+bs*0] = beta[0]*C1[1+bs*0];
469
470 CC[0+bs*1] = beta[0]*C0[2+bs*1];
471 CC[1+bs*1] = beta[0]*C0[3+bs*1];
472 CC[2+bs*1] = beta[0]*C1[0+bs*1];
473 CC[3+bs*1] = beta[0]*C1[1+bs*1];
474
475 CC[0+bs*2] = beta[0]*C0[2+bs*2];
476 CC[1+bs*2] = beta[0]*C0[3+bs*2];
477 CC[2+bs*2] = beta[0]*C1[0+bs*2];
478 CC[3+bs*2] = beta[0]*C1[1+bs*2];
479
480 CC[0+bs*3] = beta[0]*C0[2+bs*3];
481 CC[1+bs*3] = beta[0]*C0[3+bs*3];
482 CC[2+bs*3] = beta[0]*C1[0+bs*3];
483 CC[3+bs*3] = beta[0]*C1[1+bs*3];
484 }
485 else //if(offsetC==3)
486 {
487 C1 = C0 + sdc*bs;
488
489 CC[0+bs*0] = beta[0]*C0[3+bs*0];
490 CC[1+bs*0] = beta[0]*C1[0+bs*0];
491 CC[2+bs*0] = beta[0]*C1[1+bs*0];
492 CC[3+bs*0] = beta[0]*C1[2+bs*0];
493
494 CC[0+bs*1] = beta[0]*C0[3+bs*1];
495 CC[1+bs*1] = beta[0]*C1[0+bs*1];
496 CC[2+bs*1] = beta[0]*C1[1+bs*1];
497 CC[3+bs*1] = beta[0]*C1[2+bs*1];
498
499 CC[0+bs*2] = beta[0]*C0[3+bs*2];
500 CC[1+bs*2] = beta[0]*C1[0+bs*2];
501 CC[2+bs*2] = beta[0]*C1[1+bs*2];
502 CC[3+bs*2] = beta[0]*C1[2+bs*2];
503
504 CC[0+bs*3] = beta[0]*C0[3+bs*3];
505 CC[1+bs*3] = beta[0]*C1[0+bs*3];
506 CC[2+bs*3] = beta[0]*C1[1+bs*3];
507 CC[3+bs*3] = beta[0]*C1[2+bs*3];
508 }
509
510 double beta1 = 1.0;
511
512 kernel_dgemm_nt_4x4_lib4(kmax, alpha, A, B, &beta1, CC, CC);
513
	// shift solution left to skip the first n0 columns
515 if(n0>0)
516 {
517 if(n0==1)
518 {
519 CC[0+bs*0] = CC[0+bs*1];
520 CC[1+bs*0] = CC[1+bs*1];
521 CC[2+bs*0] = CC[2+bs*1];
522 CC[3+bs*0] = CC[3+bs*1];
523
524 CC[0+bs*1] = CC[0+bs*2];
525 CC[1+bs*1] = CC[1+bs*2];
526 CC[2+bs*1] = CC[2+bs*2];
527 CC[3+bs*1] = CC[3+bs*2];
528
529 CC[0+bs*2] = CC[0+bs*3];
530 CC[1+bs*2] = CC[1+bs*3];
531 CC[2+bs*2] = CC[2+bs*3];
532 CC[3+bs*2] = CC[3+bs*3];
533
534 D0 += 1*bs;
535 }
536 else if(n0==2)
537 {
538 CC[0+bs*0] = CC[0+bs*2];
539 CC[1+bs*0] = CC[1+bs*2];
540 CC[2+bs*0] = CC[2+bs*2];
541 CC[3+bs*0] = CC[3+bs*2];
542
543 CC[0+bs*1] = CC[0+bs*3];
544 CC[1+bs*1] = CC[1+bs*3];
545 CC[2+bs*1] = CC[2+bs*3];
546 CC[3+bs*1] = CC[3+bs*3];
547
548 D0 += 2*bs;
549 }
550 else //if(n0==3)
551 {
552 CC[0+bs*0] = CC[0+bs*3];
553 CC[1+bs*0] = CC[1+bs*3];
554 CC[2+bs*0] = CC[2+bs*3];
555 CC[3+bs*0] = CC[3+bs*3];
556
557 D0 += 3*bs;
558 }
559 }
560
561 n1 = 4<n1 ? 4 : n1;
562 int kn = n1 - n0;
563
564 if(offsetD==0)
565 {
566 if(kn<=0)
567 return;
568
569 if(m0<=0 & m1>0) D0[0+bs*0] = CC[0+bs*0];
570 if(m0<=1 & m1>1) D0[1+bs*0] = CC[1+bs*0];
571 if(m0<=2 & m1>2) D0[2+bs*0] = CC[2+bs*0];
572 if(m0<=3 & m1>3) D0[3+bs*0] = CC[3+bs*0];
573
574 if(kn<=1)
575 return;
576
577 if(m0<=0 & m1>0) D0[0+bs*1] = CC[0+bs*1];
578 if(m0<=1 & m1>1) D0[1+bs*1] = CC[1+bs*1];
579 if(m0<=2 & m1>2) D0[2+bs*1] = CC[2+bs*1];
580 if(m0<=3 & m1>3) D0[3+bs*1] = CC[3+bs*1];
581
582 if(kn<=2)
583 return;
584
585 if(m0<=0 & m1>0) D0[0+bs*2] = CC[0+bs*2];
586 if(m0<=1 & m1>1) D0[1+bs*2] = CC[1+bs*2];
587 if(m0<=2 & m1>2) D0[2+bs*2] = CC[2+bs*2];
588 if(m0<=3 & m1>3) D0[3+bs*2] = CC[3+bs*2];
589
590 if(kn<=3)
591 return;
592
593 if(m0<=0 & m1>0) D0[0+bs*3] = CC[0+bs*3];
594 if(m0<=1 & m1>1) D0[1+bs*3] = CC[1+bs*3];
595 if(m0<=2 & m1>2) D0[2+bs*3] = CC[2+bs*3];
596 if(m0<=3 & m1>3) D0[3+bs*3] = CC[3+bs*3];
597 }
598 else if(offsetD==1)
599 {
600 D1 = D0 + sdd*bs;
601
602 if(kn<=0)
603 return;
604
605 if(m0<=0 & m1>0) D0[1+bs*0] = CC[0+bs*0];
606 if(m0<=1 & m1>1) D0[2+bs*0] = CC[1+bs*0];
607 if(m0<=2 & m1>2) D0[3+bs*0] = CC[2+bs*0];
608 if(m0<=3 & m1>3) D1[0+bs*0] = CC[3+bs*0];
609
610 if(kn<=1)
611 return;
612
613 if(m0<=0 & m1>0) D0[1+bs*1] = CC[0+bs*1];
614 if(m0<=1 & m1>1) D0[2+bs*1] = CC[1+bs*1];
615 if(m0<=2 & m1>2) D0[3+bs*1] = CC[2+bs*1];
616 if(m0<=3 & m1>3) D1[0+bs*1] = CC[3+bs*1];
617
618 if(kn<=2)
619 return;
620
621 if(m0<=0 & m1>0) D0[1+bs*2] = CC[0+bs*2];
622 if(m0<=1 & m1>1) D0[2+bs*2] = CC[1+bs*2];
623 if(m0<=2 & m1>2) D0[3+bs*2] = CC[2+bs*2];
624 if(m0<=3 & m1>3) D1[0+bs*2] = CC[3+bs*2];
625
626 if(kn<=3)
627 return;
628
629 if(m0<=0 & m1>0) D0[1+bs*3] = CC[0+bs*3];
630 if(m0<=1 & m1>1) D0[2+bs*3] = CC[1+bs*3];
631 if(m0<=2 & m1>2) D0[3+bs*3] = CC[2+bs*3];
632 if(m0<=3 & m1>3) D1[0+bs*3] = CC[3+bs*3];
633 }
634 else if(offsetD==2)
635 {
636 D1 = D0 + sdd*bs;
637
638 if(kn<=0)
639 return;
640
641 if(m0<=0 & m1>0) D0[2+bs*0] = CC[0+bs*0];
642 if(m0<=1 & m1>1) D0[3+bs*0] = CC[1+bs*0];
643 if(m0<=2 & m1>2) D1[0+bs*0] = CC[2+bs*0];
644 if(m0<=3 & m1>3) D1[1+bs*0] = CC[3+bs*0];
645
646 if(kn<=1)
647 return;
648
649 if(m0<=0 & m1>0) D0[2+bs*1] = CC[0+bs*1];
650 if(m0<=1 & m1>1) D0[3+bs*1] = CC[1+bs*1];
651 if(m0<=2 & m1>2) D1[0+bs*1] = CC[2+bs*1];
652 if(m0<=3 & m1>3) D1[1+bs*1] = CC[3+bs*1];
653
654 if(kn<=2)
655 return;
656
657 if(m0<=0 & m1>0) D0[2+bs*2] = CC[0+bs*2];
658 if(m0<=1 & m1>1) D0[3+bs*2] = CC[1+bs*2];
659 if(m0<=2 & m1>2) D1[0+bs*2] = CC[2+bs*2];
660 if(m0<=3 & m1>3) D1[1+bs*2] = CC[3+bs*2];
661
662 if(kn<=3)
663 return;
664
665 if(m0<=0 & m1>0) D0[2+bs*3] = CC[0+bs*3];
666 if(m0<=1 & m1>1) D0[3+bs*3] = CC[1+bs*3];
667 if(m0<=2 & m1>2) D1[0+bs*3] = CC[2+bs*3];
668 if(m0<=3 & m1>3) D1[1+bs*3] = CC[3+bs*3];
669 }
670 else //if(offsetD==3)
671 {
672 D1 = D0 + sdd*bs;
673
674 if(kn<=0)
675 return;
676
677 if(m0<=0 & m1>0) D0[3+bs*0] = CC[0+bs*0];
678 if(m0<=1 & m1>1) D1[0+bs*0] = CC[1+bs*0];
679 if(m0<=2 & m1>2) D1[1+bs*0] = CC[2+bs*0];
680 if(m0<=3 & m1>3) D1[2+bs*0] = CC[3+bs*0];
681
682 if(kn<=1)
683 return;
684
685 if(m0<=0 & m1>0) D0[3+bs*1] = CC[0+bs*1];
686 if(m0<=1 & m1>1) D1[0+bs*1] = CC[1+bs*1];
687 if(m0<=2 & m1>2) D1[1+bs*1] = CC[2+bs*1];
688 if(m0<=3 & m1>3) D1[2+bs*1] = CC[3+bs*1];
689
690 if(kn<=2)
691 return;
692
693 if(m0<=0 & m1>0) D0[3+bs*2] = CC[0+bs*2];
694 if(m0<=1 & m1>1) D1[0+bs*2] = CC[1+bs*2];
695 if(m0<=2 & m1>2) D1[1+bs*2] = CC[2+bs*2];
696 if(m0<=3 & m1>3) D1[2+bs*2] = CC[3+bs*2];
697
698 if(kn<=3)
699 return;
700
701 if(m0<=0 & m1>0) D0[3+bs*3] = CC[0+bs*3];
702 if(m0<=1 & m1>1) D1[0+bs*3] = CC[1+bs*3];
703 if(m0<=2 & m1>2) D1[1+bs*3] = CC[2+bs*3];
704 if(m0<=3 & m1>3) D1[2+bs*3] = CC[3+bs*3];
705 }
706
707 return;
708
709 }
710 #endif
711
712
713
714 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) //|| defined(TARGET_X64_AMD_BULLDOZER)
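// Compute one 4x4 block of D = beta*C + alpha * A * B ("nn": no transposition).
// A is a 4-row panel; B is read across 4-row panels of a panel-major matrix,
// starting offsetB rows into its first panel and advancing by bs*sdb every
// four k-iterations.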
void kernel_dgemm_nn_4x4_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D)
716 {
717
718 const int bs = 4;
719
720 #if defined(TARGET_X86_AMD_BARCELONA)
721
722 kernel_dgemm_nn_4x2_lib4(kmax, alpha, A, offsetB, B, sdb, beta, C, D);
723 kernel_dgemm_nn_4x2_lib4(kmax, alpha, A, offsetB, B+2*bs, sdb, beta, C+2*bs, D+2*bs);
724
725 return;
726
727 #endif
728
729 double
730 a_0, a_1, a_2, a_3,
731 b_0, b_1, b_2, b_3;
732
733 #if defined(TARGET_GENERIC)
734 double CC[16] = {0};
735 #else
736 ALIGNED( double CC[16], 64 ) = {0};
737 #endif
738
739 double
740 *C1, *D1;
741
742 int k;
743
744 k = 0;
745 if(offsetB!=0)
746 {
747 if(offsetB==1)
748 {
749
750 B += 1;
751
752 a_0 = A[0];
753 a_1 = A[1];
754 a_2 = A[2];
755 a_3 = A[3];
756
757 b_0 = B[0];
758 b_1 = B[4];
759 b_2 = B[8];
760 b_3 = B[12];
761
762 CC[0+bs*0] += a_0 * b_0;
763 CC[1+bs*0] += a_1 * b_0;
764 CC[2+bs*0] += a_2 * b_0;
765 CC[3+bs*0] += a_3 * b_0;
766
767 CC[0+bs*1] += a_0 * b_1;
768 CC[1+bs*1] += a_1 * b_1;
769 CC[2+bs*1] += a_2 * b_1;
770 CC[3+bs*1] += a_3 * b_1;
771
772 CC[0+bs*2] += a_0 * b_2;
773 CC[1+bs*2] += a_1 * b_2;
774 CC[2+bs*2] += a_2 * b_2;
775 CC[3+bs*2] += a_3 * b_2;
776
777 CC[0+bs*3] += a_0 * b_3;
778 CC[1+bs*3] += a_1 * b_3;
779 CC[2+bs*3] += a_2 * b_3;
780 CC[3+bs*3] += a_3 * b_3;
781
782 A += 4;
783 B += 1;
784 k += 1;
785
786 if(k>=kmax)
787 goto scale;
788
789 a_0 = A[0];
790 a_1 = A[1];
791 a_2 = A[2];
792 a_3 = A[3];
793
794 b_0 = B[0];
795 b_1 = B[4];
796 b_2 = B[8];
797 b_3 = B[12];
798
799 CC[0+bs*0] += a_0 * b_0;
800 CC[1+bs*0] += a_1 * b_0;
801 CC[2+bs*0] += a_2 * b_0;
802 CC[3+bs*0] += a_3 * b_0;
803
804 CC[0+bs*1] += a_0 * b_1;
805 CC[1+bs*1] += a_1 * b_1;
806 CC[2+bs*1] += a_2 * b_1;
807 CC[3+bs*1] += a_3 * b_1;
808
809 CC[0+bs*2] += a_0 * b_2;
810 CC[1+bs*2] += a_1 * b_2;
811 CC[2+bs*2] += a_2 * b_2;
812 CC[3+bs*2] += a_3 * b_2;
813
814 CC[0+bs*3] += a_0 * b_3;
815 CC[1+bs*3] += a_1 * b_3;
816 CC[2+bs*3] += a_2 * b_3;
817 CC[3+bs*3] += a_3 * b_3;
818
819 A += 4;
820 B += 1;
821 k += 1;
822
823 if(k>=kmax)
824 goto scale;
825
826 a_0 = A[0];
827 a_1 = A[1];
828 a_2 = A[2];
829 a_3 = A[3];
830
831 b_0 = B[0];
832 b_1 = B[4];
833 b_2 = B[8];
834 b_3 = B[12];
835
836 CC[0+bs*0] += a_0 * b_0;
837 CC[1+bs*0] += a_1 * b_0;
838 CC[2+bs*0] += a_2 * b_0;
839 CC[3+bs*0] += a_3 * b_0;
840
841 CC[0+bs*1] += a_0 * b_1;
842 CC[1+bs*1] += a_1 * b_1;
843 CC[2+bs*1] += a_2 * b_1;
844 CC[3+bs*1] += a_3 * b_1;
845
846 CC[0+bs*2] += a_0 * b_2;
847 CC[1+bs*2] += a_1 * b_2;
848 CC[2+bs*2] += a_2 * b_2;
849 CC[3+bs*2] += a_3 * b_2;
850
851 CC[0+bs*3] += a_0 * b_3;
852 CC[1+bs*3] += a_1 * b_3;
853 CC[2+bs*3] += a_2 * b_3;
854 CC[3+bs*3] += a_3 * b_3;
855
856 A += 4;
857 B += 1;
858 B += bs*(sdb-1);
859 k += 1;
860
861 }
862 else if(offsetB==2)
863 {
864
865 B += 2;
866
867 a_0 = A[0];
868 a_1 = A[1];
869 a_2 = A[2];
870 a_3 = A[3];
871
872 b_0 = B[0];
873 b_1 = B[4];
874 b_2 = B[8];
875 b_3 = B[12];
876
877 CC[0+bs*0] += a_0 * b_0;
878 CC[1+bs*0] += a_1 * b_0;
879 CC[2+bs*0] += a_2 * b_0;
880 CC[3+bs*0] += a_3 * b_0;
881
882 CC[0+bs*1] += a_0 * b_1;
883 CC[1+bs*1] += a_1 * b_1;
884 CC[2+bs*1] += a_2 * b_1;
885 CC[3+bs*1] += a_3 * b_1;
886
887 CC[0+bs*2] += a_0 * b_2;
888 CC[1+bs*2] += a_1 * b_2;
889 CC[2+bs*2] += a_2 * b_2;
890 CC[3+bs*2] += a_3 * b_2;
891
892 CC[0+bs*3] += a_0 * b_3;
893 CC[1+bs*3] += a_1 * b_3;
894 CC[2+bs*3] += a_2 * b_3;
895 CC[3+bs*3] += a_3 * b_3;
896
897 A += 4;
898 B += 1;
899 k += 1;
900
901 if(k>=kmax)
902 goto scale;
903
904 a_0 = A[0];
905 a_1 = A[1];
906 a_2 = A[2];
907 a_3 = A[3];
908
909 b_0 = B[0];
910 b_1 = B[4];
911 b_2 = B[8];
912 b_3 = B[12];
913
914 CC[0+bs*0] += a_0 * b_0;
915 CC[1+bs*0] += a_1 * b_0;
916 CC[2+bs*0] += a_2 * b_0;
917 CC[3+bs*0] += a_3 * b_0;
918
919 CC[0+bs*1] += a_0 * b_1;
920 CC[1+bs*1] += a_1 * b_1;
921 CC[2+bs*1] += a_2 * b_1;
922 CC[3+bs*1] += a_3 * b_1;
923
924 CC[0+bs*2] += a_0 * b_2;
925 CC[1+bs*2] += a_1 * b_2;
926 CC[2+bs*2] += a_2 * b_2;
927 CC[3+bs*2] += a_3 * b_2;
928
929 CC[0+bs*3] += a_0 * b_3;
930 CC[1+bs*3] += a_1 * b_3;
931 CC[2+bs*3] += a_2 * b_3;
932 CC[3+bs*3] += a_3 * b_3;
933
934 A += 4;
935 B += 1;
936 B += bs*(sdb-1);
937 k += 1;
938
939 }
940 else // if(offsetB==3)
941 {
942
943 B += 3;
944
945 a_0 = A[0];
946 a_1 = A[1];
947 a_2 = A[2];
948 a_3 = A[3];
949
950 b_0 = B[0];
951 b_1 = B[4];
952 b_2 = B[8];
953 b_3 = B[12];
954
955 CC[0+bs*0] += a_0 * b_0;
956 CC[1+bs*0] += a_1 * b_0;
957 CC[2+bs*0] += a_2 * b_0;
958 CC[3+bs*0] += a_3 * b_0;
959
960 CC[0+bs*1] += a_0 * b_1;
961 CC[1+bs*1] += a_1 * b_1;
962 CC[2+bs*1] += a_2 * b_1;
963 CC[3+bs*1] += a_3 * b_1;
964
965 CC[0+bs*2] += a_0 * b_2;
966 CC[1+bs*2] += a_1 * b_2;
967 CC[2+bs*2] += a_2 * b_2;
968 CC[3+bs*2] += a_3 * b_2;
969
970 CC[0+bs*3] += a_0 * b_3;
971 CC[1+bs*3] += a_1 * b_3;
972 CC[2+bs*3] += a_2 * b_3;
973 CC[3+bs*3] += a_3 * b_3;
974
975 A += 4;
976 B += 1;
977 B += bs*(sdb-1);
978 k += 1;
979
980 }
981 }
982 for(; k<kmax-3; k+=4)
983 {
984
985 // k = 0
986
987 a_0 = A[0];
988 a_1 = A[1];
989 a_2 = A[2];
990 a_3 = A[3];
991
992 b_0 = B[0];
993 b_1 = B[4];
994 b_2 = B[8];
995 b_3 = B[12];
996
997 CC[0+bs*0] += a_0 * b_0;
998 CC[1+bs*0] += a_1 * b_0;
999 CC[2+bs*0] += a_2 * b_0;
1000 CC[3+bs*0] += a_3 * b_0;
1001
1002 CC[0+bs*1] += a_0 * b_1;
1003 CC[1+bs*1] += a_1 * b_1;
1004 CC[2+bs*1] += a_2 * b_1;
1005 CC[3+bs*1] += a_3 * b_1;
1006
1007 CC[0+bs*2] += a_0 * b_2;
1008 CC[1+bs*2] += a_1 * b_2;
1009 CC[2+bs*2] += a_2 * b_2;
1010 CC[3+bs*2] += a_3 * b_2;
1011
1012 CC[0+bs*3] += a_0 * b_3;
1013 CC[1+bs*3] += a_1 * b_3;
1014 CC[2+bs*3] += a_2 * b_3;
1015 CC[3+bs*3] += a_3 * b_3;
1016
1017
1018 // k = 1
1019
1020 a_0 = A[4];
1021 a_1 = A[5];
1022 a_2 = A[6];
1023 a_3 = A[7];
1024
1025 b_0 = B[1];
1026 b_1 = B[5];
1027 b_2 = B[9];
1028 b_3 = B[13];
1029
1030 CC[0+bs*0] += a_0 * b_0;
1031 CC[1+bs*0] += a_1 * b_0;
1032 CC[2+bs*0] += a_2 * b_0;
1033 CC[3+bs*0] += a_3 * b_0;
1034
1035 CC[0+bs*1] += a_0 * b_1;
1036 CC[1+bs*1] += a_1 * b_1;
1037 CC[2+bs*1] += a_2 * b_1;
1038 CC[3+bs*1] += a_3 * b_1;
1039
1040 CC[0+bs*2] += a_0 * b_2;
1041 CC[1+bs*2] += a_1 * b_2;
1042 CC[2+bs*2] += a_2 * b_2;
1043 CC[3+bs*2] += a_3 * b_2;
1044
1045 CC[0+bs*3] += a_0 * b_3;
1046 CC[1+bs*3] += a_1 * b_3;
1047 CC[2+bs*3] += a_2 * b_3;
1048 CC[3+bs*3] += a_3 * b_3;
1049
1050
1051 // k = 2
1052
1053 a_0 = A[8];
1054 a_1 = A[9];
1055 a_2 = A[10];
1056 a_3 = A[11];
1057
1058 b_0 = B[2];
1059 b_1 = B[6];
1060 b_2 = B[10];
1061 b_3 = B[14];
1062
1063 CC[0+bs*0] += a_0 * b_0;
1064 CC[1+bs*0] += a_1 * b_0;
1065 CC[2+bs*0] += a_2 * b_0;
1066 CC[3+bs*0] += a_3 * b_0;
1067
1068 CC[0+bs*1] += a_0 * b_1;
1069 CC[1+bs*1] += a_1 * b_1;
1070 CC[2+bs*1] += a_2 * b_1;
1071 CC[3+bs*1] += a_3 * b_1;
1072
1073 CC[0+bs*2] += a_0 * b_2;
1074 CC[1+bs*2] += a_1 * b_2;
1075 CC[2+bs*2] += a_2 * b_2;
1076 CC[3+bs*2] += a_3 * b_2;
1077
1078 CC[0+bs*3] += a_0 * b_3;
1079 CC[1+bs*3] += a_1 * b_3;
1080 CC[2+bs*3] += a_2 * b_3;
1081 CC[3+bs*3] += a_3 * b_3;
1082
1083
1084 // k = 3
1085
1086 a_0 = A[12];
1087 a_1 = A[13];
1088 a_2 = A[14];
1089 a_3 = A[15];
1090
1091 b_0 = B[3];
1092 b_1 = B[7];
1093 b_2 = B[11];
1094 b_3 = B[15];
1095
1096 CC[0+bs*0] += a_0 * b_0;
1097 CC[1+bs*0] += a_1 * b_0;
1098 CC[2+bs*0] += a_2 * b_0;
1099 CC[3+bs*0] += a_3 * b_0;
1100
1101 CC[0+bs*1] += a_0 * b_1;
1102 CC[1+bs*1] += a_1 * b_1;
1103 CC[2+bs*1] += a_2 * b_1;
1104 CC[3+bs*1] += a_3 * b_1;
1105
1106 CC[0+bs*2] += a_0 * b_2;
1107 CC[1+bs*2] += a_1 * b_2;
1108 CC[2+bs*2] += a_2 * b_2;
1109 CC[3+bs*2] += a_3 * b_2;
1110
1111 CC[0+bs*3] += a_0 * b_3;
1112 CC[1+bs*3] += a_1 * b_3;
1113 CC[2+bs*3] += a_2 * b_3;
1114 CC[3+bs*3] += a_3 * b_3;
1115
1116 A += 16;
1117 B += 4*sdb;
1118
1119 }
1120 for(; k<kmax; k++)
1121 {
1122
1123 // k = 0
1124
1125 a_0 = A[0];
1126 a_1 = A[1];
1127 a_2 = A[2];
1128 a_3 = A[3];
1129
1130 b_0 = B[0];
1131 b_1 = B[4];
1132 b_2 = B[8];
1133 b_3 = B[12];
1134
1135 CC[0+bs*0] += a_0 * b_0;
1136 CC[1+bs*0] += a_1 * b_0;
1137 CC[2+bs*0] += a_2 * b_0;
1138 CC[3+bs*0] += a_3 * b_0;
1139
1140 CC[0+bs*1] += a_0 * b_1;
1141 CC[1+bs*1] += a_1 * b_1;
1142 CC[2+bs*1] += a_2 * b_1;
1143 CC[3+bs*1] += a_3 * b_1;
1144
1145 CC[0+bs*2] += a_0 * b_2;
1146 CC[1+bs*2] += a_1 * b_2;
1147 CC[2+bs*2] += a_2 * b_2;
1148 CC[3+bs*2] += a_3 * b_2;
1149
1150 CC[0+bs*3] += a_0 * b_3;
1151 CC[1+bs*3] += a_1 * b_3;
1152 CC[2+bs*3] += a_2 * b_3;
1153 CC[3+bs*3] += a_3 * b_3;
1154
1155 A += 4;
1156 B += 1;
1157
1158 }
1159
1160 scale:
1161
1162 D[0+bs*0] = beta[0]*C[0+bs*0] + alpha[0]*CC[0+bs*0];
1163 D[1+bs*0] = beta[0]*C[1+bs*0] + alpha[0]*CC[1+bs*0];
1164 D[2+bs*0] = beta[0]*C[2+bs*0] + alpha[0]*CC[2+bs*0];
1165 D[3+bs*0] = beta[0]*C[3+bs*0] + alpha[0]*CC[3+bs*0];
1166
1167 D[0+bs*1] = beta[0]*C[0+bs*1] + alpha[0]*CC[0+bs*1];
1168 D[1+bs*1] = beta[0]*C[1+bs*1] + alpha[0]*CC[1+bs*1];
1169 D[2+bs*1] = beta[0]*C[2+bs*1] + alpha[0]*CC[2+bs*1];
1170 D[3+bs*1] = beta[0]*C[3+bs*1] + alpha[0]*CC[3+bs*1];
1171
1172 D[0+bs*2] = beta[0]*C[0+bs*2] + alpha[0]*CC[0+bs*2];
1173 D[1+bs*2] = beta[0]*C[1+bs*2] + alpha[0]*CC[1+bs*2];
1174 D[2+bs*2] = beta[0]*C[2+bs*2] + alpha[0]*CC[2+bs*2];
1175 D[3+bs*2] = beta[0]*C[3+bs*2] + alpha[0]*CC[3+bs*2];
1176
1177 D[0+bs*3] = beta[0]*C[0+bs*3] + alpha[0]*CC[0+bs*3];
1178 D[1+bs*3] = beta[0]*C[1+bs*3] + alpha[0]*CC[1+bs*3];
1179 D[2+bs*3] = beta[0]*C[2+bs*3] + alpha[0]*CC[2+bs*3];
1180 D[3+bs*3] = beta[0]*C[3+bs*3] + alpha[0]*CC[3+bs*3];
1181
1182 return;
1183
1184 }
1185 #endif
1186
1187
1188
1189 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9)
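// Variable-size variant of the nn kernel: the full 4x4 result goes to scratch
// and only the top-left km x kn sub-block is stored to D.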
void kernel_dgemm_nn_4x4_vs_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D, int km, int kn)
1191 {
1192
1193 const int bs = 4;
1194
1195 #if defined(TARGET_GENERIC)
1196 double CC[16] = {0};
1197 #else
1198 ALIGNED( double CC[16], 64 ) = {0};
1199 #endif
1200
1201 kernel_dgemm_nn_4x4_lib4(kmax, alpha, A, offsetB, B, sdb, beta, C, CC);
1202
1203 if(km>=4)
1204 {
1205 D[0+bs*0] = CC[0+bs*0];
1206 D[1+bs*0] = CC[1+bs*0];
1207 D[2+bs*0] = CC[2+bs*0];
1208 D[3+bs*0] = CC[3+bs*0];
1209
1210 if(kn==1)
1211 return;
1212
1213 D[0+bs*1] = CC[0+bs*1];
1214 D[1+bs*1] = CC[1+bs*1];
1215 D[2+bs*1] = CC[2+bs*1];
1216 D[3+bs*1] = CC[3+bs*1];
1217
1218 if(kn==2)
1219 return;
1220
1221 D[0+bs*2] = CC[0+bs*2];
1222 D[1+bs*2] = CC[1+bs*2];
1223 D[2+bs*2] = CC[2+bs*2];
1224 D[3+bs*2] = CC[3+bs*2];
1225
1226 if(kn==3)
1227 return;
1228
1229 D[0+bs*3] = CC[0+bs*3];
1230 D[1+bs*3] = CC[1+bs*3];
1231 D[2+bs*3] = CC[2+bs*3];
1232 D[3+bs*3] = CC[3+bs*3];
1233 }
1234 else if(km>=3)
1235 {
1236 D[0+bs*0] = CC[0+bs*0];
1237 D[1+bs*0] = CC[1+bs*0];
1238 D[2+bs*0] = CC[2+bs*0];
1239
1240 if(kn==1)
1241 return;
1242
1243 D[0+bs*1] = CC[0+bs*1];
1244 D[1+bs*1] = CC[1+bs*1];
1245 D[2+bs*1] = CC[2+bs*1];
1246
1247 if(kn==2)
1248 return;
1249
1250 D[0+bs*2] = CC[0+bs*2];
1251 D[1+bs*2] = CC[1+bs*2];
1252 D[2+bs*2] = CC[2+bs*2];
1253
1254 if(kn==3)
1255 return;
1256
1257 D[0+bs*3] = CC[0+bs*3];
1258 D[1+bs*3] = CC[1+bs*3];
1259 D[2+bs*3] = CC[2+bs*3];
1260 }
1261 else if(km>=2)
1262 {
1263 D[0+bs*0] = CC[0+bs*0];
1264 D[1+bs*0] = CC[1+bs*0];
1265
1266 if(kn==1)
1267 return;
1268
1269 D[0+bs*1] = CC[0+bs*1];
1270 D[1+bs*1] = CC[1+bs*1];
1271
1272 if(kn==2)
1273 return;
1274
1275 D[0+bs*2] = CC[0+bs*2];
1276 D[1+bs*2] = CC[1+bs*2];
1277
1278 if(kn==3)
1279 return;
1280
1281 D[0+bs*3] = CC[0+bs*3];
1282 D[1+bs*3] = CC[1+bs*3];
1283 }
1284 else //if(km>=1)
1285 {
1286 D[0+bs*0] = CC[0+bs*0];
1287
1288 if(kn==1)
1289 return;
1290
1291 D[0+bs*1] = CC[0+bs*1];
1292
1293 if(kn==2)
1294 return;
1295
1296 D[0+bs*2] = CC[0+bs*2];
1297
1298 if(kn==3)
1299 return;
1300
1301 D[0+bs*3] = CC[0+bs*3];
1302 }
1303
1304 return;
1305
1306 }
1307 #endif
1308
1309
1310
1311 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
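// Generalized nn variant: same offsetC/offsetD panel-crossing addressing and
// [m0,m1) x [n0,n1) masking as kernel_dgemm_nt_4x4_gen_lib4, applied to the
// nn product.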
void kernel_dgemm_nn_4x4_gen_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, int offsetC, double *C0, int sdc, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
1313 {
1314
1315 const int bs = 4;
1316
1317 #if defined(TARGET_GENERIC)
1318 double CC[16] = {0};
1319 #else
1320 ALIGNED( double CC[16], 64 ) = {0};
1321 #endif
1322
1323 double
1324 *C1, *D1;
1325
1326 if(offsetC==0)
1327 {
1328 CC[0+bs*0] = beta[0]*C0[0+bs*0];
1329 CC[1+bs*0] = beta[0]*C0[1+bs*0];
1330 CC[2+bs*0] = beta[0]*C0[2+bs*0];
1331 CC[3+bs*0] = beta[0]*C0[3+bs*0];
1332
1333 CC[0+bs*1] = beta[0]*C0[0+bs*1];
1334 CC[1+bs*1] = beta[0]*C0[1+bs*1];
1335 CC[2+bs*1] = beta[0]*C0[2+bs*1];
1336 CC[3+bs*1] = beta[0]*C0[3+bs*1];
1337
1338 CC[0+bs*2] = beta[0]*C0[0+bs*2];
1339 CC[1+bs*2] = beta[0]*C0[1+bs*2];
1340 CC[2+bs*2] = beta[0]*C0[2+bs*2];
1341 CC[3+bs*2] = beta[0]*C0[3+bs*2];
1342
1343 CC[0+bs*3] = beta[0]*C0[0+bs*3];
1344 CC[1+bs*3] = beta[0]*C0[1+bs*3];
1345 CC[2+bs*3] = beta[0]*C0[2+bs*3];
1346 CC[3+bs*3] = beta[0]*C0[3+bs*3];
1347 }
1348 else if(offsetC==1)
1349 {
1350 C1 = C0 + sdc*bs;
1351
1352 CC[0+bs*0] = beta[0]*C0[1+bs*0];
1353 CC[1+bs*0] = beta[0]*C0[2+bs*0];
1354 CC[2+bs*0] = beta[0]*C0[3+bs*0];
1355 CC[3+bs*0] = beta[0]*C1[0+bs*0];
1356
1357 CC[0+bs*1] = beta[0]*C0[1+bs*1];
1358 CC[1+bs*1] = beta[0]*C0[2+bs*1];
1359 CC[2+bs*1] = beta[0]*C0[3+bs*1];
1360 CC[3+bs*1] = beta[0]*C1[0+bs*1];
1361
1362 CC[0+bs*2] = beta[0]*C0[1+bs*2];
1363 CC[1+bs*2] = beta[0]*C0[2+bs*2];
1364 CC[2+bs*2] = beta[0]*C0[3+bs*2];
1365 CC[3+bs*2] = beta[0]*C1[0+bs*2];
1366
1367 CC[0+bs*3] = beta[0]*C0[1+bs*3];
1368 CC[1+bs*3] = beta[0]*C0[2+bs*3];
1369 CC[2+bs*3] = beta[0]*C0[3+bs*3];
1370 CC[3+bs*3] = beta[0]*C1[0+bs*3];
1371 }
1372 else if(offsetC==2)
1373 {
1374 C1 = C0 + sdc*bs;
1375
1376 CC[0+bs*0] = beta[0]*C0[2+bs*0];
1377 CC[1+bs*0] = beta[0]*C0[3+bs*0];
1378 CC[2+bs*0] = beta[0]*C1[0+bs*0];
1379 CC[3+bs*0] = beta[0]*C1[1+bs*0];
1380
1381 CC[0+bs*1] = beta[0]*C0[2+bs*1];
1382 CC[1+bs*1] = beta[0]*C0[3+bs*1];
1383 CC[2+bs*1] = beta[0]*C1[0+bs*1];
1384 CC[3+bs*1] = beta[0]*C1[1+bs*1];
1385
1386 CC[0+bs*2] = beta[0]*C0[2+bs*2];
1387 CC[1+bs*2] = beta[0]*C0[3+bs*2];
1388 CC[2+bs*2] = beta[0]*C1[0+bs*2];
1389 CC[3+bs*2] = beta[0]*C1[1+bs*2];
1390
1391 CC[0+bs*3] = beta[0]*C0[2+bs*3];
1392 CC[1+bs*3] = beta[0]*C0[3+bs*3];
1393 CC[2+bs*3] = beta[0]*C1[0+bs*3];
1394 CC[3+bs*3] = beta[0]*C1[1+bs*3];
1395 }
1396 else //if(offsetC==3)
1397 {
1398 C1 = C0 + sdc*bs;
1399
1400 CC[0+bs*0] = beta[0]*C0[3+bs*0];
1401 CC[1+bs*0] = beta[0]*C1[0+bs*0];
1402 CC[2+bs*0] = beta[0]*C1[1+bs*0];
1403 CC[3+bs*0] = beta[0]*C1[2+bs*0];
1404
1405 CC[0+bs*1] = beta[0]*C0[3+bs*1];
1406 CC[1+bs*1] = beta[0]*C1[0+bs*1];
1407 CC[2+bs*1] = beta[0]*C1[1+bs*1];
1408 CC[3+bs*1] = beta[0]*C1[2+bs*1];
1409
1410 CC[0+bs*2] = beta[0]*C0[3+bs*2];
1411 CC[1+bs*2] = beta[0]*C1[0+bs*2];
1412 CC[2+bs*2] = beta[0]*C1[1+bs*2];
1413 CC[3+bs*2] = beta[0]*C1[2+bs*2];
1414
1415 CC[0+bs*3] = beta[0]*C0[3+bs*3];
1416 CC[1+bs*3] = beta[0]*C1[0+bs*3];
1417 CC[2+bs*3] = beta[0]*C1[1+bs*3];
1418 CC[3+bs*3] = beta[0]*C1[2+bs*3];
1419 }
1420
1421 double beta1 = 1.0;
1422
1423 kernel_dgemm_nn_4x4_lib4(kmax, alpha, A, offsetB, B, sdb, &beta1, CC, CC);
1424
	// shift solution left to skip the first n0 columns
1426 if(n0>0)
1427 {
1428 if(n0==1)
1429 {
1430 CC[0+bs*0] = CC[0+bs*1];
1431 CC[1+bs*0] = CC[1+bs*1];
1432 CC[2+bs*0] = CC[2+bs*1];
1433 CC[3+bs*0] = CC[3+bs*1];
1434
1435 CC[0+bs*1] = CC[0+bs*2];
1436 CC[1+bs*1] = CC[1+bs*2];
1437 CC[2+bs*1] = CC[2+bs*2];
1438 CC[3+bs*1] = CC[3+bs*2];
1439
1440 CC[0+bs*2] = CC[0+bs*3];
1441 CC[1+bs*2] = CC[1+bs*3];
1442 CC[2+bs*2] = CC[2+bs*3];
1443 CC[3+bs*2] = CC[3+bs*3];
1444
1445 D0 += 1*bs;
1446 }
1447 else if(n0==2)
1448 {
1449 CC[0+bs*0] = CC[0+bs*2];
1450 CC[1+bs*0] = CC[1+bs*2];
1451 CC[2+bs*0] = CC[2+bs*2];
1452 CC[3+bs*0] = CC[3+bs*2];
1453
1454 CC[0+bs*1] = CC[0+bs*3];
1455 CC[1+bs*1] = CC[1+bs*3];
1456 CC[2+bs*1] = CC[2+bs*3];
1457 CC[3+bs*1] = CC[3+bs*3];
1458
1459 D0 += 2*bs;
1460 }
1461 else //if(n0==3)
1462 {
1463 CC[0+bs*0] = CC[0+bs*3];
1464 CC[1+bs*0] = CC[1+bs*3];
1465 CC[2+bs*0] = CC[2+bs*3];
1466 CC[3+bs*0] = CC[3+bs*3];
1467
1468 D0 += 3*bs;
1469 }
1470 }
1471
1472 n1 = 4<n1 ? 4 : n1;
1473 int kn = n1 - n0;
1474
1475 if(offsetD==0)
1476 {
1477 if(kn<=0)
1478 return;
1479
1480 if(m0<=0 & m1>0) D0[0+bs*0] = CC[0+bs*0];
1481 if(m0<=1 & m1>1) D0[1+bs*0] = CC[1+bs*0];
1482 if(m0<=2 & m1>2) D0[2+bs*0] = CC[2+bs*0];
1483 if(m0<=3 & m1>3) D0[3+bs*0] = CC[3+bs*0];
1484
1485 if(kn<=1)
1486 return;
1487
1488 if(m0<=0 & m1>0) D0[0+bs*1] = CC[0+bs*1];
1489 if(m0<=1 & m1>1) D0[1+bs*1] = CC[1+bs*1];
1490 if(m0<=2 & m1>2) D0[2+bs*1] = CC[2+bs*1];
1491 if(m0<=3 & m1>3) D0[3+bs*1] = CC[3+bs*1];
1492
1493 if(kn<=2)
1494 return;
1495
1496 if(m0<=0 & m1>0) D0[0+bs*2] = CC[0+bs*2];
1497 if(m0<=1 & m1>1) D0[1+bs*2] = CC[1+bs*2];
1498 if(m0<=2 & m1>2) D0[2+bs*2] = CC[2+bs*2];
1499 if(m0<=3 & m1>3) D0[3+bs*2] = CC[3+bs*2];
1500
1501 if(kn<=3)
1502 return;
1503
1504 if(m0<=0 & m1>0) D0[0+bs*3] = CC[0+bs*3];
1505 if(m0<=1 & m1>1) D0[1+bs*3] = CC[1+bs*3];
1506 if(m0<=2 & m1>2) D0[2+bs*3] = CC[2+bs*3];
1507 if(m0<=3 & m1>3) D0[3+bs*3] = CC[3+bs*3];
1508 }
1509 else if(offsetD==1)
1510 {
1511 D1 = D0 + sdd*bs;
1512
1513 if(kn<=0)
1514 return;
1515
1516 if(m0<=0 & m1>0) D0[1+bs*0] = CC[0+bs*0];
1517 if(m0<=1 & m1>1) D0[2+bs*0] = CC[1+bs*0];
1518 if(m0<=2 & m1>2) D0[3+bs*0] = CC[2+bs*0];
1519 if(m0<=3 & m1>3) D1[0+bs*0] = CC[3+bs*0];
1520
1521 if(kn<=1)
1522 return;
1523
1524 if(m0<=0 & m1>0) D0[1+bs*1] = CC[0+bs*1];
1525 if(m0<=1 & m1>1) D0[2+bs*1] = CC[1+bs*1];
1526 if(m0<=2 & m1>2) D0[3+bs*1] = CC[2+bs*1];
1527 if(m0<=3 & m1>3) D1[0+bs*1] = CC[3+bs*1];
1528
1529 if(kn<=2)
1530 return;
1531
1532 if(m0<=0 & m1>0) D0[1+bs*2] = CC[0+bs*2];
1533 if(m0<=1 & m1>1) D0[2+bs*2] = CC[1+bs*2];
1534 if(m0<=2 & m1>2) D0[3+bs*2] = CC[2+bs*2];
1535 if(m0<=3 & m1>3) D1[0+bs*2] = CC[3+bs*2];
1536
1537 if(kn<=3)
1538 return;
1539
1540 if(m0<=0 & m1>0) D0[1+bs*3] = CC[0+bs*3];
1541 if(m0<=1 & m1>1) D0[2+bs*3] = CC[1+bs*3];
1542 if(m0<=2 & m1>2) D0[3+bs*3] = CC[2+bs*3];
1543 if(m0<=3 & m1>3) D1[0+bs*3] = CC[3+bs*3];
1544 }
1545 else if(offsetD==2)
1546 {
1547 D1 = D0 + sdd*bs;
1548
1549 if(kn<=0)
1550 return;
1551
1552 if(m0<=0 & m1>0) D0[2+bs*0] = CC[0+bs*0];
1553 if(m0<=1 & m1>1) D0[3+bs*0] = CC[1+bs*0];
1554 if(m0<=2 & m1>2) D1[0+bs*0] = CC[2+bs*0];
1555 if(m0<=3 & m1>3) D1[1+bs*0] = CC[3+bs*0];
1556
1557 if(kn<=1)
1558 return;
1559
1560 if(m0<=0 & m1>0) D0[2+bs*1] = CC[0+bs*1];
1561 if(m0<=1 & m1>1) D0[3+bs*1] = CC[1+bs*1];
1562 if(m0<=2 & m1>2) D1[0+bs*1] = CC[2+bs*1];
1563 if(m0<=3 & m1>3) D1[1+bs*1] = CC[3+bs*1];
1564
1565 if(kn<=2)
1566 return;
1567
1568 if(m0<=0 & m1>0) D0[2+bs*2] = CC[0+bs*2];
1569 if(m0<=1 & m1>1) D0[3+bs*2] = CC[1+bs*2];
1570 if(m0<=2 & m1>2) D1[0+bs*2] = CC[2+bs*2];
1571 if(m0<=3 & m1>3) D1[1+bs*2] = CC[3+bs*2];
1572
1573 if(kn<=3)
1574 return;
1575
1576 if(m0<=0 & m1>0) D0[2+bs*3] = CC[0+bs*3];
1577 if(m0<=1 & m1>1) D0[3+bs*3] = CC[1+bs*3];
1578 if(m0<=2 & m1>2) D1[0+bs*3] = CC[2+bs*3];
1579 if(m0<=3 & m1>3) D1[1+bs*3] = CC[3+bs*3];
1580 }
1581 else //if(offsetD==3)
1582 {
1583 D1 = D0 + sdd*bs;
1584
1585 if(kn<=0)
1586 return;
1587
1588 if(m0<=0 & m1>0) D0[3+bs*0] = CC[0+bs*0];
1589 if(m0<=1 & m1>1) D1[0+bs*0] = CC[1+bs*0];
1590 if(m0<=2 & m1>2) D1[1+bs*0] = CC[2+bs*0];
1591 if(m0<=3 & m1>3) D1[2+bs*0] = CC[3+bs*0];
1592
1593 if(kn<=1)
1594 return;
1595
1596 if(m0<=0 & m1>0) D0[3+bs*1] = CC[0+bs*1];
1597 if(m0<=1 & m1>1) D1[0+bs*1] = CC[1+bs*1];
1598 if(m0<=2 & m1>2) D1[1+bs*1] = CC[2+bs*1];
1599 if(m0<=3 & m1>3) D1[2+bs*1] = CC[3+bs*1];
1600
1601 if(kn<=2)
1602 return;
1603
1604 if(m0<=0 & m1>0) D0[3+bs*2] = CC[0+bs*2];
1605 if(m0<=1 & m1>1) D1[0+bs*2] = CC[1+bs*2];
1606 if(m0<=2 & m1>2) D1[1+bs*2] = CC[2+bs*2];
1607 if(m0<=3 & m1>3) D1[2+bs*2] = CC[3+bs*2];
1608
1609 if(kn<=3)
1610 return;
1611
1612 if(m0<=0 & m1>0) D0[3+bs*3] = CC[0+bs*3];
1613 if(m0<=1 & m1>1) D1[0+bs*3] = CC[1+bs*3];
1614 if(m0<=2 & m1>2) D1[1+bs*3] = CC[2+bs*3];
1615 if(m0<=3 & m1>3) D1[2+bs*3] = CC[3+bs*3];
1616 }
1617
1618 return;
1619
1620 }
1621 #endif
1622
1623
1624
1625 #if defined(TARGET_GENERIC)
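// Compute one 4x4 block of D = beta*C + alpha * A^T * B^T ("tt"): the nn
// kernel is called with the operands swapped (yielding alpha*B*A in scratch),
// the scratch block is transposed in place, and beta*C is added on store.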
void kernel_dgemm_tt_4x4_lib4(int kmax, double *alpha, int offsetA, double *A, int sda, double *B, double *beta, double *C, double *D)
1627 {
1628
1629 const int bs = 4;
1630
1631 #if defined(TARGET_GENERIC)
1632 double CC[16] = {0};
1633 #else
1634 ALIGNED( double CC[16], 64 ) = {0};
1635 #endif
1636
1637 double beta1 = 0.0;
1638
1639 kernel_dgemm_nn_4x4_lib4(kmax, alpha, B, offsetA, A, sda, &beta1, CC, CC);
1640
1641 // transpose
1642 double tmp;
1643
1644 tmp = CC[1+bs*0]; CC[1+bs*0] = CC[0+bs*1]; CC[0+bs*1] = tmp;
1645 tmp = CC[2+bs*0]; CC[2+bs*0] = CC[0+bs*2]; CC[0+bs*2] = tmp;
1646 tmp = CC[3+bs*0]; CC[3+bs*0] = CC[0+bs*3]; CC[0+bs*3] = tmp;
1647 tmp = CC[2+bs*1]; CC[2+bs*1] = CC[1+bs*2]; CC[1+bs*2] = tmp;
1648 tmp = CC[3+bs*1]; CC[3+bs*1] = CC[1+bs*3]; CC[1+bs*3] = tmp;
1649 tmp = CC[3+bs*2]; CC[3+bs*2] = CC[2+bs*3]; CC[2+bs*3] = tmp;
1650
1651 // store
1652 D[0+bs*0] = beta[0]*C[0+bs*0] + CC[0+bs*0];
1653 D[1+bs*0] = beta[0]*C[1+bs*0] + CC[1+bs*0];
1654 D[2+bs*0] = beta[0]*C[2+bs*0] + CC[2+bs*0];
1655 D[3+bs*0] = beta[0]*C[3+bs*0] + CC[3+bs*0];
1656
1657 D[0+bs*1] = beta[0]*C[0+bs*1] + CC[0+bs*1];
1658 D[1+bs*1] = beta[0]*C[1+bs*1] + CC[1+bs*1];
1659 D[2+bs*1] = beta[0]*C[2+bs*1] + CC[2+bs*1];
1660 D[3+bs*1] = beta[0]*C[3+bs*1] + CC[3+bs*1];
1661
1662 D[0+bs*2] = beta[0]*C[0+bs*2] + CC[0+bs*2];
1663 D[1+bs*2] = beta[0]*C[1+bs*2] + CC[1+bs*2];
1664 D[2+bs*2] = beta[0]*C[2+bs*2] + CC[2+bs*2];
1665 D[3+bs*2] = beta[0]*C[3+bs*2] + CC[3+bs*2];
1666
1667 D[0+bs*3] = beta[0]*C[0+bs*3] + CC[0+bs*3];
1668 D[1+bs*3] = beta[0]*C[1+bs*3] + CC[1+bs*3];
1669 D[2+bs*3] = beta[0]*C[2+bs*3] + CC[2+bs*3];
1670 D[3+bs*3] = beta[0]*C[3+bs*3] + CC[3+bs*3];
1671
1672 return;
1673
1674 }
1675 #endif
1676
1677
1678
1679 #if defined(TARGET_GENERIC)
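// Variable-size tt variant: only the top-left km x kn part of
// beta*C + alpha*A^T*B^T is stored to D.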
void kernel_dgemm_tt_4x4_vs_lib4(int kmax, double *alpha, int offsetA, double *A, int sda, double *B, double *beta, double *C, double *D, int km, int kn)
1681 {
1682
1683 const int bs = 4;
1684
1685 #if defined(TARGET_GENERIC)
1686 double CC[16] = {0};
1687 #else
1688 ALIGNED( double CC[16], 64 ) = {0};
1689 #endif
1690
1691 double beta1 = 0.0;
1692
1693 kernel_dgemm_nn_4x4_lib4(kmax, alpha, B, offsetA, A, sda, &beta1, CC, CC);
1694
1695 // transpose
1696 double tmp;
1697
1698 tmp = CC[1+bs*0]; CC[1+bs*0] = CC[0+bs*1]; CC[0+bs*1] = tmp;
1699 tmp = CC[2+bs*0]; CC[2+bs*0] = CC[0+bs*2]; CC[0+bs*2] = tmp;
1700 tmp = CC[3+bs*0]; CC[3+bs*0] = CC[0+bs*3]; CC[0+bs*3] = tmp;
1701 tmp = CC[2+bs*1]; CC[2+bs*1] = CC[1+bs*2]; CC[1+bs*2] = tmp;
1702 tmp = CC[3+bs*1]; CC[3+bs*1] = CC[1+bs*3]; CC[1+bs*3] = tmp;
1703 tmp = CC[3+bs*2]; CC[3+bs*2] = CC[2+bs*3]; CC[2+bs*3] = tmp;
1704
1705 // store
1706 if(km>=4)
1707 {
1708 D[0+bs*0] = beta[0]*C[0+bs*0] + CC[0+bs*0];
1709 D[1+bs*0] = beta[0]*C[1+bs*0] + CC[1+bs*0];
1710 D[2+bs*0] = beta[0]*C[2+bs*0] + CC[2+bs*0];
1711 D[3+bs*0] = beta[0]*C[3+bs*0] + CC[3+bs*0];
1712
1713 if(kn==1)
1714 return;
1715
1716 D[0+bs*1] = beta[0]*C[0+bs*1] + CC[0+bs*1];
1717 D[1+bs*1] = beta[0]*C[1+bs*1] + CC[1+bs*1];
1718 D[2+bs*1] = beta[0]*C[2+bs*1] + CC[2+bs*1];
1719 D[3+bs*1] = beta[0]*C[3+bs*1] + CC[3+bs*1];
1720
1721 if(kn==2)
1722 return;
1723
1724 D[0+bs*2] = beta[0]*C[0+bs*2] + CC[0+bs*2];
1725 D[1+bs*2] = beta[0]*C[1+bs*2] + CC[1+bs*2];
1726 D[2+bs*2] = beta[0]*C[2+bs*2] + CC[2+bs*2];
1727 D[3+bs*2] = beta[0]*C[3+bs*2] + CC[3+bs*2];
1728
1729 if(kn==3)
1730 return;
1731
1732 D[0+bs*3] = beta[0]*C[0+bs*3] + CC[0+bs*3];
1733 D[1+bs*3] = beta[0]*C[1+bs*3] + CC[1+bs*3];
1734 D[2+bs*3] = beta[0]*C[2+bs*3] + CC[2+bs*3];
1735 D[3+bs*3] = beta[0]*C[3+bs*3] + CC[3+bs*3];
1736 }
1737 else if(km>=3)
1738 {
1739 D[0+bs*0] = beta[0]*C[0+bs*0] + CC[0+bs*0];
1740 D[1+bs*0] = beta[0]*C[1+bs*0] + CC[1+bs*0];
1741 D[2+bs*0] = beta[0]*C[2+bs*0] + CC[2+bs*0];
1742
1743 if(kn==1)
1744 return;
1745
1746 D[0+bs*1] = beta[0]*C[0+bs*1] + CC[0+bs*1];
1747 D[1+bs*1] = beta[0]*C[1+bs*1] + CC[1+bs*1];
1748 D[2+bs*1] = beta[0]*C[2+bs*1] + CC[2+bs*1];
1749
1750 if(kn==2)
1751 return;
1752
1753 D[0+bs*2] = beta[0]*C[0+bs*2] + CC[0+bs*2];
1754 D[1+bs*2] = beta[0]*C[1+bs*2] + CC[1+bs*2];
1755 D[2+bs*2] = beta[0]*C[2+bs*2] + CC[2+bs*2];
1756
1757 if(kn==3)
1758 return;
1759
1760 D[0+bs*3] = beta[0]*C[0+bs*3] + CC[0+bs*3];
1761 D[1+bs*3] = beta[0]*C[1+bs*3] + CC[1+bs*3];
1762 D[2+bs*3] = beta[0]*C[2+bs*3] + CC[2+bs*3];
1763 }
1764 else if(km>=2)
1765 {
1766 D[0+bs*0] = beta[0]*C[0+bs*0] + CC[0+bs*0];
1767 D[1+bs*0] = beta[0]*C[1+bs*0] + CC[1+bs*0];
1768
1769 if(kn==1)
1770 return;
1771
1772 D[0+bs*1] = beta[0]*C[0+bs*1] + CC[0+bs*1];
1773 D[1+bs*1] = beta[0]*C[1+bs*1] + CC[1+bs*1];
1774
1775 if(kn==2)
1776 return;
1777
1778 D[0+bs*2] = beta[0]*C[0+bs*2] + CC[0+bs*2];
1779 D[1+bs*2] = beta[0]*C[1+bs*2] + CC[1+bs*2];
1780
1781 if(kn==3)
1782 return;
1783
1784 D[0+bs*3] = beta[0]*C[0+bs*3] + CC[0+bs*3];
1785 D[1+bs*3] = beta[0]*C[1+bs*3] + CC[1+bs*3];
1786 }
1787 else //if(km>=1)
1788 {
1789 D[0+bs*0] = beta[0]*C[0+bs*0] + CC[0+bs*0];
1790
1791 if(kn==1)
1792 return;
1793
1794 D[0+bs*1] = beta[0]*C[0+bs*1] + CC[0+bs*1];
1795
1796 if(kn==2)
1797 return;
1798
1799 D[0+bs*2] = beta[0]*C[0+bs*2] + CC[0+bs*2];
1800
1801 if(kn==3)
1802 return;
1803
1804 D[0+bs*3] = beta[0]*C[0+bs*3] + CC[0+bs*3];
1805 }
1806
1807 return;
1808
1809 }
1810 #endif
1811
1812
1813
1814 #if defined(TARGET_GENERIC)
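// Generalized tt variant: the transposed scratch result is combined with
// beta*C read at row offset offsetC (possibly spanning panels C0/C1) and
// stored at row offset offsetD with [m0,m1) x [n0,n1) masking.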
void kernel_dgemm_tt_4x4_gen_lib4(int kmax, double *alpha, int offsetA, double *A, int sda, double *B, double *beta, int offsetC, double *C0, int sdc, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
1816 {
1817
1818 const int bs = 4;
1819
1820 #if defined(TARGET_GENERIC)
1821 double CC[16] = {0};
1822 #else
1823 ALIGNED( double CC[16], 64 ) = {0};
1824 #endif
1825
1826 double beta1 = 0.0;
1827
1828 kernel_dgemm_nn_4x4_lib4(kmax, alpha, B, offsetA, A, sda, &beta1, CC, CC);
1829
1830 // transpose
1831 double tmp;
1832
1833 tmp = CC[1+bs*0]; CC[1+bs*0] = CC[0+bs*1]; CC[0+bs*1] = tmp;
1834 tmp = CC[2+bs*0]; CC[2+bs*0] = CC[0+bs*2]; CC[0+bs*2] = tmp;
1835 tmp = CC[3+bs*0]; CC[3+bs*0] = CC[0+bs*3]; CC[0+bs*3] = tmp;
1836 tmp = CC[2+bs*1]; CC[2+bs*1] = CC[1+bs*2]; CC[1+bs*2] = tmp;
1837 tmp = CC[3+bs*1]; CC[3+bs*1] = CC[1+bs*3]; CC[1+bs*3] = tmp;
1838 tmp = CC[3+bs*2]; CC[3+bs*2] = CC[2+bs*3]; CC[2+bs*3] = tmp;
1839
1840 // update
1841 double
1842 *C1, *D1;
1843
1844 if(offsetC==0)
1845 {
1846 CC[0+bs*0] += beta[0]*C0[0+bs*0];
1847 CC[1+bs*0] += beta[0]*C0[1+bs*0];
1848 CC[2+bs*0] += beta[0]*C0[2+bs*0];
1849 CC[3+bs*0] += beta[0]*C0[3+bs*0];
1850
1851 CC[0+bs*1] += beta[0]*C0[0+bs*1];
1852 CC[1+bs*1] += beta[0]*C0[1+bs*1];
1853 CC[2+bs*1] += beta[0]*C0[2+bs*1];
1854 CC[3+bs*1] += beta[0]*C0[3+bs*1];
1855
1856 CC[0+bs*2] += beta[0]*C0[0+bs*2];
1857 CC[1+bs*2] += beta[0]*C0[1+bs*2];
1858 CC[2+bs*2] += beta[0]*C0[2+bs*2];
1859 CC[3+bs*2] += beta[0]*C0[3+bs*2];
1860
1861 CC[0+bs*3] += beta[0]*C0[0+bs*3];
1862 CC[1+bs*3] += beta[0]*C0[1+bs*3];
1863 CC[2+bs*3] += beta[0]*C0[2+bs*3];
1864 CC[3+bs*3] += beta[0]*C0[3+bs*3];
1865 }
1866 else if(offsetC==1)
1867 {
1868 C1 = C0 + sdc*bs;
1869
1870 CC[0+bs*0] += beta[0]*C0[1+bs*0];
1871 CC[1+bs*0] += beta[0]*C0[2+bs*0];
1872 CC[2+bs*0] += beta[0]*C0[3+bs*0];
1873 CC[3+bs*0] += beta[0]*C1[0+bs*0];
1874
1875 CC[0+bs*1] += beta[0]*C0[1+bs*1];
1876 CC[1+bs*1] += beta[0]*C0[2+bs*1];
1877 CC[2+bs*1] += beta[0]*C0[3+bs*1];
1878 CC[3+bs*1] += beta[0]*C1[0+bs*1];
1879
1880 CC[0+bs*2] += beta[0]*C0[1+bs*2];
1881 CC[1+bs*2] += beta[0]*C0[2+bs*2];
1882 CC[2+bs*2] += beta[0]*C0[3+bs*2];
1883 CC[3+bs*2] += beta[0]*C1[0+bs*2];
1884
1885 CC[0+bs*3] += beta[0]*C0[1+bs*3];
1886 CC[1+bs*3] += beta[0]*C0[2+bs*3];
1887 CC[2+bs*3] += beta[0]*C0[3+bs*3];
1888 CC[3+bs*3] += beta[0]*C1[0+bs*3];
1889 }
1890 else if(offsetC==2)
1891 {
1892 C1 = C0 + sdc*bs;
1893
1894 CC[0+bs*0] += beta[0]*C0[2+bs*0];
1895 CC[1+bs*0] += beta[0]*C0[3+bs*0];
1896 CC[2+bs*0] += beta[0]*C1[0+bs*0];
1897 CC[3+bs*0] += beta[0]*C1[1+bs*0];
1898
1899 CC[0+bs*1] += beta[0]*C0[2+bs*1];
1900 CC[1+bs*1] += beta[0]*C0[3+bs*1];
1901 CC[2+bs*1] += beta[0]*C1[0+bs*1];
1902 CC[3+bs*1] += beta[0]*C1[1+bs*1];
1903
1904 CC[0+bs*2] += beta[0]*C0[2+bs*2];
1905 CC[1+bs*2] += beta[0]*C0[3+bs*2];
1906 CC[2+bs*2] += beta[0]*C1[0+bs*2];
1907 CC[3+bs*2] += beta[0]*C1[1+bs*2];
1908
1909 CC[0+bs*3] += beta[0]*C0[2+bs*3];
1910 CC[1+bs*3] += beta[0]*C0[3+bs*3];
1911 CC[2+bs*3] += beta[0]*C1[0+bs*3];
1912 CC[3+bs*3] += beta[0]*C1[1+bs*3];
1913 }
1914 else //if(offsetC==3)
1915 {
1916 C1 = C0 + sdc*bs;
1917
1918 CC[0+bs*0] += beta[0]*C0[3+bs*0];
1919 CC[1+bs*0] += beta[0]*C1[0+bs*0];
1920 CC[2+bs*0] += beta[0]*C1[1+bs*0];
1921 CC[3+bs*0] += beta[0]*C1[2+bs*0];
1922
1923 CC[0+bs*1] += beta[0]*C0[3+bs*1];
1924 CC[1+bs*1] += beta[0]*C1[0+bs*1];
1925 CC[2+bs*1] += beta[0]*C1[1+bs*1];
1926 CC[3+bs*1] += beta[0]*C1[2+bs*1];
1927
1928 CC[0+bs*2] += beta[0]*C0[3+bs*2];
1929 CC[1+bs*2] += beta[0]*C1[0+bs*2];
1930 CC[2+bs*2] += beta[0]*C1[1+bs*2];
1931 CC[3+bs*2] += beta[0]*C1[2+bs*2];
1932
1933 CC[0+bs*3] += beta[0]*C0[3+bs*3];
1934 CC[1+bs*3] += beta[0]*C1[0+bs*3];
1935 CC[2+bs*3] += beta[0]*C1[1+bs*3];
1936 CC[3+bs*3] += beta[0]*C1[2+bs*3];
1937 }
1938
1939 // store
1940
	// shift solution left to skip the first n0 columns
1942 if(n0>0)
1943 {
1944 if(n0==1)
1945 {
1946 CC[0+bs*0] = CC[0+bs*1];
1947 CC[1+bs*0] = CC[1+bs*1];
1948 CC[2+bs*0] = CC[2+bs*1];
1949 CC[3+bs*0] = CC[3+bs*1];
1950
1951 CC[0+bs*1] = CC[0+bs*2];
1952 CC[1+bs*1] = CC[1+bs*2];
1953 CC[2+bs*1] = CC[2+bs*2];
1954 CC[3+bs*1] = CC[3+bs*2];
1955
1956 CC[0+bs*2] = CC[0+bs*3];
1957 CC[1+bs*2] = CC[1+bs*3];
1958 CC[2+bs*2] = CC[2+bs*3];
1959 CC[3+bs*2] = CC[3+bs*3];
1960
1961 D0 += 1*bs;
1962 }
1963 else if(n0==2)
1964 {
1965 CC[0+bs*0] = CC[0+bs*2];
1966 CC[1+bs*0] = CC[1+bs*2];
1967 CC[2+bs*0] = CC[2+bs*2];
1968 CC[3+bs*0] = CC[3+bs*2];
1969
1970 CC[0+bs*1] = CC[0+bs*3];
1971 CC[1+bs*1] = CC[1+bs*3];
1972 CC[2+bs*1] = CC[2+bs*3];
1973 CC[3+bs*1] = CC[3+bs*3];
1974
1975 D0 += 2*bs;
1976 }
1977 else //if(n0==3)
1978 {
1979 CC[0+bs*0] = CC[0+bs*3];
1980 CC[1+bs*0] = CC[1+bs*3];
1981 CC[2+bs*0] = CC[2+bs*3];
1982 CC[3+bs*0] = CC[3+bs*3];
1983
1984 D0 += 3*bs;
1985 }
1986 }
1987
1988 n1 = 4<n1 ? 4 : n1;
1989 int kn = n1 - n0;
1990
1991 if(offsetD==0)
1992 {
1993 if(kn<=0)
1994 return;
1995
1996 if(m0<=0 & m1>0) D0[0+bs*0] = CC[0+bs*0];
1997 if(m0<=1 & m1>1) D0[1+bs*0] = CC[1+bs*0];
1998 if(m0<=2 & m1>2) D0[2+bs*0] = CC[2+bs*0];
1999 if(m0<=3 & m1>3) D0[3+bs*0] = CC[3+bs*0];
2000
2001 if(kn<=1)
2002 return;
2003
2004 if(m0<=0 & m1>0) D0[0+bs*1] = CC[0+bs*1];
2005 if(m0<=1 & m1>1) D0[1+bs*1] = CC[1+bs*1];
2006 if(m0<=2 & m1>2) D0[2+bs*1] = CC[2+bs*1];
2007 if(m0<=3 & m1>3) D0[3+bs*1] = CC[3+bs*1];
2008
2009 if(kn<=2)
2010 return;
2011
2012 if(m0<=0 & m1>0) D0[0+bs*2] = CC[0+bs*2];
2013 if(m0<=1 & m1>1) D0[1+bs*2] = CC[1+bs*2];
2014 if(m0<=2 & m1>2) D0[2+bs*2] = CC[2+bs*2];
2015 if(m0<=3 & m1>3) D0[3+bs*2] = CC[3+bs*2];
2016
2017 if(kn<=3)
2018 return;
2019
2020 if(m0<=0 & m1>0) D0[0+bs*3] = CC[0+bs*3];
2021 if(m0<=1 & m1>1) D0[1+bs*3] = CC[1+bs*3];
2022 if(m0<=2 & m1>2) D0[2+bs*3] = CC[2+bs*3];
2023 if(m0<=3 & m1>3) D0[3+bs*3] = CC[3+bs*3];
2024 }
2025 else if(offsetD==1)
2026 {
2027 D1 = D0 + sdd*bs;
2028
2029 if(kn<=0)
2030 return;
2031
2032 if(m0<=0 & m1>0) D0[1+bs*0] = CC[0+bs*0];
2033 if(m0<=1 & m1>1) D0[2+bs*0] = CC[1+bs*0];
2034 if(m0<=2 & m1>2) D0[3+bs*0] = CC[2+bs*0];
2035 if(m0<=3 & m1>3) D1[0+bs*0] = CC[3+bs*0];
2036
2037 if(kn<=1)
2038 return;
2039
2040 if(m0<=0 & m1>0) D0[1+bs*1] = CC[0+bs*1];
2041 if(m0<=1 & m1>1) D0[2+bs*1] = CC[1+bs*1];
2042 if(m0<=2 & m1>2) D0[3+bs*1] = CC[2+bs*1];
2043 if(m0<=3 & m1>3) D1[0+bs*1] = CC[3+bs*1];
2044
2045 if(kn<=2)
2046 return;
2047
2048 if(m0<=0 & m1>0) D0[1+bs*2] = CC[0+bs*2];
2049 if(m0<=1 & m1>1) D0[2+bs*2] = CC[1+bs*2];
2050 if(m0<=2 & m1>2) D0[3+bs*2] = CC[2+bs*2];
2051 if(m0<=3 & m1>3) D1[0+bs*2] = CC[3+bs*2];
2052
2053 if(kn<=3)
2054 return;
2055
2056 if(m0<=0 & m1>0) D0[1+bs*3] = CC[0+bs*3];
2057 if(m0<=1 & m1>1) D0[2+bs*3] = CC[1+bs*3];
2058 if(m0<=2 & m1>2) D0[3+bs*3] = CC[2+bs*3];
2059 if(m0<=3 & m1>3) D1[0+bs*3] = CC[3+bs*3];
2060 }
2061 else if(offsetD==2)
2062 {
2063 D1 = D0 + sdd*bs;
2064
2065 if(kn<=0)
2066 return;
2067
2068 if(m0<=0 & m1>0) D0[2+bs*0] = CC[0+bs*0];
2069 if(m0<=1 & m1>1) D0[3+bs*0] = CC[1+bs*0];
2070 if(m0<=2 & m1>2) D1[0+bs*0] = CC[2+bs*0];
2071 if(m0<=3 & m1>3) D1[1+bs*0] = CC[3+bs*0];
2072
2073 if(kn<=1)
2074 return;
2075
2076 if(m0<=0 & m1>0) D0[2+bs*1] = CC[0+bs*1];
2077 if(m0<=1 & m1>1) D0[3+bs*1] = CC[1+bs*1];
2078 if(m0<=2 & m1>2) D1[0+bs*1] = CC[2+bs*1];
2079 if(m0<=3 & m1>3) D1[1+bs*1] = CC[3+bs*1];
2080
2081 if(kn<=2)
2082 return;
2083
2084 if(m0<=0 & m1>0) D0[2+bs*2] = CC[0+bs*2];
2085 if(m0<=1 & m1>1) D0[3+bs*2] = CC[1+bs*2];
2086 if(m0<=2 & m1>2) D1[0+bs*2] = CC[2+bs*2];
2087 if(m0<=3 & m1>3) D1[1+bs*2] = CC[3+bs*2];
2088
2089 if(kn<=3)
2090 return;
2091
2092 if(m0<=0 & m1>0) D0[2+bs*3] = CC[0+bs*3];
2093 if(m0<=1 & m1>1) D0[3+bs*3] = CC[1+bs*3];
2094 if(m0<=2 & m1>2) D1[0+bs*3] = CC[2+bs*3];
2095 if(m0<=3 & m1>3) D1[1+bs*3] = CC[3+bs*3];
2096 }
2097 else //if(offsetD==3)
2098 {
2099 D1 = D0 + sdd*bs;
2100
2101 if(kn<=0)
2102 return;
2103
2104 if(m0<=0 & m1>0) D0[3+bs*0] = CC[0+bs*0];
2105 if(m0<=1 & m1>1) D1[0+bs*0] = CC[1+bs*0];
2106 if(m0<=2 & m1>2) D1[1+bs*0] = CC[2+bs*0];
2107 if(m0<=3 & m1>3) D1[2+bs*0] = CC[3+bs*0];
2108
2109 if(kn<=1)
2110 return;
2111
2112 if(m0<=0 & m1>0) D0[3+bs*1] = CC[0+bs*1];
2113 if(m0<=1 & m1>1) D1[0+bs*1] = CC[1+bs*1];
2114 if(m0<=2 & m1>2) D1[1+bs*1] = CC[2+bs*1];
2115 if(m0<=3 & m1>3) D1[2+bs*1] = CC[3+bs*1];
2116
2117 if(kn<=2)
2118 return;
2119
2120 if(m0<=0 & m1>0) D0[3+bs*2] = CC[0+bs*2];
2121 if(m0<=1 & m1>1) D1[0+bs*2] = CC[1+bs*2];
2122 if(m0<=2 & m1>2) D1[1+bs*2] = CC[2+bs*2];
2123 if(m0<=3 & m1>3) D1[2+bs*2] = CC[3+bs*2];
2124
2125 if(kn<=3)
2126 return;
2127
2128 if(m0<=0 & m1>0) D0[3+bs*3] = CC[0+bs*3];
2129 if(m0<=1 & m1>1) D1[0+bs*3] = CC[1+bs*3];
2130 if(m0<=2 & m1>2) D1[1+bs*3] = CC[2+bs*3];
2131 if(m0<=3 & m1>3) D1[2+bs*3] = CC[3+bs*3];
2132 }
2133
2134 return;
2135
2136 }
2137 #endif
2138
2139
2140
2141 #if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9)
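// SYRK-type kernel (upper): computes the full 4x4 nn product into scratch and
// stores only the upper triangle of the block to D.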
void kernel_dsyrk_nn_u_4x4_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D)
2143 {
2144
2145 const int bs = 4;
2146
2147 #if defined(TARGET_GENERIC)
2148 double CC[16] = {0};
2149 #else
2150 ALIGNED( double CC[16], 64 ) = {0};
2151 #endif
2152
2153 kernel_dgemm_nn_4x4_lib4(kmax, alpha, A, offsetB, B, sdb, beta, C, CC);
2154
2155 D[0+bs*0] = CC[0+bs*0];
2156
2157 D[0+bs*1] = CC[0+bs*1];
2158 D[1+bs*1] = CC[1+bs*1];
2159
2160 D[0+bs*2] = CC[0+bs*2];
2161 D[1+bs*2] = CC[1+bs*2];
2162 D[2+bs*2] = CC[2+bs*2];
2163
2164 D[0+bs*3] = CC[0+bs*3];
2165 D[1+bs*3] = CC[1+bs*3];
2166 D[2+bs*3] = CC[2+bs*3];
2167 D[3+bs*3] = CC[3+bs*3];
2168
2169 return;
2170
2171 }
2172 #endif
2173
2174
2175
2176 #if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9)
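// Variable-size variant: stores only the upper-triangular part of the leading
// km x kn sub-block to D.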
void kernel_dsyrk_nn_u_4x4_vs_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D, int km, int kn)
2178 {
2179
2180 const int bs = 4;
2181
2182 #if defined(TARGET_GENERIC)
2183 double CC[16] = {0};
2184 #else
2185 ALIGNED( double CC[16], 64 ) = {0};
2186 #endif
2187
2188 kernel_dgemm_nn_4x4_lib4(kmax, alpha, A, offsetB, B, sdb, beta, C, CC);
2189
2190 if(km>=4)
2191 {
2192 D[0+bs*0] = CC[0+bs*0];
2193
2194 if(kn==1)
2195 return;
2196
2197 D[0+bs*1] = CC[0+bs*1];
2198 D[1+bs*1] = CC[1+bs*1];
2199
2200 if(kn==2)
2201 return;
2202
2203 D[0+bs*2] = CC[0+bs*2];
2204 D[1+bs*2] = CC[1+bs*2];
2205 D[2+bs*2] = CC[2+bs*2];
2206
2207 if(kn==3)
2208 return;
2209
2210 D[0+bs*3] = CC[0+bs*3];
2211 D[1+bs*3] = CC[1+bs*3];
2212 D[2+bs*3] = CC[2+bs*3];
2213 D[3+bs*3] = CC[3+bs*3];
2214 }
2215 else if(km>=3)
2216 {
2217 D[0+bs*0] = CC[0+bs*0];
2218
2219 if(kn==1)
2220 return;
2221
2222 D[0+bs*1] = CC[0+bs*1];
2223 D[1+bs*1] = CC[1+bs*1];
2224
2225 if(kn==2)
2226 return;
2227
2228 D[0+bs*2] = CC[0+bs*2];
2229 D[1+bs*2] = CC[1+bs*2];
2230 D[2+bs*2] = CC[2+bs*2];
2231
2232 if(kn==3)
2233 return;
2234
2235 D[0+bs*3] = CC[0+bs*3];
2236 D[1+bs*3] = CC[1+bs*3];
2237 D[2+bs*3] = CC[2+bs*3];
2238 }
2239 else if(km>=2)
2240 {
2241 D[0+bs*0] = CC[0+bs*0];
2242
2243 if(kn==1)
2244 return;
2245
2246 D[0+bs*1] = CC[0+bs*1];
2247 D[1+bs*1] = CC[1+bs*1];
2248
2249 if(kn==2)
2250 return;
2251
2252 D[0+bs*2] = CC[0+bs*2];
2253 D[1+bs*2] = CC[1+bs*2];
2254
2255 if(kn==3)
2256 return;
2257
2258 D[0+bs*3] = CC[0+bs*3];
2259 D[1+bs*3] = CC[1+bs*3];
2260 }
2261 else //if(km>=1)
2262 {
2263 D[0+bs*0] = CC[0+bs*0];
2264
2265 if(kn==1)
2266 return;
2267
2268 D[0+bs*1] = CC[0+bs*1];
2269
2270 if(kn==2)
2271 return;
2272
2273 D[0+bs*2] = CC[0+bs*2];
2274
2275 if(kn==3)
2276 return;
2277
2278 D[0+bs*3] = CC[0+bs*3];
2279 }
2280
2281 return;
2282
2283 }
2284 #endif
2285
2286
2287
2288 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR)
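// SYRK-type kernel (lower): computes the full 4x4 nt product into scratch and
// stores only the lower triangle of the block to D.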
2289 void kernel_dsyrk_nt_l_4x4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
2290 {
2291
2292 const int bs = 4;
2293
2294 #if defined(TARGET_GENERIC)
2295 double CC[16] = {0};
2296 #else
2297 ALIGNED( double CC[16], 64 ) = {0};
2298 #endif
2299
2300 kernel_dgemm_nt_4x4_lib4(kmax, alpha, A, B, beta, C, CC);
2301
2302 D[0+bs*0] = CC[0+bs*0];
2303 D[1+bs*0] = CC[1+bs*0];
2304 D[2+bs*0] = CC[2+bs*0];
2305 D[3+bs*0] = CC[3+bs*0];
2306
2307 D[1+bs*1] = CC[1+bs*1];
2308 D[2+bs*1] = CC[2+bs*1];
2309 D[3+bs*1] = CC[3+bs*1];
2310
2311 D[2+bs*2] = CC[2+bs*2];
2312 D[3+bs*2] = CC[3+bs*2];
2313
2314 D[3+bs*3] = CC[3+bs*3];
2315
2316 return;
2317
2318 }
2319 #endif
2320
2321
2322
2323 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9)
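// variable-size variant: stores at most km rows and kn columns of the lower-triangular result.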
2324 void kernel_dsyrk_nt_l_4x4_vs_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn)
2325 {
2326
2327 const int bs = 4;
2328
2329 #if defined(TARGET_GENERIC)
2330 double CC[16] = {0};
2331 #else
2332 ALIGNED( double CC[16], 64 ) = {0};
2333 #endif
2334
2335 kernel_dgemm_nt_4x4_lib4(kmax, alpha, A, B, beta, C, CC);
2336
2337 if(km>=4)
2338 {
2339 D[0+bs*0] = CC[0+bs*0];
2340 D[1+bs*0] = CC[1+bs*0];
2341 D[2+bs*0] = CC[2+bs*0];
2342 D[3+bs*0] = CC[3+bs*0];
2343
2344 if(kn==1)
2345 return;
2346
2347 D[1+bs*1] = CC[1+bs*1];
2348 D[2+bs*1] = CC[2+bs*1];
2349 D[3+bs*1] = CC[3+bs*1];
2350
2351 if(kn==2)
2352 return;
2353
2354 D[2+bs*2] = CC[2+bs*2];
2355 D[3+bs*2] = CC[3+bs*2];
2356
2357 if(kn==3)
2358 return;
2359
2360 D[3+bs*3] = CC[3+bs*3];
2361 }
2362 else if(km>=3)
2363 {
2364 D[0+bs*0] = CC[0+bs*0];
2365 D[1+bs*0] = CC[1+bs*0];
2366 D[2+bs*0] = CC[2+bs*0];
2367
2368 if(kn==1)
2369 return;
2370
2371 D[1+bs*1] = CC[1+bs*1];
2372 D[2+bs*1] = CC[2+bs*1];
2373
2374 if(kn==2)
2375 return;
2376
2377 D[2+bs*2] = CC[2+bs*2];
2378 }
2379 else if(km>=2)
2380 {
2381 D[0+bs*0] = CC[0+bs*0];
2382 D[1+bs*0] = CC[1+bs*0];
2383
2384 if(kn==1)
2385 return;
2386
2387 D[1+bs*1] = CC[1+bs*1];
2388 }
2389 else //if(km>=1)
2390 {
2391 D[0+bs*0] = CC[0+bs*0];
2392 }
2393
2394 return;
2395
2396 }
2397 #endif
2398
2399
2400
2401 #if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
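// generalized dsyrk 'nt' lower kernel: C and D may start at an arbitrary row offset
// (offsetC/offsetD) inside a 4-row panel, so each may span two panels (C0/C1, D0/D1 with
// strides sdc/sdd); m0/m1 and n0/n1 select the rows and columns actually written.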
2402 void kernel_dsyrk_nt_l_4x4_gen_lib4(int kmax, double *alpha, double *A, double *B, double *beta, int offsetC, double *C0, int sdc, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
2403 {
2404
2405 const int bs = 4;
2406
2407 #if defined(TARGET_GENERIC)
2408 double CC[16] = {0};
2409 #else
2410 ALIGNED( double CC[16], 64 ) = {0};
2411 #endif
2412
2413 double
2414 *C1, *D1;
2415
2416 if(offsetC==0)
2417 {
2418 CC[0+bs*0] = beta[0]*C0[0+bs*0];
2419 CC[1+bs*0] = beta[0]*C0[1+bs*0];
2420 CC[2+bs*0] = beta[0]*C0[2+bs*0];
2421 CC[3+bs*0] = beta[0]*C0[3+bs*0];
2422
2423 CC[1+bs*1] = beta[0]*C0[1+bs*1];
2424 CC[2+bs*1] = beta[0]*C0[2+bs*1];
2425 CC[3+bs*1] = beta[0]*C0[3+bs*1];
2426
2427 CC[2+bs*2] = beta[0]*C0[2+bs*2];
2428 CC[3+bs*2] = beta[0]*C0[3+bs*2];
2429
2430 CC[3+bs*3] = beta[0]*C0[3+bs*3];
2431 }
2432 else if(offsetC==1)
2433 {
2434 C1 = C0 + sdc*bs;
2435
2436 CC[0+bs*0] = beta[0]*C0[1+bs*0];
2437 CC[1+bs*0] = beta[0]*C0[2+bs*0];
2438 CC[2+bs*0] = beta[0]*C0[3+bs*0];
2439 CC[3+bs*0] = beta[0]*C1[0+bs*0];
2440
2441 CC[1+bs*1] = beta[0]*C0[2+bs*1];
2442 CC[2+bs*1] = beta[0]*C0[3+bs*1];
2443 CC[3+bs*1] = beta[0]*C1[0+bs*1];
2444
2445 CC[2+bs*2] = beta[0]*C0[3+bs*2];
2446 CC[3+bs*2] = beta[0]*C1[0+bs*2];
2447
2448 CC[3+bs*3] = beta[0]*C1[0+bs*3];
2449 }
2450 else if(offsetC==2)
2451 {
2452 C1 = C0 + sdc*bs;
2453
2454 CC[0+bs*0] = beta[0]*C0[2+bs*0];
2455 CC[1+bs*0] = beta[0]*C0[3+bs*0];
2456 CC[2+bs*0] = beta[0]*C1[0+bs*0];
2457 CC[3+bs*0] = beta[0]*C1[1+bs*0];
2458
2459 CC[1+bs*1] = beta[0]*C0[3+bs*1];
2460 CC[2+bs*1] = beta[0]*C1[0+bs*1];
2461 CC[3+bs*1] = beta[0]*C1[1+bs*1];
2462
2463 CC[2+bs*2] = beta[0]*C1[0+bs*2];
2464 CC[3+bs*2] = beta[0]*C1[1+bs*2];
2465
2466 CC[3+bs*3] = beta[0]*C1[1+bs*3];
2467 }
2468 else //if(offsetC==3)
2469 {
2470 C1 = C0 + sdc*bs;
2471
2472 CC[0+bs*0] = beta[0]*C0[3+bs*0];
2473 CC[1+bs*0] = beta[0]*C1[0+bs*0];
2474 CC[2+bs*0] = beta[0]*C1[1+bs*0];
2475 CC[3+bs*0] = beta[0]*C1[2+bs*0];
2476
2477 CC[1+bs*1] = beta[0]*C1[0+bs*1];
2478 CC[2+bs*1] = beta[0]*C1[1+bs*1];
2479 CC[3+bs*1] = beta[0]*C1[2+bs*1];
2480
2481 CC[2+bs*2] = beta[0]*C1[1+bs*2];
2482 CC[3+bs*2] = beta[0]*C1[2+bs*2];
2483
2484 CC[3+bs*3] = beta[0]*C1[2+bs*3];
2485 }
2486
2487 double beta1 = 1.0;
2488
2489 kernel_dgemm_nt_4x4_lib4(kmax, alpha, A, B, &beta1, CC, CC);
2490
2491 // shift the computed block left by n0 columns, since the first n0 columns are not stored
2492 if(n0>0)
2493 {
2494 if(n0==1)
2495 {
2496 CC[0+bs*0] = CC[0+bs*1];
2497 CC[1+bs*0] = CC[1+bs*1];
2498 CC[2+bs*0] = CC[2+bs*1];
2499 CC[3+bs*0] = CC[3+bs*1];
2500
2501 CC[0+bs*1] = CC[0+bs*2];
2502 CC[1+bs*1] = CC[1+bs*2];
2503 CC[2+bs*1] = CC[2+bs*2];
2504 CC[3+bs*1] = CC[3+bs*2];
2505
2506 CC[0+bs*2] = CC[0+bs*3];
2507 CC[1+bs*2] = CC[1+bs*3];
2508 CC[2+bs*2] = CC[2+bs*3];
2509 CC[3+bs*2] = CC[3+bs*3];
2510
2511 D0 += 1*bs;
2512 }
2513 else if(n0==2)
2514 {
2515 CC[0+bs*0] = CC[0+bs*2];
2516 CC[1+bs*0] = CC[1+bs*2];
2517 CC[2+bs*0] = CC[2+bs*2];
2518 CC[3+bs*0] = CC[3+bs*2];
2519
2520 CC[0+bs*1] = CC[0+bs*3];
2521 CC[1+bs*1] = CC[1+bs*3];
2522 CC[2+bs*1] = CC[2+bs*3];
2523 CC[3+bs*1] = CC[3+bs*3];
2524
2525 D0 += 2*bs;
2526 }
2527 else //if(n0==3)
2528 {
2529 CC[0+bs*0] = CC[0+bs*3];
2530 CC[1+bs*0] = CC[1+bs*3];
2531 CC[2+bs*0] = CC[2+bs*3];
2532 CC[3+bs*0] = CC[3+bs*3];
2533
2534 D0 += 3*bs;
2535 }
2536 }
2537
2538 n1 = 4<n1 ? 4 : n1;
2539 int kn = n1 - n0;
2540
2541 if(offsetD==0)
2542 {
2543 if(m0<=0)
2544 {
2545 if(kn<=0)
2546 return;
2547
2548 if(m1>0) D0[0+bs*0] = CC[0+bs*0];
2549 if(m1>1) D0[1+bs*0] = CC[1+bs*0];
2550 if(m1>2) D0[2+bs*0] = CC[2+bs*0];
2551 if(m1>3) D0[3+bs*0] = CC[3+bs*0];
2552
2553 if(kn<=1)
2554 return;
2555
2556 if(m1>1) D0[1+bs*1] = CC[1+bs*1];
2557 if(m1>2) D0[2+bs*1] = CC[2+bs*1];
2558 if(m1>3) D0[3+bs*1] = CC[3+bs*1];
2559
2560 if(kn<=2)
2561 return;
2562
2563 if(m1>2) D0[2+bs*2] = CC[2+bs*2];
2564 if(m1>3) D0[3+bs*2] = CC[3+bs*2];
2565
2566 if(kn<=3)
2567 return;
2568
2569 if(m1>3) D0[3+bs*3] = CC[3+bs*3];
2570 }
2571 else if(m0<=1)
2572 {
2573 if(kn<=0)
2574 return;
2575
2576 if(m1>1) D0[1+bs*0] = CC[1+bs*0];
2577 if(m1>2) D0[2+bs*0] = CC[2+bs*0];
2578 if(m1>3) D0[3+bs*0] = CC[3+bs*0];
2579
2580 if(kn<=1)
2581 return;
2582
2583 if(m1>2) D0[2+bs*1] = CC[2+bs*1];
2584 if(m1>3) D0[3+bs*1] = CC[3+bs*1];
2585
2586 if(kn<=2)
2587 return;
2588
2589 if(m1>3) D0[3+bs*2] = CC[3+bs*2];
2590 }
2591 else if(m0<=2)
2592 {
2593 if(kn<=0)
2594 return;
2595
2596 if(m1>2) D0[2+bs*0] = CC[2+bs*0];
2597 if(m1>3) D0[3+bs*0] = CC[3+bs*0];
2598
2599 if(kn<=1)
2600 return;
2601
2602 if(m1>3) D0[3+bs*1] = CC[3+bs*1];
2603 }
2604 else if(m0<=3)
2605 {
2606 if(kn<=0)
2607 return;
2608
2609 if(m1>3) D0[3+bs*0] = CC[3+bs*0];
2610 }
2611 }
2612 else if(offsetD==1)
2613 {
2614 D1 = D0 + sdd*bs;
2615 if(m0<=0)
2616 {
2617 if(kn<=0)
2618 return;
2619
2620 if(m1>0) D0[1+bs*0] = CC[0+bs*0];
2621 if(m1>1) D0[2+bs*0] = CC[1+bs*0];
2622 if(m1>2) D0[3+bs*0] = CC[2+bs*0];
2623 if(m1>3) D1[0+bs*0] = CC[3+bs*0];
2624
2625 if(kn<=1)
2626 return;
2627
2628 if(m1>1) D0[2+bs*1] = CC[1+bs*1];
2629 if(m1>2) D0[3+bs*1] = CC[2+bs*1];
2630 if(m1>3) D1[0+bs*1] = CC[3+bs*1];
2631
2632 if(kn<=2)
2633 return;
2634
2635 if(m1>2) D0[3+bs*2] = CC[2+bs*2];
2636 if(m1>3) D1[0+bs*2] = CC[3+bs*2];
2637
2638 if(kn<=3)
2639 return;
2640
2641 if(m1>3) D1[0+bs*3] = CC[3+bs*3];
2642 }
2643 else if(m0<=1)
2644 {
2645 if(kn<=0)
2646 return;
2647
2648 if(m1>1) D0[2+bs*0] = CC[1+bs*0];
2649 if(m1>2) D0[3+bs*0] = CC[2+bs*0];
2650 if(m1>3) D1[0+bs*0] = CC[3+bs*0];
2651
2652 if(kn<=1)
2653 return;
2654
2655 if(m1>2) D0[3+bs*1] = CC[2+bs*1];
2656 if(m1>3) D1[0+bs*1] = CC[3+bs*1];
2657
2658 if(kn<=2)
2659 return;
2660
2661 if(m1>3) D1[0+bs*2] = CC[3+bs*2];
2662 }
2663 else if(m0<=2)
2664 {
2665 if(kn<=0)
2666 return;
2667
2668 if(m1>2) D0[3+bs*0] = CC[2+bs*0];
2669 if(m1>3) D1[0+bs*0] = CC[3+bs*0];
2670
2671 if(kn<=1)
2672 return;
2673
2674 if(m1>3) D1[0+bs*1] = CC[3+bs*1];
2675 }
2676 else if(m0<=3)
2677 {
2678 if(kn<=0)
2679 return;
2680
2681 if(m1>3) D1[0+bs*0] = CC[3+bs*0];
2682 }
2683 }
2684 else if(offsetD==2)
2685 {
2686 D1 = D0 + sdd*bs;
2687 if(m0<=0)
2688 {
2689 if(kn<=0)
2690 return;
2691
2692 if(m1>0) D0[2+bs*0] = CC[0+bs*0];
2693 if(m1>1) D0[3+bs*0] = CC[1+bs*0];
2694 if(m1>2) D1[0+bs*0] = CC[2+bs*0];
2695 if(m1>3) D1[1+bs*0] = CC[3+bs*0];
2696
2697 if(kn<=1)
2698 return;
2699
2700 if(m1>1) D0[3+bs*1] = CC[1+bs*1];
2701 if(m1>2) D1[0+bs*1] = CC[2+bs*1];
2702 if(m1>3) D1[1+bs*1] = CC[3+bs*1];
2703
2704 if(kn<=2)
2705 return;
2706
2707 if(m1>2) D1[0+bs*2] = CC[2+bs*2];
2708 if(m1>3) D1[1+bs*2] = CC[3+bs*2];
2709
2710 if(kn<=3)
2711 return;
2712
2713 if(m1>3) D1[1+bs*3] = CC[3+bs*3];
2714 }
2715 else if(m0<=1)
2716 {
2717 if(kn<=0)
2718 return;
2719
2720 if(m1>1) D0[3+bs*0] = CC[1+bs*0];
2721 if(m1>2) D1[0+bs*0] = CC[2+bs*0];
2722 if(m1>3) D1[1+bs*0] = CC[3+bs*0];
2723
2724 if(kn<=1)
2725 return;
2726
2727 if(m1>2) D1[0+bs*1] = CC[2+bs*1];
2728 if(m1>3) D1[1+bs*1] = CC[3+bs*1];
2729
2730 if(kn<=2)
2731 return;
2732
2733 if(m1>3) D1[1+bs*2] = CC[3+bs*2];
2734 }
2735 else if(m0<=2)
2736 {
2737 if(kn<=0)
2738 return;
2739
2740 if(m1>2) D1[0+bs*0] = CC[2+bs*0];
2741 if(m1>3) D1[1+bs*0] = CC[3+bs*0];
2742
2743 if(kn<=1)
2744 return;
2745
2746 if(m1>3) D1[1+bs*1] = CC[3+bs*1];
2747 }
2748 else if(m0<=3)
2749 {
2750 if(kn<=0)
2751 return;
2752
2753 if(m1>3) D1[1+bs*0] = CC[3+bs*0];
2754 }
2755 }
2756 else //if(offsetD==3)
2757 {
2758 D1 = D0 + sdd*bs;
2759 if(m0<=0)
2760 {
2761 if(kn<=0)
2762 return;
2763
2764 if(m1>0) D0[3+bs*0] = CC[0+bs*0];
2765 if(m1>1) D1[0+bs*0] = CC[1+bs*0];
2766 if(m1>2) D1[1+bs*0] = CC[2+bs*0];
2767 if(m1>3) D1[2+bs*0] = CC[3+bs*0];
2768
2769 if(kn<=1)
2770 return;
2771
2772 if(m1>1) D1[0+bs*1] = CC[1+bs*1];
2773 if(m1>2) D1[1+bs*1] = CC[2+bs*1];
2774 if(m1>3) D1[2+bs*1] = CC[3+bs*1];
2775
2776 if(kn<=2)
2777 return;
2778
2779 if(m1>2) D1[1+bs*2] = CC[2+bs*2];
2780 if(m1>3) D1[2+bs*2] = CC[3+bs*2];
2781
2782 if(kn<=3)
2783 return;
2784
2785 if(m1>3) D1[2+bs*3] = CC[3+bs*3];
2786 }
2787 else if(m0<=1)
2788 {
2789 if(kn<=0)
2790 return;
2791
2792 if(m1>1) D1[0+bs*0] = CC[1+bs*0];
2793 if(m1>2) D1[1+bs*0] = CC[2+bs*0];
2794 if(m1>3) D1[2+bs*0] = CC[3+bs*0];
2795
2796 if(kn<=1)
2797 return;
2798
2799 if(m1>2) D1[1+bs*1] = CC[2+bs*1];
2800 if(m1>3) D1[2+bs*1] = CC[3+bs*1];
2801
2802 if(kn<=2)
2803 return;
2804
2805 if(m1>3) D1[2+bs*2] = CC[3+bs*2];
2806 }
2807 else if(m0<=2)
2808 {
2809 if(kn<=0)
2810 return;
2811
2812 if(m1>2) D1[1+bs*0] = CC[2+bs*0];
2813 if(m1>3) D1[2+bs*0] = CC[3+bs*0];
2814
2815 if(kn<=1)
2816 return;
2817
2818 if(m1>3) D1[2+bs*1] = CC[3+bs*1];
2819 }
2820 else if(m0<=3)
2821 {
2822 if(kn<=0)
2823 return;
2824
2825 if(m1>3) D1[2+bs*0] = CC[3+bs*0];
2826 }
2827 }
2828
2829 return;
2830
2831 }
2832 #endif
2833
2834
2835
2836 #if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9)
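// dsyrk 'nt' kernel: computes D = beta*C + alpha*A*B^T and stores only the upper-triangular
// part of the 4x4 block.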
2837 void kernel_dsyrk_nt_u_4x4_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D)
2838 {
2839
2840 const int bs = 4;
2841
2842 #if defined(TARGET_GENERIC)
2843 double CC[16] = {0};
2844 #else
2845 ALIGNED( double CC[16], 64 ) = {0};
2846 #endif
2847
2848 kernel_dgemm_nt_4x4_lib4(kmax, alpha, A, B, beta, C, CC);
2849
2850 D[0+bs*0] = CC[0+bs*0];
2851
2852 D[0+bs*1] = CC[0+bs*1];
2853 D[1+bs*1] = CC[1+bs*1];
2854
2855 D[0+bs*2] = CC[0+bs*2];
2856 D[1+bs*2] = CC[1+bs*2];
2857 D[2+bs*2] = CC[2+bs*2];
2858
2859 D[0+bs*3] = CC[0+bs*3];
2860 D[1+bs*3] = CC[1+bs*3];
2861 D[2+bs*3] = CC[2+bs*3];
2862 D[3+bs*3] = CC[3+bs*3];
2863
2864 return;
2865
2866 }
2867 #endif
2868
2869
2870
2871 #if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9)
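// variable-size variant: stores at most km rows and kn columns of the upper-triangular result.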
2872 void kernel_dsyrk_nt_u_4x4_vs_lib4(int kmax, double *alpha, double *A, double *B, double *beta, double *C, double *D, int km, int kn)
2873 {
2874
2875 const int bs = 4;
2876
2877 #if defined(TARGET_GENERIC)
2878 double CC[16] = {0};
2879 #else
2880 ALIGNED( double CC[16], 64 ) = {0};
2881 #endif
2882
2883 kernel_dgemm_nt_4x4_lib4(kmax, alpha, A, B, beta, C, CC);
2884
2885 if(km>=4)
2886 {
2887 D[0+bs*0] = CC[0+bs*0];
2888
2889 if(kn==1)
2890 return;
2891
2892 D[0+bs*1] = CC[0+bs*1];
2893 D[1+bs*1] = CC[1+bs*1];
2894
2895 if(kn==2)
2896 return;
2897
2898 D[0+bs*2] = CC[0+bs*2];
2899 D[1+bs*2] = CC[1+bs*2];
2900 D[2+bs*2] = CC[2+bs*2];
2901
2902 if(kn==3)
2903 return;
2904
2905 D[0+bs*3] = CC[0+bs*3];
2906 D[1+bs*3] = CC[1+bs*3];
2907 D[2+bs*3] = CC[2+bs*3];
2908 D[3+bs*3] = CC[3+bs*3];
2909 }
2910 else if(km>=3)
2911 {
2912 D[0+bs*0] = CC[0+bs*0];
2913
2914 if(kn==1)
2915 return;
2916
2917 D[0+bs*1] = CC[0+bs*1];
2918 D[1+bs*1] = CC[1+bs*1];
2919
2920 if(kn==2)
2921 return;
2922
2923 D[0+bs*2] = CC[0+bs*2];
2924 D[1+bs*2] = CC[1+bs*2];
2925 D[2+bs*2] = CC[2+bs*2];
2926
2927 if(kn==3)
2928 return;
2929
2930 D[0+bs*3] = CC[0+bs*3];
2931 D[1+bs*3] = CC[1+bs*3];
2932 D[2+bs*3] = CC[2+bs*3];
2933 }
2934 else if(km>=2)
2935 {
2936 D[0+bs*0] = CC[0+bs*0];
2937
2938 if(kn==1)
2939 return;
2940
2941 D[0+bs*1] = CC[0+bs*1];
2942 D[1+bs*1] = CC[1+bs*1];
2943
2944 if(kn==2)
2945 return;
2946
2947 D[0+bs*2] = CC[0+bs*2];
2948 D[1+bs*2] = CC[1+bs*2];
2949
2950 if(kn==3)
2951 return;
2952
2953 D[0+bs*3] = CC[0+bs*3];
2954 D[1+bs*3] = CC[1+bs*3];
2955 }
2956 else //if(km>=1)
2957 {
2958 D[0+bs*0] = CC[0+bs*0];
2959
2960 if(kn==1)
2961 return;
2962
2963 D[0+bs*1] = CC[0+bs*1];
2964
2965 if(kn==2)
2966 return;
2967
2968 D[0+bs*2] = CC[0+bs*2];
2969
2970 if(kn==3)
2971 return;
2972
2973 D[0+bs*3] = CC[0+bs*3];
2974 }
2975
2976 return;
2977
2978 }
2979 #endif
2980
2981
2982
2983 #if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
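// generalized dsyrk 'nt' upper kernel: like the lower-triangular _gen_ kernel above, C and D
// may start at an arbitrary row offset inside a 4-row panel; see the TODO below for the parts
// of the offset handling that still need fixing.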
2984 void kernel_dsyrk_nt_u_4x4_gen_lib4(int kmax, double *alpha, double *A, double *B, double *beta, int offsetC, double *C0, int sdc, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
2985 {
2986
2987 const int bs = 4;
2988
2989 #if defined(TARGET_GENERIC)
2990 double CC[16] = {0};
2991 #else
2992 ALIGNED( double CC[16], 64 ) = {0};
2993 #endif
2994
2995 double
2996 *C1, *D1;
2997
2998 if(offsetC==0)
2999 {
3000 CC[0+bs*0] = beta[0]*C0[0+bs*0];
3001
3002 CC[0+bs*1] = beta[0]*C0[0+bs*1];
3003 CC[1+bs*1] = beta[0]*C0[1+bs*1];
3004
3005 CC[0+bs*2] = beta[0]*C0[0+bs*2];
3006 CC[1+bs*2] = beta[0]*C0[1+bs*2];
3007 CC[2+bs*2] = beta[0]*C0[2+bs*2];
3008
3009 CC[0+bs*3] = beta[0]*C0[0+bs*3];
3010 CC[1+bs*3] = beta[0]*C0[1+bs*3];
3011 CC[2+bs*3] = beta[0]*C0[2+bs*3];
3012 CC[3+bs*3] = beta[0]*C0[3+bs*3];
3013 }
3014 else if(offsetC==1)
3015 {
3016 C1 = C0 + sdc*bs;
3017
3018 CC[0+bs*0] = beta[0]*C0[0+bs*0];
3019
3020 CC[0+bs*1] = beta[0]*C0[0+bs*1];
3021 CC[1+bs*1] = beta[0]*C0[1+bs*1];
3022
3023 CC[0+bs*2] = beta[0]*C0[0+bs*2];
3024 CC[1+bs*2] = beta[0]*C0[1+bs*2];
3025 CC[2+bs*2] = beta[0]*C0[2+bs*2];
3026
3027 CC[0+bs*3] = beta[0]*C0[0+bs*3];
3028 CC[1+bs*3] = beta[0]*C0[1+bs*3];
3029 CC[2+bs*3] = beta[0]*C0[2+bs*3];
3030 CC[3+bs*3] = beta[0]*C1[3+bs*3];
3031 }
3032 else if(offsetC==2)
3033 {
3034 C1 = C0 + sdc*bs;
3035
3036 CC[0+bs*0] = beta[0]*C0[0+bs*0];
3037
3038 CC[0+bs*1] = beta[0]*C0[0+bs*1];
3039 CC[1+bs*1] = beta[0]*C0[1+bs*1];
3040
3041 CC[0+bs*2] = beta[0]*C0[0+bs*2];
3042 CC[1+bs*2] = beta[0]*C0[1+bs*2];
3043 CC[2+bs*2] = beta[0]*C1[2+bs*2];
3044
3045 CC[0+bs*3] = beta[0]*C0[0+bs*3];
3046 CC[1+bs*3] = beta[0]*C0[1+bs*3];
3047 CC[2+bs*3] = beta[0]*C1[2+bs*3];
3048 CC[3+bs*3] = beta[0]*C1[3+bs*3];
3049 }
3050 else //if(offsetC==3)
3051 {
3052 C1 = C0 + sdc*bs;
3053
3054 CC[0+bs*0] = beta[0]*C0[0+bs*0];
3055
3056 CC[0+bs*1] = beta[0]*C0[0+bs*1];
3057 CC[1+bs*1] = beta[0]*C1[1+bs*1];
3058
3059 CC[0+bs*2] = beta[0]*C0[0+bs*2];
3060 CC[1+bs*2] = beta[0]*C1[1+bs*2];
3061 CC[2+bs*2] = beta[0]*C1[2+bs*2];
3062
3063 CC[0+bs*3] = beta[0]*C0[0+bs*3];
3064 CC[1+bs*3] = beta[0]*C1[1+bs*3];
3065 CC[2+bs*3] = beta[0]*C1[2+bs*3];
3066 CC[3+bs*3] = beta[0]*C1[3+bs*3];
3067 }
3068
3069 double beta1 = 1.0;
3070
3071 kernel_dgemm_nt_4x4_lib4(kmax, alpha, A, B, &beta1, CC, CC);
3072
3073 // TODO fix below !!!!!!!!!!!! (the offsetD!=0 store paths do not yet apply the row offset to the D0/D1 indices)
3074
3075 if(offsetD==0)
3076 {
3077 if(n0==0)
3078 goto l_00;
3079 else if(n0==1)
3080 goto l_01;
3081 else if(n0==2)
3082 goto l_02;
3083 else if(n0==3)
3084 goto l_03;
3085 else
3086 return;
3087
3088 l_00:
3089 if(n1<=0)
3090 return;
3091
3092 if(m0<=0 & m1>0) D0[0+bs*0] = CC[0+bs*0];
3093
3094 l_01:
3095 if(n1<=1)
3096 return;
3097
3098 if(m0<=0 & m1>0) D0[0+bs*1] = CC[0+bs*1];
3099 if(m0<=1 & m1>1) D0[1+bs*1] = CC[1+bs*1];
3100
3101 l_02:
3102 if(n1<=2)
3103 return;
3104
3105 if(m0<=0 & m1>0) D0[0+bs*2] = CC[0+bs*2];
3106 if(m0<=1 & m1>1) D0[1+bs*2] = CC[1+bs*2];
3107 if(m0<=2 & m1>2) D0[2+bs*2] = CC[2+bs*2];
3108
3109 l_03:
3110 if(n1<=3)
3111 return;
3112
3113 if(m0<=0 & m1>0) D0[0+bs*3] = CC[0+bs*3];
3114 if(m0<=1 & m1>1) D0[1+bs*3] = CC[1+bs*3];
3115 if(m0<=2 & m1>2) D0[2+bs*3] = CC[2+bs*3];
3116 if(m0<=3 & m1>3) D0[3+bs*3] = CC[3+bs*3];
3117 }
3118 else if(offsetD==1)
3119 {
3120 D1 = D0 + sdd*bs;
3121
3122 if(n0==0)
3123 goto l_10;
3124 else if(n0==1)
3125 goto l_11;
3126 else if(n0==2)
3127 goto l_12;
3128 else if(n0==3)
3129 goto l_13;
3130 else
3131 return;
3132
3133 l_10:
3134 if(n1<=0)
3135 return;
3136
3137 if(m0<=0 & m1>0) D0[0+bs*0] = CC[0+bs*0];
3138
3139 l_11:
3140 if(n1<=1)
3141 return;
3142
3143 if(m0<=0 & m1>0) D0[0+bs*1] = CC[0+bs*1];
3144 if(m0<=1 & m1>1) D0[1+bs*1] = CC[1+bs*1];
3145
3146 l_12:
3147 if(n1<=2)
3148 return;
3149
3150 if(m0<=0 & m1>0) D0[0+bs*2] = CC[0+bs*2];
3151 if(m0<=1 & m1>1) D0[1+bs*2] = CC[1+bs*2];
3152 if(m0<=2 & m1>2) D0[2+bs*2] = CC[2+bs*2];
3153
3154 l_13:
3155 if(n1<=3)
3156 return;
3157
3158 if(m0<=0 & m1>0) D0[0+bs*3] = CC[0+bs*3];
3159 if(m0<=1 & m1>1) D0[1+bs*3] = CC[1+bs*3];
3160 if(m0<=2 & m1>2) D0[2+bs*3] = CC[2+bs*3];
3161 if(m0<=3 & m1>3) D1[3+bs*3] = CC[3+bs*3];
3162 }
3163 else if(offsetD==2)
3164 {
3165 D1 = D0 + sdd*bs;
3166
3167 if(n0==0)
3168 goto l_20;
3169 else if(n0==1)
3170 goto l_21;
3171 else if(n0==2)
3172 goto l_22;
3173 else if(n0==3)
3174 goto l_23;
3175 else
3176 return;
3177
3178 l_20:
3179 if(n1<=0)
3180 return;
3181
3182 if(m0<=0 & m1>0) D0[0+bs*0] = CC[0+bs*0];
3183
3184 l_21:
3185 if(n1<=1)
3186 return;
3187
3188 if(m0<=0 & m1>0) D0[0+bs*1] = CC[0+bs*1];
3189 if(m0<=1 & m1>1) D0[1+bs*1] = CC[1+bs*1];
3190
3191 l_22:
3192 if(n1<=2)
3193 return;
3194
3195 if(m0<=0 & m1>0) D0[0+bs*2] = CC[0+bs*2];
3196 if(m0<=1 & m1>1) D0[1+bs*2] = CC[1+bs*2];
3197 if(m0<=2 & m1>2) D1[2+bs*2] = CC[2+bs*2];
3198
3199 l_23:
3200 if(n1<=3)
3201 return;
3202
3203 if(m0<=0 & m1>0) D0[0+bs*3] = CC[0+bs*3];
3204 if(m0<=1 & m1>1) D0[1+bs*3] = CC[1+bs*3];
3205 if(m0<=2 & m1>2) D1[2+bs*3] = CC[2+bs*3];
3206 if(m0<=3 & m1>3) D1[3+bs*3] = CC[3+bs*3];
3207 }
3208 else //if(offsetD==3)
3209 {
3210 D1 = D0 + sdd*bs;
3211
3212 if(n0==0)
3213 goto l_30;
3214 else if(n0==1)
3215 goto l_31;
3216 else if(n0==2)
3217 goto l_32;
3218 else if(n0==3)
3219 goto l_33;
3220 else
3221 return;
3222
3223 l_30:
3224 if(n1<=0)
3225 return;
3226
3227 if(m0<=0 & m1>0) D0[0+bs*0] = CC[0+bs*0];
3228
3229 l_31:
3230 if(n1<=1)
3231 return;
3232
3233 if(m0<=0 & m1>0) D0[0+bs*1] = CC[0+bs*1];
3234 if(m0<=1 & m1>1) D1[1+bs*1] = CC[1+bs*1];
3235
3236 l_32:
3237 if(n1<=2)
3238 return;
3239
3240 if(m0<=0 & m1>0) D0[0+bs*2] = CC[0+bs*2];
3241 if(m0<=1 & m1>1) D1[1+bs*2] = CC[1+bs*2];
3242 if(m0<=2 & m1>2) D1[2+bs*2] = CC[2+bs*2];
3243
3244 l_33:
3245 if(n1<=3)
3246 return;
3247
3248 if(m0<=0 & m1>0) D0[0+bs*3] = CC[0+bs*3];
3249 if(m0<=1 & m1>1) D1[1+bs*3] = CC[1+bs*3];
3250 if(m0<=2 & m1>2) D1[2+bs*3] = CC[2+bs*3];
3251 if(m0<=3 & m1>3) D1[3+bs*3] = CC[3+bs*3];
3252 }
3253
3254 return;
3255
3256 }
3257 #endif
3258
3259
3260
3261 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
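// dtrmm 'nt' right-upper kernel: computes D = alpha * A * B^T with B upper triangular; the
// first three (triangular) columns are accumulated explicitly, and the remaining rectangular
// part is delegated to kernel_dgemm_nt_4x4_lib4 with beta=alpha so the accumulator is scaled too.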
3262 void kernel_dtrmm_nt_ru_4x4_lib4(int kmax, double *alpha, double *A, double *B, double *D)
3263 {
3264
3265 const int bs = 4;
3266
3267 double
3268 a_0, a_1, a_2, a_3,
3269 b_0, b_1, b_2, b_3;
3270
3271 #if defined(TARGET_GENERIC)
3272 double CC[16] = {0};
3273 #else
3274 ALIGNED( double CC[16], 64 ) = {0};
3275 #endif
3276
3277 int k;
3278
3279 k = 0;
3280
3281 // k = 0
3282 if(kmax>0)
3283 {
3284 a_0 = A[0];
3285 a_1 = A[1];
3286 a_2 = A[2];
3287 a_3 = A[3];
3288
3289 b_0 = B[0];
3290
3291 CC[0+bs*0] += a_0 * b_0;
3292 CC[1+bs*0] += a_1 * b_0;
3293 CC[2+bs*0] += a_2 * b_0;
3294 CC[3+bs*0] += a_3 * b_0;
3295
3296 A += 4;
3297 B += 4;
3298 k++;
3299 }
3300
3301 // k = 1
3302 if(kmax>1)
3303 {
3304 a_0 = A[0];
3305 a_1 = A[1];
3306 a_2 = A[2];
3307 a_3 = A[3];
3308
3309 b_0 = B[0];
3310 b_1 = B[1];
3311
3312 CC[0+bs*0] += a_0 * b_0;
3313 CC[1+bs*0] += a_1 * b_0;
3314 CC[2+bs*0] += a_2 * b_0;
3315 CC[3+bs*0] += a_3 * b_0;
3316
3317 CC[0+bs*1] += a_0 * b_1;
3318 CC[1+bs*1] += a_1 * b_1;
3319 CC[2+bs*1] += a_2 * b_1;
3320 CC[3+bs*1] += a_3 * b_1;
3321
3322 A += 4;
3323 B += 4;
3324 k++;
3325 }
3326
3327 // k = 2
3328 if(kmax>2)
3329 {
3330 a_0 = A[0];
3331 a_1 = A[1];
3332 a_2 = A[2];
3333 a_3 = A[3];
3334
3335 b_0 = B[0];
3336 b_1 = B[1];
3337 b_2 = B[2];
3338
3339 CC[0+bs*0] += a_0 * b_0;
3340 CC[1+bs*0] += a_1 * b_0;
3341 CC[2+bs*0] += a_2 * b_0;
3342 CC[3+bs*0] += a_3 * b_0;
3343
3344 CC[0+bs*1] += a_0 * b_1;
3345 CC[1+bs*1] += a_1 * b_1;
3346 CC[2+bs*1] += a_2 * b_1;
3347 CC[3+bs*1] += a_3 * b_1;
3348
3349 CC[0+bs*2] += a_0 * b_2;
3350 CC[1+bs*2] += a_1 * b_2;
3351 CC[2+bs*2] += a_2 * b_2;
3352 CC[3+bs*2] += a_3 * b_2;
3353
3354 A += 4;
3355 B += 4;
3356 k++;
3357 }
3358
3359 kernel_dgemm_nt_4x4_lib4(kmax-k, alpha, A, B, alpha, CC, D);
3360
3361 return;
3362
3363 }
3364 #endif
3365
3366
3367
3368 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
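// variable-size variant of the kernel above: stores at most km rows and kn columns of D.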
3369 void kernel_dtrmm_nt_ru_4x4_vs_lib4(int kmax, double *alpha, double *A, double *B, double *D, int km, int kn)
3370 {
3371
3372 const int bs = 4;
3373
3374 double
3375 a_0, a_1, a_2, a_3,
3376 b_0, b_1, b_2, b_3;
3377
3378 #if defined(TARGET_GENERIC)
3379 double CC[16] = {0};
3380 #else
3381 ALIGNED( double CC[16], 64 ) = {0};
3382 #endif
3383
3384 int k;
3385
3386 k = 0;
3387
3388 // k = 0
3389 if(kmax>0)
3390 {
3391 a_0 = A[0];
3392 a_1 = A[1];
3393 a_2 = A[2];
3394 a_3 = A[3];
3395
3396 b_0 = B[0];
3397
3398 CC[0+bs*0] += a_0 * b_0;
3399 CC[1+bs*0] += a_1 * b_0;
3400 CC[2+bs*0] += a_2 * b_0;
3401 CC[3+bs*0] += a_3 * b_0;
3402
3403 A += 4;
3404 B += 4;
3405 k++;
3406 }
3407
3408 // k = 1
3409 if(kmax>1)
3410 {
3411 a_0 = A[0];
3412 a_1 = A[1];
3413 a_2 = A[2];
3414 a_3 = A[3];
3415
3416 b_0 = B[0];
3417 b_1 = B[1];
3418
3419 CC[0+bs*0] += a_0 * b_0;
3420 CC[1+bs*0] += a_1 * b_0;
3421 CC[2+bs*0] += a_2 * b_0;
3422 CC[3+bs*0] += a_3 * b_0;
3423
3424 CC[0+bs*1] += a_0 * b_1;
3425 CC[1+bs*1] += a_1 * b_1;
3426 CC[2+bs*1] += a_2 * b_1;
3427 CC[3+bs*1] += a_3 * b_1;
3428
3429 A += 4;
3430 B += 4;
3431 k++;
3432 }
3433
3434 // k = 2
3435 if(kmax>2)
3436 {
3437 a_0 = A[0];
3438 a_1 = A[1];
3439 a_2 = A[2];
3440 a_3 = A[3];
3441
3442 b_0 = B[0];
3443 b_1 = B[1];
3444 b_2 = B[2];
3445
3446 CC[0+bs*0] += a_0 * b_0;
3447 CC[1+bs*0] += a_1 * b_0;
3448 CC[2+bs*0] += a_2 * b_0;
3449 CC[3+bs*0] += a_3 * b_0;
3450
3451 CC[0+bs*1] += a_0 * b_1;
3452 CC[1+bs*1] += a_1 * b_1;
3453 CC[2+bs*1] += a_2 * b_1;
3454 CC[3+bs*1] += a_3 * b_1;
3455
3456 CC[0+bs*2] += a_0 * b_2;
3457 CC[1+bs*2] += a_1 * b_2;
3458 CC[2+bs*2] += a_2 * b_2;
3459 CC[3+bs*2] += a_3 * b_2;
3460
3461 A += 4;
3462 B += 4;
3463 k++;
3464 }
3465
3466 kernel_dgemm_nt_4x4_lib4(kmax-k, alpha, A, B, alpha, CC, CC);
3467
3468 if(km>=4)
3469 {
3470 D[0+bs*0] = CC[0+bs*0];
3471 D[1+bs*0] = CC[1+bs*0];
3472 D[2+bs*0] = CC[2+bs*0];
3473 D[3+bs*0] = CC[3+bs*0];
3474
3475 if(kn==1)
3476 return;
3477
3478 D[0+bs*1] = CC[0+bs*1];
3479 D[1+bs*1] = CC[1+bs*1];
3480 D[2+bs*1] = CC[2+bs*1];
3481 D[3+bs*1] = CC[3+bs*1];
3482
3483 if(kn==2)
3484 return;
3485
3486 D[0+bs*2] = CC[0+bs*2];
3487 D[1+bs*2] = CC[1+bs*2];
3488 D[2+bs*2] = CC[2+bs*2];
3489 D[3+bs*2] = CC[3+bs*2];
3490
3491 if(kn==3)
3492 return;
3493
3494 D[0+bs*3] = CC[0+bs*3];
3495 D[1+bs*3] = CC[1+bs*3];
3496 D[2+bs*3] = CC[2+bs*3];
3497 D[3+bs*3] = CC[3+bs*3];
3498 }
3499 else if(km>=3)
3500 {
3501 D[0+bs*0] = CC[0+bs*0];
3502 D[1+bs*0] = CC[1+bs*0];
3503 D[2+bs*0] = CC[2+bs*0];
3504
3505 if(kn==1)
3506 return;
3507
3508 D[0+bs*1] = CC[0+bs*1];
3509 D[1+bs*1] = CC[1+bs*1];
3510 D[2+bs*1] = CC[2+bs*1];
3511
3512 if(kn==2)
3513 return;
3514
3515 D[0+bs*2] = CC[0+bs*2];
3516 D[1+bs*2] = CC[1+bs*2];
3517 D[2+bs*2] = CC[2+bs*2];
3518
3519 if(kn==3)
3520 return;
3521
3522 D[0+bs*3] = CC[0+bs*3];
3523 D[1+bs*3] = CC[1+bs*3];
3524 D[2+bs*3] = CC[2+bs*3];
3525 }
3526 else if(km>=2)
3527 {
3528 D[0+bs*0] = CC[0+bs*0];
3529 D[1+bs*0] = CC[1+bs*0];
3530
3531 if(kn==1)
3532 return;
3533
3534 D[0+bs*1] = CC[0+bs*1];
3535 D[1+bs*1] = CC[1+bs*1];
3536
3537 if(kn==2)
3538 return;
3539
3540 D[0+bs*2] = CC[0+bs*2];
3541 D[1+bs*2] = CC[1+bs*2];
3542
3543 if(kn==3)
3544 return;
3545
3546 D[0+bs*3] = CC[0+bs*3];
3547 D[1+bs*3] = CC[1+bs*3];
3548 }
3549 else //if(km>=1)
3550 {
3551 D[0+bs*0] = CC[0+bs*0];
3552
3553 if(kn==1)
3554 return;
3555
3556 D[0+bs*1] = CC[0+bs*1];
3557
3558 if(kn==2)
3559 return;
3560
3561 D[0+bs*2] = CC[0+bs*2];
3562
3563 if(kn==3)
3564 return;
3565
3566 D[0+bs*3] = CC[0+bs*3];
3567 }
3568
3569 return;
3570
3571 }
3572 #endif
3573
3574
3575
3576
3577 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
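// dtrmm 'nn' right-lower kernel: computes D = alpha * A * B with B lower triangular; B is read
// starting at row offsetB inside its 4-row panel (stride sdb), so the triangular head is peeled
// one k-iteration at a time until a panel boundary is crossed, and the rectangular tail is then
// handled by kernel_dgemm_nn_4x4_lib4.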
3578 void kernel_dtrmm_nn_rl_4x4_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, double *D)
3579 {
3580
3581 const int bs = 4;
3582
3583 double
3584 a_0, a_1, a_2, a_3,
3585 b_0, b_1, b_2, b_3;
3586
3587 #if defined(TARGET_GENERIC)
3588 double CC[16] = {0};
3589 #else
3590 ALIGNED( double CC[16], 64 ) = {0};
3591 #endif
3592
3593 double *D1;
3594
3595 int k;
3596
3597 B += offsetB;
3598
3599 k = 0;
3600
3601 if(offsetB==0)
3602 {
3603
3604 // k = 0
3605
3606 a_0 = A[0];
3607 a_1 = A[1];
3608 a_2 = A[2];
3609 a_3 = A[3];
3610
3611 b_0 = B[0];
3612 CC[0+bs*0] += a_0 * b_0;
3613 CC[1+bs*0] += a_1 * b_0;
3614 CC[2+bs*0] += a_2 * b_0;
3615 CC[3+bs*0] += a_3 * b_0;
3616
3617 A += 4;
3618 B += 1;
3619 k += 1;
3620
3621 if(k>=kmax)
3622 goto store;
3623
3624 // k = 1
3625
3626 a_0 = A[0];
3627 a_1 = A[1];
3628 a_2 = A[2];
3629 a_3 = A[3];
3630
3631 b_0 = B[0];
3632 CC[0+bs*0] += a_0 * b_0;
3633 CC[1+bs*0] += a_1 * b_0;
3634 CC[2+bs*0] += a_2 * b_0;
3635 CC[3+bs*0] += a_3 * b_0;
3636
3637 b_1 = B[4];
3638 CC[0+bs*1] += a_0 * b_1;
3639 CC[1+bs*1] += a_1 * b_1;
3640 CC[2+bs*1] += a_2 * b_1;
3641 CC[3+bs*1] += a_3 * b_1;
3642
3643 A += 4;
3644 B += 1;
3645 k += 1;
3646
3647 if(k>=kmax)
3648 goto store;
3649
3650 // k = 2
3651
3652 a_0 = A[0];
3653 a_1 = A[1];
3654 a_2 = A[2];
3655 a_3 = A[3];
3656
3657 b_0 = B[0];
3658 CC[0+bs*0] += a_0 * b_0;
3659 CC[1+bs*0] += a_1 * b_0;
3660 CC[2+bs*0] += a_2 * b_0;
3661 CC[3+bs*0] += a_3 * b_0;
3662
3663 b_1 = B[4];
3664 CC[0+bs*1] += a_0 * b_1;
3665 CC[1+bs*1] += a_1 * b_1;
3666 CC[2+bs*1] += a_2 * b_1;
3667 CC[3+bs*1] += a_3 * b_1;
3668
3669 b_2 = B[8];
3670 CC[0+bs*2] += a_0 * b_2;
3671 CC[1+bs*2] += a_1 * b_2;
3672 CC[2+bs*2] += a_2 * b_2;
3673 CC[3+bs*2] += a_3 * b_2;
3674
3675 A += 4;
3676 B += 1;
3677 k += 1;
3678
3679 if(k>=kmax)
3680 goto store;
3681
3682 // k = 3
3683
3684 a_0 = A[0];
3685 a_1 = A[1];
3686 a_2 = A[2];
3687 a_3 = A[3];
3688
3689 b_0 = B[0];
3690 CC[0+bs*0] += a_0 * b_0;
3691 CC[1+bs*0] += a_1 * b_0;
3692 CC[2+bs*0] += a_2 * b_0;
3693 CC[3+bs*0] += a_3 * b_0;
3694
3695 b_1 = B[4];
3696 CC[0+bs*1] += a_0 * b_1;
3697 CC[1+bs*1] += a_1 * b_1;
3698 CC[2+bs*1] += a_2 * b_1;
3699 CC[3+bs*1] += a_3 * b_1;
3700
3701 b_2 = B[8];
3702 CC[0+bs*2] += a_0 * b_2;
3703 CC[1+bs*2] += a_1 * b_2;
3704 CC[2+bs*2] += a_2 * b_2;
3705 CC[3+bs*2] += a_3 * b_2;
3706
3707 b_3 = B[12];
3708 CC[0+bs*3] += a_0 * b_3;
3709 CC[1+bs*3] += a_1 * b_3;
3710 CC[2+bs*3] += a_2 * b_3;
3711 CC[3+bs*3] += a_3 * b_3;
3712
3713 A += 4;
3714 B += 4*sdb-3;
3715 k += 1;
3716
3717 }
3718 else if(offsetB==1)
3719 {
3720
3721 // k = 0
3722
3723 a_0 = A[0];
3724 a_1 = A[1];
3725 a_2 = A[2];
3726 a_3 = A[3];
3727
3728 b_0 = B[0];
3729 CC[0+bs*0] += a_0 * b_0;
3730 CC[1+bs*0] += a_1 * b_0;
3731 CC[2+bs*0] += a_2 * b_0;
3732 CC[3+bs*0] += a_3 * b_0;
3733
3734 A += 4;
3735 B += 1;
3736 k += 1;
3737
3738 if(k>=kmax)
3739 goto store;
3740
3741 // k = 1
3742
3743 a_0 = A[0];
3744 a_1 = A[1];
3745 a_2 = A[2];
3746 a_3 = A[3];
3747
3748 b_0 = B[0];
3749 CC[0+bs*0] += a_0 * b_0;
3750 CC[1+bs*0] += a_1 * b_0;
3751 CC[2+bs*0] += a_2 * b_0;
3752 CC[3+bs*0] += a_3 * b_0;
3753
3754 b_1 = B[4];
3755 CC[0+bs*1] += a_0 * b_1;
3756 CC[1+bs*1] += a_1 * b_1;
3757 CC[2+bs*1] += a_2 * b_1;
3758 CC[3+bs*1] += a_3 * b_1;
3759
3760 A += 4;
3761 B += 1;
3762 k += 1;
3763
3764 if(k>=kmax)
3765 goto store;
3766
3767 // k = 2
3768
3769 a_0 = A[0];
3770 a_1 = A[1];
3771 a_2 = A[2];
3772 a_3 = A[3];
3773
3774 b_0 = B[0];
3775 CC[0+bs*0] += a_0 * b_0;
3776 CC[1+bs*0] += a_1 * b_0;
3777 CC[2+bs*0] += a_2 * b_0;
3778 CC[3+bs*0] += a_3 * b_0;
3779
3780 b_1 = B[4];
3781 CC[0+bs*1] += a_0 * b_1;
3782 CC[1+bs*1] += a_1 * b_1;
3783 CC[2+bs*1] += a_2 * b_1;
3784 CC[3+bs*1] += a_3 * b_1;
3785
3786 b_2 = B[8];
3787 CC[0+bs*2] += a_0 * b_2;
3788 CC[1+bs*2] += a_1 * b_2;
3789 CC[2+bs*2] += a_2 * b_2;
3790 CC[3+bs*2] += a_3 * b_2;
3791
3792 A += 4;
3793 B += 4*sdb-3;
3794 k += 1;
3795
3796 }
3797 else if(offsetB==2)
3798 {
3799
3800 // k = 0
3801
3802 a_0 = A[0];
3803 a_1 = A[1];
3804 a_2 = A[2];
3805 a_3 = A[3];
3806
3807 b_0 = B[0];
3808 CC[0+bs*0] += a_0 * b_0;
3809 CC[1+bs*0] += a_1 * b_0;
3810 CC[2+bs*0] += a_2 * b_0;
3811 CC[3+bs*0] += a_3 * b_0;
3812
3813 A += 4;
3814 B += 1;
3815 k += 1;
3816
3817 if(k>=kmax)
3818 goto store;
3819
3820 // k = 1
3821
3822 a_0 = A[0];
3823 a_1 = A[1];
3824 a_2 = A[2];
3825 a_3 = A[3];
3826
3827 b_0 = B[0];
3828 CC[0+bs*0] += a_0 * b_0;
3829 CC[1+bs*0] += a_1 * b_0;
3830 CC[2+bs*0] += a_2 * b_0;
3831 CC[3+bs*0] += a_3 * b_0;
3832
3833 b_1 = B[4];
3834 CC[0+bs*1] += a_0 * b_1;
3835 CC[1+bs*1] += a_1 * b_1;
3836 CC[2+bs*1] += a_2 * b_1;
3837 CC[3+bs*1] += a_3 * b_1;
3838
3839 A += 4;
3840 B += 4*sdb-3;
3841 k += 1;
3842
3843 if(k>=kmax)
3844 goto store;
3845
3846 // k = 2
3847
3848 a_0 = A[0];
3849 a_1 = A[1];
3850 a_2 = A[2];
3851 a_3 = A[3];
3852
3853 b_0 = B[0];
3854 CC[0+bs*0] += a_0 * b_0;
3855 CC[1+bs*0] += a_1 * b_0;
3856 CC[2+bs*0] += a_2 * b_0;
3857 CC[3+bs*0] += a_3 * b_0;
3858
3859 b_1 = B[4];
3860 CC[0+bs*1] += a_0 * b_1;
3861 CC[1+bs*1] += a_1 * b_1;
3862 CC[2+bs*1] += a_2 * b_1;
3863 CC[3+bs*1] += a_3 * b_1;
3864
3865 b_2 = B[8];
3866 CC[0+bs*2] += a_0 * b_2;
3867 CC[1+bs*2] += a_1 * b_2;
3868 CC[2+bs*2] += a_2 * b_2;
3869 CC[3+bs*2] += a_3 * b_2;
3870
3871 A += 4;
3872 B += 1;
3873 k += 1;
3874
3875 if(k>=kmax)
3876 goto store;
3877
3878 // k = 3
3879
3880 a_0 = A[0];
3881 a_1 = A[1];
3882 a_2 = A[2];
3883 a_3 = A[3];
3884
3885 b_0 = B[0];
3886 CC[0+bs*0] += a_0 * b_0;
3887 CC[1+bs*0] += a_1 * b_0;
3888 CC[2+bs*0] += a_2 * b_0;
3889 CC[3+bs*0] += a_3 * b_0;
3890
3891 b_1 = B[4];
3892 CC[0+bs*1] += a_0 * b_1;
3893 CC[1+bs*1] += a_1 * b_1;
3894 CC[2+bs*1] += a_2 * b_1;
3895 CC[3+bs*1] += a_3 * b_1;
3896
3897 b_2 = B[8];
3898 CC[0+bs*2] += a_0 * b_2;
3899 CC[1+bs*2] += a_1 * b_2;
3900 CC[2+bs*2] += a_2 * b_2;
3901 CC[3+bs*2] += a_3 * b_2;
3902
3903 b_3 = B[12];
3904 CC[0+bs*3] += a_0 * b_3;
3905 CC[1+bs*3] += a_1 * b_3;
3906 CC[2+bs*3] += a_2 * b_3;
3907 CC[3+bs*3] += a_3 * b_3;
3908
3909 A += 4;
3910 B += 1;
3911 k += 1;
3912
3913 if(k>=kmax)
3914 goto store;
3915
3916 // k = 4
3917
3918 a_0 = A[0];
3919 a_1 = A[1];
3920 a_2 = A[2];
3921 a_3 = A[3];
3922
3923 b_0 = B[0];
3924 CC[0+bs*0] += a_0 * b_0;
3925 CC[1+bs*0] += a_1 * b_0;
3926 CC[2+bs*0] += a_2 * b_0;
3927 CC[3+bs*0] += a_3 * b_0;
3928
3929 b_1 = B[4];
3930 CC[0+bs*1] += a_0 * b_1;
3931 CC[1+bs*1] += a_1 * b_1;
3932 CC[2+bs*1] += a_2 * b_1;
3933 CC[3+bs*1] += a_3 * b_1;
3934
3935 b_2 = B[8];
3936 CC[0+bs*2] += a_0 * b_2;
3937 CC[1+bs*2] += a_1 * b_2;
3938 CC[2+bs*2] += a_2 * b_2;
3939 CC[3+bs*2] += a_3 * b_2;
3940
3941 b_3 = B[12];
3942 CC[0+bs*3] += a_0 * b_3;
3943 CC[1+bs*3] += a_1 * b_3;
3944 CC[2+bs*3] += a_2 * b_3;
3945 CC[3+bs*3] += a_3 * b_3;
3946
3947 A += 4;
3948 B += 1;
3949 k += 1;
3950
3951 if(k>=kmax)
3952 goto store;
3953
3954 // k = 5
3955
3956 a_0 = A[0];
3957 a_1 = A[1];
3958 a_2 = A[2];
3959 a_3 = A[3];
3960
3961 b_0 = B[0];
3962 CC[0+bs*0] += a_0 * b_0;
3963 CC[1+bs*0] += a_1 * b_0;
3964 CC[2+bs*0] += a_2 * b_0;
3965 CC[3+bs*0] += a_3 * b_0;
3966
3967 b_1 = B[4];
3968 CC[0+bs*1] += a_0 * b_1;
3969 CC[1+bs*1] += a_1 * b_1;
3970 CC[2+bs*1] += a_2 * b_1;
3971 CC[3+bs*1] += a_3 * b_1;
3972
3973 b_2 = B[8];
3974 CC[0+bs*2] += a_0 * b_2;
3975 CC[1+bs*2] += a_1 * b_2;
3976 CC[2+bs*2] += a_2 * b_2;
3977 CC[3+bs*2] += a_3 * b_2;
3978
3979 b_3 = B[12];
3980 CC[0+bs*3] += a_0 * b_3;
3981 CC[1+bs*3] += a_1 * b_3;
3982 CC[2+bs*3] += a_2 * b_3;
3983 CC[3+bs*3] += a_3 * b_3;
3984
3985 A += 4;
3986 B += 4*sdb-3;
3987 k += 1;
3988
3989 }
3990 else // if(offsetB==3)
3991 {
3992
3993 // k = 0
3994
3995 a_0 = A[0];
3996 a_1 = A[1];
3997 a_2 = A[2];
3998 a_3 = A[3];
3999
4000 b_0 = B[0];
4001 CC[0+bs*0] += a_0 * b_0;
4002 CC[1+bs*0] += a_1 * b_0;
4003 CC[2+bs*0] += a_2 * b_0;
4004 CC[3+bs*0] += a_3 * b_0;
4005
4006 A += 4;
4007 B += 4*sdb-3;
4008 k += 1;
4009
4010 if(k>=kmax)
4011 goto store;
4012
4013 // k = 1
4014
4015 a_0 = A[0];
4016 a_1 = A[1];
4017 a_2 = A[2];
4018 a_3 = A[3];
4019
4020 b_0 = B[0];
4021 CC[0+bs*0] += a_0 * b_0;
4022 CC[1+bs*0] += a_1 * b_0;
4023 CC[2+bs*0] += a_2 * b_0;
4024 CC[3+bs*0] += a_3 * b_0;
4025
4026 b_1 = B[4];
4027 CC[0+bs*1] += a_0 * b_1;
4028 CC[1+bs*1] += a_1 * b_1;
4029 CC[2+bs*1] += a_2 * b_1;
4030 CC[3+bs*1] += a_3 * b_1;
4031
4032 A += 4;
4033 B += 1;
4034 k += 1;
4035
4036 if(k>=kmax)
4037 goto store;
4038
4039 // k = 2
4040
4041 a_0 = A[0];
4042 a_1 = A[1];
4043 a_2 = A[2];
4044 a_3 = A[3];
4045
4046 b_0 = B[0];
4047 CC[0+bs*0] += a_0 * b_0;
4048 CC[1+bs*0] += a_1 * b_0;
4049 CC[2+bs*0] += a_2 * b_0;
4050 CC[3+bs*0] += a_3 * b_0;
4051
4052 b_1 = B[4];
4053 CC[0+bs*1] += a_0 * b_1;
4054 CC[1+bs*1] += a_1 * b_1;
4055 CC[2+bs*1] += a_2 * b_1;
4056 CC[3+bs*1] += a_3 * b_1;
4057
4058 b_2 = B[8];
4059 CC[0+bs*2] += a_0 * b_2;
4060 CC[1+bs*2] += a_1 * b_2;
4061 CC[2+bs*2] += a_2 * b_2;
4062 CC[3+bs*2] += a_3 * b_2;
4063
4064 A += 4;
4065 B += 1;
4066 k += 1;
4067
4068 if(k>=kmax)
4069 goto store;
4070
4071 // k = 3
4072
4073 a_0 = A[0];
4074 a_1 = A[1];
4075 a_2 = A[2];
4076 a_3 = A[3];
4077
4078 b_0 = B[0];
4079 CC[0+bs*0] += a_0 * b_0;
4080 CC[1+bs*0] += a_1 * b_0;
4081 CC[2+bs*0] += a_2 * b_0;
4082 CC[3+bs*0] += a_3 * b_0;
4083
4084 b_1 = B[4];
4085 CC[0+bs*1] += a_0 * b_1;
4086 CC[1+bs*1] += a_1 * b_1;
4087 CC[2+bs*1] += a_2 * b_1;
4088 CC[3+bs*1] += a_3 * b_1;
4089
4090 b_2 = B[8];
4091 CC[0+bs*2] += a_0 * b_2;
4092 CC[1+bs*2] += a_1 * b_2;
4093 CC[2+bs*2] += a_2 * b_2;
4094 CC[3+bs*2] += a_3 * b_2;
4095
4096 b_3 = B[12];
4097 CC[0+bs*3] += a_0 * b_3;
4098 CC[1+bs*3] += a_1 * b_3;
4099 CC[2+bs*3] += a_2 * b_3;
4100 CC[3+bs*3] += a_3 * b_3;
4101
4102 A += 4;
4103 B += 1;
4104 k += 1;
4105
4106 if(k>=kmax)
4107 goto store;
4108
4109 // k = 4
4110
4111 a_0 = A[0];
4112 a_1 = A[1];
4113 a_2 = A[2];
4114 a_3 = A[3];
4115
4116 b_0 = B[0];
4117 CC[0+bs*0] += a_0 * b_0;
4118 CC[1+bs*0] += a_1 * b_0;
4119 CC[2+bs*0] += a_2 * b_0;
4120 CC[3+bs*0] += a_3 * b_0;
4121
4122 b_1 = B[4];
4123 CC[0+bs*1] += a_0 * b_1;
4124 CC[1+bs*1] += a_1 * b_1;
4125 CC[2+bs*1] += a_2 * b_1;
4126 CC[3+bs*1] += a_3 * b_1;
4127
4128 b_2 = B[8];
4129 CC[0+bs*2] += a_0 * b_2;
4130 CC[1+bs*2] += a_1 * b_2;
4131 CC[2+bs*2] += a_2 * b_2;
4132 CC[3+bs*2] += a_3 * b_2;
4133
4134 b_3 = B[12];
4135 CC[0+bs*3] += a_0 * b_3;
4136 CC[1+bs*3] += a_1 * b_3;
4137 CC[2+bs*3] += a_2 * b_3;
4138 CC[3+bs*3] += a_3 * b_3;
4139
4140 A += 4;
4141 B += 4*sdb-3;
4142 k += 1;
4143
4144 }
4145
4146 store:
4147
4148 CC[0+bs*0] = alpha[0]*CC[0+bs*0];
4149 CC[1+bs*0] = alpha[0]*CC[1+bs*0];
4150 CC[2+bs*0] = alpha[0]*CC[2+bs*0];
4151 CC[3+bs*0] = alpha[0]*CC[3+bs*0];
4152
4153 CC[0+bs*1] = alpha[0]*CC[0+bs*1];
4154 CC[1+bs*1] = alpha[0]*CC[1+bs*1];
4155 CC[2+bs*1] = alpha[0]*CC[2+bs*1];
4156 CC[3+bs*1] = alpha[0]*CC[3+bs*1];
4157
4158 CC[0+bs*2] = alpha[0]*CC[0+bs*2];
4159 CC[1+bs*2] = alpha[0]*CC[1+bs*2];
4160 CC[2+bs*2] = alpha[0]*CC[2+bs*2];
4161 CC[3+bs*2] = alpha[0]*CC[3+bs*2];
4162
4163 CC[0+bs*3] = alpha[0]*CC[0+bs*3];
4164 CC[1+bs*3] = alpha[0]*CC[1+bs*3];
4165 CC[2+bs*3] = alpha[0]*CC[2+bs*3];
4166 CC[3+bs*3] = alpha[0]*CC[3+bs*3];
4167
4168 double beta1 = 1.0;
4169
4170 kernel_dgemm_nn_4x4_lib4(kmax-k, alpha, A, 0, B, sdb, &beta1, CC, D);
4171
4172 return;
4173
4174 }
4175 #endif
4176
4177
4178
4179 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
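// variable-size variant: same computation as the kernel above, but at most m1 rows and
// n1 columns of D are written back.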
4180 void kernel_dtrmm_nn_rl_4x4_vs_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, double *D, int m1, int n1)
4181 {
4182
4183 const int bs = 4;
4184
4185 double
4186 a_0, a_1, a_2, a_3,
4187 b_0, b_1, b_2, b_3;
4188
4189 #if defined(TARGET_GENERIC)
4190 double CC[16] = {0};
4191 #else
4192 ALIGNED( double CC[16], 64 ) = {0};
4193 #endif
4194
4195 double *D1;
4196
4197 int k;
4198
4199 B += offsetB;
4200
4201 k = 0;
4202
4203 if(offsetB==0)
4204 {
4205
4206 // k = 0
4207
4208 a_0 = A[0];
4209 a_1 = A[1];
4210 a_2 = A[2];
4211 a_3 = A[3];
4212
4213 b_0 = B[0];
4214 CC[0+bs*0] += a_0 * b_0;
4215 CC[1+bs*0] += a_1 * b_0;
4216 CC[2+bs*0] += a_2 * b_0;
4217 CC[3+bs*0] += a_3 * b_0;
4218
4219 A += 4;
4220 B += 1;
4221 k += 1;
4222
4223 if(k>=kmax)
4224 goto store;
4225
4226 // k = 1
4227
4228 a_0 = A[0];
4229 a_1 = A[1];
4230 a_2 = A[2];
4231 a_3 = A[3];
4232
4233 b_0 = B[0];
4234 CC[0+bs*0] += a_0 * b_0;
4235 CC[1+bs*0] += a_1 * b_0;
4236 CC[2+bs*0] += a_2 * b_0;
4237 CC[3+bs*0] += a_3 * b_0;
4238
4239 b_1 = B[4];
4240 CC[0+bs*1] += a_0 * b_1;
4241 CC[1+bs*1] += a_1 * b_1;
4242 CC[2+bs*1] += a_2 * b_1;
4243 CC[3+bs*1] += a_3 * b_1;
4244
4245 A += 4;
4246 B += 1;
4247 k += 1;
4248
4249 if(k>=kmax)
4250 goto store;
4251
4252 // k = 2
4253
4254 a_0 = A[0];
4255 a_1 = A[1];
4256 a_2 = A[2];
4257 a_3 = A[3];
4258
4259 b_0 = B[0];
4260 CC[0+bs*0] += a_0 * b_0;
4261 CC[1+bs*0] += a_1 * b_0;
4262 CC[2+bs*0] += a_2 * b_0;
4263 CC[3+bs*0] += a_3 * b_0;
4264
4265 b_1 = B[4];
4266 CC[0+bs*1] += a_0 * b_1;
4267 CC[1+bs*1] += a_1 * b_1;
4268 CC[2+bs*1] += a_2 * b_1;
4269 CC[3+bs*1] += a_3 * b_1;
4270
4271 b_2 = B[8];
4272 CC[0+bs*2] += a_0 * b_2;
4273 CC[1+bs*2] += a_1 * b_2;
4274 CC[2+bs*2] += a_2 * b_2;
4275 CC[3+bs*2] += a_3 * b_2;
4276
4277 A += 4;
4278 B += 1;
4279 k += 1;
4280
4281 if(k>=kmax)
4282 goto store;
4283
4284 // k = 3
4285
4286 a_0 = A[0];
4287 a_1 = A[1];
4288 a_2 = A[2];
4289 a_3 = A[3];
4290
4291 b_0 = B[0];
4292 CC[0+bs*0] += a_0 * b_0;
4293 CC[1+bs*0] += a_1 * b_0;
4294 CC[2+bs*0] += a_2 * b_0;
4295 CC[3+bs*0] += a_3 * b_0;
4296
4297 b_1 = B[4];
4298 CC[0+bs*1] += a_0 * b_1;
4299 CC[1+bs*1] += a_1 * b_1;
4300 CC[2+bs*1] += a_2 * b_1;
4301 CC[3+bs*1] += a_3 * b_1;
4302
4303 b_2 = B[8];
4304 CC[0+bs*2] += a_0 * b_2;
4305 CC[1+bs*2] += a_1 * b_2;
4306 CC[2+bs*2] += a_2 * b_2;
4307 CC[3+bs*2] += a_3 * b_2;
4308
4309 b_3 = B[12];
4310 CC[0+bs*3] += a_0 * b_3;
4311 CC[1+bs*3] += a_1 * b_3;
4312 CC[2+bs*3] += a_2 * b_3;
4313 CC[3+bs*3] += a_3 * b_3;
4314
4315 A += 4;
4316 B += 4*sdb-3;
4317 k += 1;
4318
4319 }
4320 else if(offsetB==1)
4321 {
4322
4323 // k = 0
4324
4325 a_0 = A[0];
4326 a_1 = A[1];
4327 a_2 = A[2];
4328 a_3 = A[3];
4329
4330 b_0 = B[0];
4331 CC[0+bs*0] += a_0 * b_0;
4332 CC[1+bs*0] += a_1 * b_0;
4333 CC[2+bs*0] += a_2 * b_0;
4334 CC[3+bs*0] += a_3 * b_0;
4335
4336 A += 4;
4337 B += 1;
4338 k += 1;
4339
4340 if(k>=kmax)
4341 goto store;
4342
4343 // k = 1
4344
4345 a_0 = A[0];
4346 a_1 = A[1];
4347 a_2 = A[2];
4348 a_3 = A[3];
4349
4350 b_0 = B[0];
4351 CC[0+bs*0] += a_0 * b_0;
4352 CC[1+bs*0] += a_1 * b_0;
4353 CC[2+bs*0] += a_2 * b_0;
4354 CC[3+bs*0] += a_3 * b_0;
4355
4356 b_1 = B[4];
4357 CC[0+bs*1] += a_0 * b_1;
4358 CC[1+bs*1] += a_1 * b_1;
4359 CC[2+bs*1] += a_2 * b_1;
4360 CC[3+bs*1] += a_3 * b_1;
4361
4362 A += 4;
4363 B += 1;
4364 k += 1;
4365
4366 if(k>=kmax)
4367 goto store;
4368
4369 // k = 2
4370
4371 a_0 = A[0];
4372 a_1 = A[1];
4373 a_2 = A[2];
4374 a_3 = A[3];
4375
4376 b_0 = B[0];
4377 CC[0+bs*0] += a_0 * b_0;
4378 CC[1+bs*0] += a_1 * b_0;
4379 CC[2+bs*0] += a_2 * b_0;
4380 CC[3+bs*0] += a_3 * b_0;
4381
4382 b_1 = B[4];
4383 CC[0+bs*1] += a_0 * b_1;
4384 CC[1+bs*1] += a_1 * b_1;
4385 CC[2+bs*1] += a_2 * b_1;
4386 CC[3+bs*1] += a_3 * b_1;
4387
4388 b_2 = B[8];
4389 CC[0+bs*2] += a_0 * b_2;
4390 CC[1+bs*2] += a_1 * b_2;
4391 CC[2+bs*2] += a_2 * b_2;
4392 CC[3+bs*2] += a_3 * b_2;
4393
4394 A += 4;
4395 B += 4*sdb-3;
4396 k += 1;
4397
4398 }
4399 else if(offsetB==2)
4400 {
4401
4402 // k = 0
4403
4404 a_0 = A[0];
4405 a_1 = A[1];
4406 a_2 = A[2];
4407 a_3 = A[3];
4408
4409 b_0 = B[0];
4410 CC[0+bs*0] += a_0 * b_0;
4411 CC[1+bs*0] += a_1 * b_0;
4412 CC[2+bs*0] += a_2 * b_0;
4413 CC[3+bs*0] += a_3 * b_0;
4414
4415 A += 4;
4416 B += 1;
4417 k += 1;
4418
4419 if(k>=kmax)
4420 goto store;
4421
4422 // k = 1
4423
4424 a_0 = A[0];
4425 a_1 = A[1];
4426 a_2 = A[2];
4427 a_3 = A[3];
4428
4429 b_0 = B[0];
4430 CC[0+bs*0] += a_0 * b_0;
4431 CC[1+bs*0] += a_1 * b_0;
4432 CC[2+bs*0] += a_2 * b_0;
4433 CC[3+bs*0] += a_3 * b_0;
4434
4435 b_1 = B[4];
4436 CC[0+bs*1] += a_0 * b_1;
4437 CC[1+bs*1] += a_1 * b_1;
4438 CC[2+bs*1] += a_2 * b_1;
4439 CC[3+bs*1] += a_3 * b_1;
4440
4441 A += 4;
4442 B += 4*sdb-3;
4443 k += 1;
4444
4445 if(k>=kmax)
4446 goto store;
4447
4448 // k = 2
4449
4450 a_0 = A[0];
4451 a_1 = A[1];
4452 a_2 = A[2];
4453 a_3 = A[3];
4454
4455 b_0 = B[0];
4456 CC[0+bs*0] += a_0 * b_0;
4457 CC[1+bs*0] += a_1 * b_0;
4458 CC[2+bs*0] += a_2 * b_0;
4459 CC[3+bs*0] += a_3 * b_0;
4460
4461 b_1 = B[4];
4462 CC[0+bs*1] += a_0 * b_1;
4463 CC[1+bs*1] += a_1 * b_1;
4464 CC[2+bs*1] += a_2 * b_1;
4465 CC[3+bs*1] += a_3 * b_1;
4466
4467 b_2 = B[8];
4468 CC[0+bs*2] += a_0 * b_2;
4469 CC[1+bs*2] += a_1 * b_2;
4470 CC[2+bs*2] += a_2 * b_2;
4471 CC[3+bs*2] += a_3 * b_2;
4472
4473 A += 4;
4474 B += 1;
4475 k += 1;
4476
4477 if(k>=kmax)
4478 goto store;
4479
4480 // k = 3
4481
4482 a_0 = A[0];
4483 a_1 = A[1];
4484 a_2 = A[2];
4485 a_3 = A[3];
4486
4487 b_0 = B[0];
4488 CC[0+bs*0] += a_0 * b_0;
4489 CC[1+bs*0] += a_1 * b_0;
4490 CC[2+bs*0] += a_2 * b_0;
4491 CC[3+bs*0] += a_3 * b_0;
4492
4493 b_1 = B[4];
4494 CC[0+bs*1] += a_0 * b_1;
4495 CC[1+bs*1] += a_1 * b_1;
4496 CC[2+bs*1] += a_2 * b_1;
4497 CC[3+bs*1] += a_3 * b_1;
4498
4499 b_2 = B[8];
4500 CC[0+bs*2] += a_0 * b_2;
4501 CC[1+bs*2] += a_1 * b_2;
4502 CC[2+bs*2] += a_2 * b_2;
4503 CC[3+bs*2] += a_3 * b_2;
4504
4505 b_3 = B[12];
4506 CC[0+bs*3] += a_0 * b_3;
4507 CC[1+bs*3] += a_1 * b_3;
4508 CC[2+bs*3] += a_2 * b_3;
4509 CC[3+bs*3] += a_3 * b_3;
4510
4511 A += 4;
4512 B += 1;
4513 k += 1;
4514
4515 if(k>=kmax)
4516 goto store;
4517
4518 // k = 4
4519
4520 a_0 = A[0];
4521 a_1 = A[1];
4522 a_2 = A[2];
4523 a_3 = A[3];
4524
4525 b_0 = B[0];
4526 CC[0+bs*0] += a_0 * b_0;
4527 CC[1+bs*0] += a_1 * b_0;
4528 CC[2+bs*0] += a_2 * b_0;
4529 CC[3+bs*0] += a_3 * b_0;
4530
4531 b_1 = B[4];
4532 CC[0+bs*1] += a_0 * b_1;
4533 CC[1+bs*1] += a_1 * b_1;
4534 CC[2+bs*1] += a_2 * b_1;
4535 CC[3+bs*1] += a_3 * b_1;
4536
4537 b_2 = B[8];
4538 CC[0+bs*2] += a_0 * b_2;
4539 CC[1+bs*2] += a_1 * b_2;
4540 CC[2+bs*2] += a_2 * b_2;
4541 CC[3+bs*2] += a_3 * b_2;
4542
4543 b_3 = B[12];
4544 CC[0+bs*3] += a_0 * b_3;
4545 CC[1+bs*3] += a_1 * b_3;
4546 CC[2+bs*3] += a_2 * b_3;
4547 CC[3+bs*3] += a_3 * b_3;
4548
4549 A += 4;
4550 B += 1;
4551 k += 1;
4552
4553 if(k>=kmax)
4554 goto store;
4555
4556 // k = 5
4557
4558 a_0 = A[0];
4559 a_1 = A[1];
4560 a_2 = A[2];
4561 a_3 = A[3];
4562
4563 b_0 = B[0];
4564 CC[0+bs*0] += a_0 * b_0;
4565 CC[1+bs*0] += a_1 * b_0;
4566 CC[2+bs*0] += a_2 * b_0;
4567 CC[3+bs*0] += a_3 * b_0;
4568
4569 b_1 = B[4];
4570 CC[0+bs*1] += a_0 * b_1;
4571 CC[1+bs*1] += a_1 * b_1;
4572 CC[2+bs*1] += a_2 * b_1;
4573 CC[3+bs*1] += a_3 * b_1;
4574
4575 b_2 = B[8];
4576 CC[0+bs*2] += a_0 * b_2;
4577 CC[1+bs*2] += a_1 * b_2;
4578 CC[2+bs*2] += a_2 * b_2;
4579 CC[3+bs*2] += a_3 * b_2;
4580
4581 b_3 = B[12];
4582 CC[0+bs*3] += a_0 * b_3;
4583 CC[1+bs*3] += a_1 * b_3;
4584 CC[2+bs*3] += a_2 * b_3;
4585 CC[3+bs*3] += a_3 * b_3;
4586
4587 A += 4;
4588 B += 4*sdb-3;
4589 k += 1;
4590
4591 }
4592 else // if(offsetB==3)
4593 {
4594
4595 // k = 0
4596
4597 a_0 = A[0];
4598 a_1 = A[1];
4599 a_2 = A[2];
4600 a_3 = A[3];
4601
4602 b_0 = B[0];
4603 CC[0+bs*0] += a_0 * b_0;
4604 CC[1+bs*0] += a_1 * b_0;
4605 CC[2+bs*0] += a_2 * b_0;
4606 CC[3+bs*0] += a_3 * b_0;
4607
4608 A += 4;
4609 B += 4*sdb-3;
4610 k += 1;
4611
4612 if(k>=kmax)
4613 goto store;
4614
4615 // k = 1
4616
4617 a_0 = A[0];
4618 a_1 = A[1];
4619 a_2 = A[2];
4620 a_3 = A[3];
4621
4622 b_0 = B[0];
4623 CC[0+bs*0] += a_0 * b_0;
4624 CC[1+bs*0] += a_1 * b_0;
4625 CC[2+bs*0] += a_2 * b_0;
4626 CC[3+bs*0] += a_3 * b_0;
4627
4628 b_1 = B[4];
4629 CC[0+bs*1] += a_0 * b_1;
4630 CC[1+bs*1] += a_1 * b_1;
4631 CC[2+bs*1] += a_2 * b_1;
4632 CC[3+bs*1] += a_3 * b_1;
4633
4634 A += 4;
4635 B += 1;
4636 k += 1;
4637
4638 if(k>=kmax)
4639 goto store;
4640
4641 // k = 2
4642
4643 a_0 = A[0];
4644 a_1 = A[1];
4645 a_2 = A[2];
4646 a_3 = A[3];
4647
4648 b_0 = B[0];
4649 CC[0+bs*0] += a_0 * b_0;
4650 CC[1+bs*0] += a_1 * b_0;
4651 CC[2+bs*0] += a_2 * b_0;
4652 CC[3+bs*0] += a_3 * b_0;
4653
4654 b_1 = B[4];
4655 CC[0+bs*1] += a_0 * b_1;
4656 CC[1+bs*1] += a_1 * b_1;
4657 CC[2+bs*1] += a_2 * b_1;
4658 CC[3+bs*1] += a_3 * b_1;
4659
4660 b_2 = B[8];
4661 CC[0+bs*2] += a_0 * b_2;
4662 CC[1+bs*2] += a_1 * b_2;
4663 CC[2+bs*2] += a_2 * b_2;
4664 CC[3+bs*2] += a_3 * b_2;
4665
4666 A += 4;
4667 B += 1;
4668 k += 1;
4669
4670 if(k>=kmax)
4671 goto store;
4672
4673 // k = 3
4674
4675 a_0 = A[0];
4676 a_1 = A[1];
4677 a_2 = A[2];
4678 a_3 = A[3];
4679
4680 b_0 = B[0];
4681 CC[0+bs*0] += a_0 * b_0;
4682 CC[1+bs*0] += a_1 * b_0;
4683 CC[2+bs*0] += a_2 * b_0;
4684 CC[3+bs*0] += a_3 * b_0;
4685
4686 b_1 = B[4];
4687 CC[0+bs*1] += a_0 * b_1;
4688 CC[1+bs*1] += a_1 * b_1;
4689 CC[2+bs*1] += a_2 * b_1;
4690 CC[3+bs*1] += a_3 * b_1;
4691
4692 b_2 = B[8];
4693 CC[0+bs*2] += a_0 * b_2;
4694 CC[1+bs*2] += a_1 * b_2;
4695 CC[2+bs*2] += a_2 * b_2;
4696 CC[3+bs*2] += a_3 * b_2;
4697
4698 b_3 = B[12];
4699 CC[0+bs*3] += a_0 * b_3;
4700 CC[1+bs*3] += a_1 * b_3;
4701 CC[2+bs*3] += a_2 * b_3;
4702 CC[3+bs*3] += a_3 * b_3;
4703
4704 A += 4;
4705 B += 1;
4706 k += 1;
4707
4708 if(k>=kmax)
4709 goto store;
4710
4711 // k = 4
4712
4713 a_0 = A[0];
4714 a_1 = A[1];
4715 a_2 = A[2];
4716 a_3 = A[3];
4717
4718 b_0 = B[0];
4719 CC[0+bs*0] += a_0 * b_0;
4720 CC[1+bs*0] += a_1 * b_0;
4721 CC[2+bs*0] += a_2 * b_0;
4722 CC[3+bs*0] += a_3 * b_0;
4723
4724 b_1 = B[4];
4725 CC[0+bs*1] += a_0 * b_1;
4726 CC[1+bs*1] += a_1 * b_1;
4727 CC[2+bs*1] += a_2 * b_1;
4728 CC[3+bs*1] += a_3 * b_1;
4729
4730 b_2 = B[8];
4731 CC[0+bs*2] += a_0 * b_2;
4732 CC[1+bs*2] += a_1 * b_2;
4733 CC[2+bs*2] += a_2 * b_2;
4734 CC[3+bs*2] += a_3 * b_2;
4735
4736 b_3 = B[12];
4737 CC[0+bs*3] += a_0 * b_3;
4738 CC[1+bs*3] += a_1 * b_3;
4739 CC[2+bs*3] += a_2 * b_3;
4740 CC[3+bs*3] += a_3 * b_3;
4741
4742 A += 4;
4743 B += 4*sdb-3;
4744 k += 1;
4745
4746 }
4747
4748 store:
4749
4750 CC[0+bs*0] = alpha[0]*CC[0+bs*0];
4751 CC[1+bs*0] = alpha[0]*CC[1+bs*0];
4752 CC[2+bs*0] = alpha[0]*CC[2+bs*0];
4753 CC[3+bs*0] = alpha[0]*CC[3+bs*0];
4754
4755 CC[0+bs*1] = alpha[0]*CC[0+bs*1];
4756 CC[1+bs*1] = alpha[0]*CC[1+bs*1];
4757 CC[2+bs*1] = alpha[0]*CC[2+bs*1];
4758 CC[3+bs*1] = alpha[0]*CC[3+bs*1];
4759
4760 CC[0+bs*2] = alpha[0]*CC[0+bs*2];
4761 CC[1+bs*2] = alpha[0]*CC[1+bs*2];
4762 CC[2+bs*2] = alpha[0]*CC[2+bs*2];
4763 CC[3+bs*2] = alpha[0]*CC[3+bs*2];
4764
4765 CC[0+bs*3] = alpha[0]*CC[0+bs*3];
4766 CC[1+bs*3] = alpha[0]*CC[1+bs*3];
4767 CC[2+bs*3] = alpha[0]*CC[2+bs*3];
4768 CC[3+bs*3] = alpha[0]*CC[3+bs*3];
4769
4770 double beta1 = 1.0;
4771
4772 kernel_dgemm_nn_4x4_lib4(kmax-k, alpha, A, 0, B, sdb, &beta1, CC, CC);
4773
4774 if(m1>=4)
4775 {
4776 D[0+bs*0] = CC[0+bs*0];
4777 D[1+bs*0] = CC[1+bs*0];
4778 D[2+bs*0] = CC[2+bs*0];
4779 D[3+bs*0] = CC[3+bs*0];
4780
4781 if(n1==1)
4782 return;
4783
4784 D[0+bs*1] = CC[0+bs*1];
4785 D[1+bs*1] = CC[1+bs*1];
4786 D[2+bs*1] = CC[2+bs*1];
4787 D[3+bs*1] = CC[3+bs*1];
4788
4789 if(n1==2)
4790 return;
4791
4792 D[0+bs*2] = CC[0+bs*2];
4793 D[1+bs*2] = CC[1+bs*2];
4794 D[2+bs*2] = CC[2+bs*2];
4795 D[3+bs*2] = CC[3+bs*2];
4796
4797 if(n1==3)
4798 return;
4799
4800 D[0+bs*3] = CC[0+bs*3];
4801 D[1+bs*3] = CC[1+bs*3];
4802 D[2+bs*3] = CC[2+bs*3];
4803 D[3+bs*3] = CC[3+bs*3];
4804 }
4805 else if(m1>=3)
4806 {
4807 D[0+bs*0] = CC[0+bs*0];
4808 D[1+bs*0] = CC[1+bs*0];
4809 D[2+bs*0] = CC[2+bs*0];
4810
4811 if(n1==1)
4812 return;
4813
4814 D[0+bs*1] = CC[0+bs*1];
4815 D[1+bs*1] = CC[1+bs*1];
4816 D[2+bs*1] = CC[2+bs*1];
4817
4818 if(n1==2)
4819 return;
4820
4821 D[0+bs*2] = CC[0+bs*2];
4822 D[1+bs*2] = CC[1+bs*2];
4823 D[2+bs*2] = CC[2+bs*2];
4824
4825 if(n1==3)
4826 return;
4827
4828 D[0+bs*3] = CC[0+bs*3];
4829 D[1+bs*3] = CC[1+bs*3];
4830 D[2+bs*3] = CC[2+bs*3];
4831 }
4832 else if(m1>=2)
4833 {
4834 D[0+bs*0] = CC[0+bs*0];
4835 D[1+bs*0] = CC[1+bs*0];
4836
4837 if(n1==1)
4838 return;
4839
4840 D[0+bs*1] = CC[0+bs*1];
4841 D[1+bs*1] = CC[1+bs*1];
4842
4843 if(n1==2)
4844 return;
4845
4846 D[0+bs*2] = CC[0+bs*2];
4847 D[1+bs*2] = CC[1+bs*2];
4848
4849 if(n1==3)
4850 return;
4851
4852 D[0+bs*3] = CC[0+bs*3];
4853 D[1+bs*3] = CC[1+bs*3];
4854 }
4855 else //if(m1>=1)
4856 {
4857 D[0+bs*0] = CC[0+bs*0];
4858
4859 if(n1==1)
4860 return;
4861
4862 D[0+bs*1] = CC[0+bs*1];
4863
4864 if(n1==2)
4865 return;
4866
4867 D[0+bs*2] = CC[0+bs*2];
4868
4869 if(n1==3)
4870 return;
4871
4872 D[0+bs*3] = CC[0+bs*3];
4873 }
4874
4875 return;
4876
4877 }
4878 #endif
4879
4880
4881
4882 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
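// generalized variant: D may start at an arbitrary row offset (offsetD) inside a 4-row panel
// (D0/D1 with stride sdd); m0/m1 and n0/n1 mask the rows and columns actually written.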
4883 void kernel_dtrmm_nn_rl_4x4_gen_lib4(int kmax, double *alpha, double *A, int offsetB, double *B, int sdb, int offsetD, double *D0, int sdd, int m0, int m1, int n0, int n1)
4884 {
4885
4886 const int bs = 4;
4887
4888 double
4889 a_0, a_1, a_2, a_3,
4890 b_0, b_1, b_2, b_3;
4891
4892 #if defined(TARGET_GENERIC)
4893 double CC[16] = {0};
4894 #else
4895 ALIGNED( double CC[16], 64 ) = {0};
4896 #endif
4897
4898 double *D1;
4899
4900 int k;
4901
4902 B += offsetB;
4903
4904 k = 0;
4905
4906 if(offsetB==0)
4907 {
4908
4909 // k = 0
4910
4911 a_0 = A[0];
4912 a_1 = A[1];
4913 a_2 = A[2];
4914 a_3 = A[3];
4915
4916 b_0 = B[0];
4917 CC[0+bs*0] += a_0 * b_0;
4918 CC[1+bs*0] += a_1 * b_0;
4919 CC[2+bs*0] += a_2 * b_0;
4920 CC[3+bs*0] += a_3 * b_0;
4921
4922 A += 4;
4923 B += 1;
4924 k += 1;
4925
4926 if(k>=kmax)
4927 goto store;
4928
4929 // k = 1
4930
4931 a_0 = A[0];
4932 a_1 = A[1];
4933 a_2 = A[2];
4934 a_3 = A[3];
4935
4936 b_0 = B[0];
4937 CC[0+bs*0] += a_0 * b_0;
4938 CC[1+bs*0] += a_1 * b_0;
4939 CC[2+bs*0] += a_2 * b_0;
4940 CC[3+bs*0] += a_3 * b_0;
4941
4942 b_1 = B[4];
4943 CC[0+bs*1] += a_0 * b_1;
4944 CC[1+bs*1] += a_1 * b_1;
4945 CC[2+bs*1] += a_2 * b_1;
4946 CC[3+bs*1] += a_3 * b_1;
4947
4948 A += 4;
4949 B += 1;
4950 k += 1;
4951
4952 if(k>=kmax)
4953 goto store;
4954
4955 // k = 2
4956
4957 a_0 = A[0];
4958 a_1 = A[1];
4959 a_2 = A[2];
4960 a_3 = A[3];
4961
4962 b_0 = B[0];
4963 CC[0+bs*0] += a_0 * b_0;
4964 CC[1+bs*0] += a_1 * b_0;
4965 CC[2+bs*0] += a_2 * b_0;
4966 CC[3+bs*0] += a_3 * b_0;
4967
4968 b_1 = B[4];
4969 CC[0+bs*1] += a_0 * b_1;
4970 CC[1+bs*1] += a_1 * b_1;
4971 CC[2+bs*1] += a_2 * b_1;
4972 CC[3+bs*1] += a_3 * b_1;
4973
4974 b_2 = B[8];
4975 CC[0+bs*2] += a_0 * b_2;
4976 CC[1+bs*2] += a_1 * b_2;
4977 CC[2+bs*2] += a_2 * b_2;
4978 CC[3+bs*2] += a_3 * b_2;
4979
4980 A += 4;
4981 B += 1;
4982 k += 1;
4983
4984 if(k>=kmax)
4985 goto store;
4986
4987 // k = 3
4988
4989 a_0 = A[0];
4990 a_1 = A[1];
4991 a_2 = A[2];
4992 a_3 = A[3];
4993
4994 b_0 = B[0];
4995 CC[0+bs*0] += a_0 * b_0;
4996 CC[1+bs*0] += a_1 * b_0;
4997 CC[2+bs*0] += a_2 * b_0;
4998 CC[3+bs*0] += a_3 * b_0;
4999
5000 b_1 = B[4];
5001 CC[0+bs*1] += a_0 * b_1;
5002 CC[1+bs*1] += a_1 * b_1;
5003 CC[2+bs*1] += a_2 * b_1;
5004 CC[3+bs*1] += a_3 * b_1;
5005
5006 b_2 = B[8];
5007 CC[0+bs*2] += a_0 * b_2;
5008 CC[1+bs*2] += a_1 * b_2;
5009 CC[2+bs*2] += a_2 * b_2;
5010 CC[3+bs*2] += a_3 * b_2;
5011
5012 b_3 = B[12];
5013 CC[0+bs*3] += a_0 * b_3;
5014 CC[1+bs*3] += a_1 * b_3;
5015 CC[2+bs*3] += a_2 * b_3;
5016 CC[3+bs*3] += a_3 * b_3;
5017
5018 A += 4;
5019 B += 4*sdb-3;
5020 k += 1;
5021
5022 }
5023 else if(offsetB==1)
5024 {
5025
5026 // k = 0
5027
5028 a_0 = A[0];
5029 a_1 = A[1];
5030 a_2 = A[2];
5031 a_3 = A[3];
5032
5033 b_0 = B[0];
5034 CC[0+bs*0] += a_0 * b_0;
5035 CC[1+bs*0] += a_1 * b_0;
5036 CC[2+bs*0] += a_2 * b_0;
5037 CC[3+bs*0] += a_3 * b_0;
5038
5039 A += 4;
5040 B += 1;
5041 k += 1;
5042
5043 if(k>=kmax)
5044 goto store;
5045
5046 // k = 1
5047
5048 a_0 = A[0];
5049 a_1 = A[1];
5050 a_2 = A[2];
5051 a_3 = A[3];
5052
5053 b_0 = B[0];
5054 CC[0+bs*0] += a_0 * b_0;
5055 CC[1+bs*0] += a_1 * b_0;
5056 CC[2+bs*0] += a_2 * b_0;
5057 CC[3+bs*0] += a_3 * b_0;
5058
5059 b_1 = B[4];
5060 CC[0+bs*1] += a_0 * b_1;
5061 CC[1+bs*1] += a_1 * b_1;
5062 CC[2+bs*1] += a_2 * b_1;
5063 CC[3+bs*1] += a_3 * b_1;
5064
5065 A += 4;
5066 B += 1;
5067 k += 1;
5068
5069 if(k>=kmax)
5070 goto store;
5071
5072 // k = 2
5073
5074 a_0 = A[0];
5075 a_1 = A[1];
5076 a_2 = A[2];
5077 a_3 = A[3];
5078
5079 b_0 = B[0];
5080 CC[0+bs*0] += a_0 * b_0;
5081 CC[1+bs*0] += a_1 * b_0;
5082 CC[2+bs*0] += a_2 * b_0;
5083 CC[3+bs*0] += a_3 * b_0;
5084
5085 b_1 = B[4];
5086 CC[0+bs*1] += a_0 * b_1;
5087 CC[1+bs*1] += a_1 * b_1;
5088 CC[2+bs*1] += a_2 * b_1;
5089 CC[3+bs*1] += a_3 * b_1;
5090
5091 b_2 = B[8];
5092 CC[0+bs*2] += a_0 * b_2;
5093 CC[1+bs*2] += a_1 * b_2;
5094 CC[2+bs*2] += a_2 * b_2;
5095 CC[3+bs*2] += a_3 * b_2;
5096
5097 A += 4;
5098 B += 4*sdb-3;
5099 k += 1;
5100
5101 }
5102 else if(offsetB==2)
5103 {
5104
5105 // k = 0
5106
5107 a_0 = A[0];
5108 a_1 = A[1];
5109 a_2 = A[2];
5110 a_3 = A[3];
5111
5112 b_0 = B[0];
5113 CC[0+bs*0] += a_0 * b_0;
5114 CC[1+bs*0] += a_1 * b_0;
5115 CC[2+bs*0] += a_2 * b_0;
5116 CC[3+bs*0] += a_3 * b_0;
5117
5118 A += 4;
5119 B += 1;
5120 k += 1;
5121
5122 if(k>=kmax)
5123 goto store;
5124
5125 // k = 1
5126
5127 a_0 = A[0];
5128 a_1 = A[1];
5129 a_2 = A[2];
5130 a_3 = A[3];
5131
5132 b_0 = B[0];
5133 CC[0+bs*0] += a_0 * b_0;
5134 CC[1+bs*0] += a_1 * b_0;
5135 CC[2+bs*0] += a_2 * b_0;
5136 CC[3+bs*0] += a_3 * b_0;
5137
5138 b_1 = B[4];
5139 CC[0+bs*1] += a_0 * b_1;
5140 CC[1+bs*1] += a_1 * b_1;
5141 CC[2+bs*1] += a_2 * b_1;
5142 CC[3+bs*1] += a_3 * b_1;
5143
5144 A += 4;
5145 B += 4*sdb-3;
5146 k += 1;
5147
5148 if(k>=kmax)
5149 goto store;
5150
5151 // k = 2
5152
5153 a_0 = A[0];
5154 a_1 = A[1];
5155 a_2 = A[2];
5156 a_3 = A[3];
5157
5158 b_0 = B[0];
5159 CC[0+bs*0] += a_0 * b_0;
5160 CC[1+bs*0] += a_1 * b_0;
5161 CC[2+bs*0] += a_2 * b_0;
5162 CC[3+bs*0] += a_3 * b_0;
5163
5164 b_1 = B[4];
5165 CC[0+bs*1] += a_0 * b_1;
5166 CC[1+bs*1] += a_1 * b_1;
5167 CC[2+bs*1] += a_2 * b_1;
5168 CC[3+bs*1] += a_3 * b_1;
5169
5170 b_2 = B[8];
5171 CC[0+bs*2] += a_0 * b_2;
5172 CC[1+bs*2] += a_1 * b_2;
5173 CC[2+bs*2] += a_2 * b_2;
5174 CC[3+bs*2] += a_3 * b_2;
5175
5176 A += 4;
5177 B += 1;
5178 k += 1;
5179
5180 if(k>=kmax)
5181 goto store;
5182
5183 // k = 3
5184
5185 a_0 = A[0];
5186 a_1 = A[1];
5187 a_2 = A[2];
5188 a_3 = A[3];
5189
5190 b_0 = B[0];
5191 CC[0+bs*0] += a_0 * b_0;
5192 CC[1+bs*0] += a_1 * b_0;
5193 CC[2+bs*0] += a_2 * b_0;
5194 CC[3+bs*0] += a_3 * b_0;
5195
5196 b_1 = B[4];
5197 CC[0+bs*1] += a_0 * b_1;
5198 CC[1+bs*1] += a_1 * b_1;
5199 CC[2+bs*1] += a_2 * b_1;
5200 CC[3+bs*1] += a_3 * b_1;
5201
5202 b_2 = B[8];
5203 CC[0+bs*2] += a_0 * b_2;
5204 CC[1+bs*2] += a_1 * b_2;
5205 CC[2+bs*2] += a_2 * b_2;
5206 CC[3+bs*2] += a_3 * b_2;
5207
5208 b_3 = B[12];
5209 CC[0+bs*3] += a_0 * b_3;
5210 CC[1+bs*3] += a_1 * b_3;
5211 CC[2+bs*3] += a_2 * b_3;
5212 CC[3+bs*3] += a_3 * b_3;
5213
5214 A += 4;
5215 B += 1;
5216 k += 1;
5217
5218 if(k>=kmax)
5219 goto store;
5220
5221 // k = 4
5222
5223 a_0 = A[0];
5224 a_1 = A[1];
5225 a_2 = A[2];
5226 a_3 = A[3];
5227
5228 b_0 = B[0];
5229 CC[0+bs*0] += a_0 * b_0;
5230 CC[1+bs*0] += a_1 * b_0;
5231 CC[2+bs*0] += a_2 * b_0;
5232 CC[3+bs*0] += a_3 * b_0;
5233
5234 b_1 = B[4];
5235 CC[0+bs*1] += a_0 * b_1;
5236 CC[1+bs*1] += a_1 * b_1;
5237 CC[2+bs*1] += a_2 * b_1;
5238 CC[3+bs*1] += a_3 * b_1;
5239
5240 b_2 = B[8];
5241 CC[0+bs*2] += a_0 * b_2;
5242 CC[1+bs*2] += a_1 * b_2;
5243 CC[2+bs*2] += a_2 * b_2;
5244 CC[3+bs*2] += a_3 * b_2;
5245
5246 b_3 = B[12];
5247 CC[0+bs*3] += a_0 * b_3;
5248 CC[1+bs*3] += a_1 * b_3;
5249 CC[2+bs*3] += a_2 * b_3;
5250 CC[3+bs*3] += a_3 * b_3;
5251
5252 A += 4;
5253 B += 1;
5254 k += 1;
5255
5256 if(k>=kmax)
5257 goto store;
5258
5259 // k = 5
5260
5261 a_0 = A[0];
5262 a_1 = A[1];
5263 a_2 = A[2];
5264 a_3 = A[3];
5265
5266 b_0 = B[0];
5267 CC[0+bs*0] += a_0 * b_0;
5268 CC[1+bs*0] += a_1 * b_0;
5269 CC[2+bs*0] += a_2 * b_0;
5270 CC[3+bs*0] += a_3 * b_0;
5271
5272 b_1 = B[4];
5273 CC[0+bs*1] += a_0 * b_1;
5274 CC[1+bs*1] += a_1 * b_1;
5275 CC[2+bs*1] += a_2 * b_1;
5276 CC[3+bs*1] += a_3 * b_1;
5277
5278 b_2 = B[8];
5279 CC[0+bs*2] += a_0 * b_2;
5280 CC[1+bs*2] += a_1 * b_2;
5281 CC[2+bs*2] += a_2 * b_2;
5282 CC[3+bs*2] += a_3 * b_2;
5283
5284 b_3 = B[12];
5285 CC[0+bs*3] += a_0 * b_3;
5286 CC[1+bs*3] += a_1 * b_3;
5287 CC[2+bs*3] += a_2 * b_3;
5288 CC[3+bs*3] += a_3 * b_3;
5289
5290 A += 4;
5291 B += 4*sdb-3;
5292 k += 1;
5293
5294 }
5295 	else // if(offsetB==3)
5296 {
5297
5298 // k = 0
5299
5300 a_0 = A[0];
5301 a_1 = A[1];
5302 a_2 = A[2];
5303 a_3 = A[3];
5304
5305 b_0 = B[0];
5306 CC[0+bs*0] += a_0 * b_0;
5307 CC[1+bs*0] += a_1 * b_0;
5308 CC[2+bs*0] += a_2 * b_0;
5309 CC[3+bs*0] += a_3 * b_0;
5310
5311 A += 4;
5312 B += 4*sdb-3;
5313 k += 1;
5314
5315 if(k>=kmax)
5316 goto store;
5317
5318 // k = 1
5319
5320 a_0 = A[0];
5321 a_1 = A[1];
5322 a_2 = A[2];
5323 a_3 = A[3];
5324
5325 b_0 = B[0];
5326 CC[0+bs*0] += a_0 * b_0;
5327 CC[1+bs*0] += a_1 * b_0;
5328 CC[2+bs*0] += a_2 * b_0;
5329 CC[3+bs*0] += a_3 * b_0;
5330
5331 b_1 = B[4];
5332 CC[0+bs*1] += a_0 * b_1;
5333 CC[1+bs*1] += a_1 * b_1;
5334 CC[2+bs*1] += a_2 * b_1;
5335 CC[3+bs*1] += a_3 * b_1;
5336
5337 A += 4;
5338 B += 1;
5339 k += 1;
5340
5341 if(k>=kmax)
5342 goto store;
5343
5344 // k = 2
5345
5346 a_0 = A[0];
5347 a_1 = A[1];
5348 a_2 = A[2];
5349 a_3 = A[3];
5350
5351 b_0 = B[0];
5352 CC[0+bs*0] += a_0 * b_0;
5353 CC[1+bs*0] += a_1 * b_0;
5354 CC[2+bs*0] += a_2 * b_0;
5355 CC[3+bs*0] += a_3 * b_0;
5356
5357 b_1 = B[4];
5358 CC[0+bs*1] += a_0 * b_1;
5359 CC[1+bs*1] += a_1 * b_1;
5360 CC[2+bs*1] += a_2 * b_1;
5361 CC[3+bs*1] += a_3 * b_1;
5362
5363 b_2 = B[8];
5364 CC[0+bs*2] += a_0 * b_2;
5365 CC[1+bs*2] += a_1 * b_2;
5366 CC[2+bs*2] += a_2 * b_2;
5367 CC[3+bs*2] += a_3 * b_2;
5368
5369 A += 4;
5370 B += 1;
5371 k += 1;
5372
5373 if(k>=kmax)
5374 goto store;
5375
5376 // k = 3
5377
5378 a_0 = A[0];
5379 a_1 = A[1];
5380 a_2 = A[2];
5381 a_3 = A[3];
5382
5383 b_0 = B[0];
5384 CC[0+bs*0] += a_0 * b_0;
5385 CC[1+bs*0] += a_1 * b_0;
5386 CC[2+bs*0] += a_2 * b_0;
5387 CC[3+bs*0] += a_3 * b_0;
5388
5389 b_1 = B[4];
5390 CC[0+bs*1] += a_0 * b_1;
5391 CC[1+bs*1] += a_1 * b_1;
5392 CC[2+bs*1] += a_2 * b_1;
5393 CC[3+bs*1] += a_3 * b_1;
5394
5395 b_2 = B[8];
5396 CC[0+bs*2] += a_0 * b_2;
5397 CC[1+bs*2] += a_1 * b_2;
5398 CC[2+bs*2] += a_2 * b_2;
5399 CC[3+bs*2] += a_3 * b_2;
5400
5401 b_3 = B[12];
5402 CC[0+bs*3] += a_0 * b_3;
5403 CC[1+bs*3] += a_1 * b_3;
5404 CC[2+bs*3] += a_2 * b_3;
5405 CC[3+bs*3] += a_3 * b_3;
5406
5407 A += 4;
5408 B += 1;
5409 k += 1;
5410
5411 if(k>=kmax)
5412 goto store;
5413
5414 // k = 4
5415
5416 a_0 = A[0];
5417 a_1 = A[1];
5418 a_2 = A[2];
5419 a_3 = A[3];
5420
5421 b_0 = B[0];
5422 CC[0+bs*0] += a_0 * b_0;
5423 CC[1+bs*0] += a_1 * b_0;
5424 CC[2+bs*0] += a_2 * b_0;
5425 CC[3+bs*0] += a_3 * b_0;
5426
5427 b_1 = B[4];
5428 CC[0+bs*1] += a_0 * b_1;
5429 CC[1+bs*1] += a_1 * b_1;
5430 CC[2+bs*1] += a_2 * b_1;
5431 CC[3+bs*1] += a_3 * b_1;
5432
5433 b_2 = B[8];
5434 CC[0+bs*2] += a_0 * b_2;
5435 CC[1+bs*2] += a_1 * b_2;
5436 CC[2+bs*2] += a_2 * b_2;
5437 CC[3+bs*2] += a_3 * b_2;
5438
5439 b_3 = B[12];
5440 CC[0+bs*3] += a_0 * b_3;
5441 CC[1+bs*3] += a_1 * b_3;
5442 CC[2+bs*3] += a_2 * b_3;
5443 CC[3+bs*3] += a_3 * b_3;
5444
5445 A += 4;
5446 B += 4*sdb-3;
5447 k += 1;
5448
5449 }
5450
5451 store:
5452
5453 CC[0+bs*0] = alpha[0]*CC[0+bs*0];
5454 CC[1+bs*0] = alpha[0]*CC[1+bs*0];
5455 CC[2+bs*0] = alpha[0]*CC[2+bs*0];
5456 CC[3+bs*0] = alpha[0]*CC[3+bs*0];
5457
5458 CC[0+bs*1] = alpha[0]*CC[0+bs*1];
5459 CC[1+bs*1] = alpha[0]*CC[1+bs*1];
5460 CC[2+bs*1] = alpha[0]*CC[2+bs*1];
5461 CC[3+bs*1] = alpha[0]*CC[3+bs*1];
5462
5463 CC[0+bs*2] = alpha[0]*CC[0+bs*2];
5464 CC[1+bs*2] = alpha[0]*CC[1+bs*2];
5465 CC[2+bs*2] = alpha[0]*CC[2+bs*2];
5466 CC[3+bs*2] = alpha[0]*CC[3+bs*2];
5467
5468 CC[0+bs*3] = alpha[0]*CC[0+bs*3];
5469 CC[1+bs*3] = alpha[0]*CC[1+bs*3];
5470 CC[2+bs*3] = alpha[0]*CC[2+bs*3];
5471 CC[3+bs*3] = alpha[0]*CC[3+bs*3];
5472
5473 double beta1 = 1.0;
5474
5475 kernel_dgemm_nn_4x4_lib4(kmax-k, alpha, A, 0, B, sdb, &beta1, CC, CC);
5476
5477 	// shift the solution left according to the column offset n0
5478 if(n0>0)
5479 {
5480 if(n0==1)
5481 {
5482 CC[0+bs*0] = CC[0+bs*1];
5483 CC[1+bs*0] = CC[1+bs*1];
5484 CC[2+bs*0] = CC[2+bs*1];
5485 CC[3+bs*0] = CC[3+bs*1];
5486
5487 CC[0+bs*1] = CC[0+bs*2];
5488 CC[1+bs*1] = CC[1+bs*2];
5489 CC[2+bs*1] = CC[2+bs*2];
5490 CC[3+bs*1] = CC[3+bs*2];
5491
5492 CC[0+bs*2] = CC[0+bs*3];
5493 CC[1+bs*2] = CC[1+bs*3];
5494 CC[2+bs*2] = CC[2+bs*3];
5495 CC[3+bs*2] = CC[3+bs*3];
5496
5497 D0 += 1*bs;
5498 }
5499 else if(n0==2)
5500 {
5501 CC[0+bs*0] = CC[0+bs*2];
5502 CC[1+bs*0] = CC[1+bs*2];
5503 CC[2+bs*0] = CC[2+bs*2];
5504 CC[3+bs*0] = CC[3+bs*2];
5505
5506 CC[0+bs*1] = CC[0+bs*3];
5507 CC[1+bs*1] = CC[1+bs*3];
5508 CC[2+bs*1] = CC[2+bs*3];
5509 CC[3+bs*1] = CC[3+bs*3];
5510
5511 D0 += 2*bs;
5512 }
5513 else //if(n0==3)
5514 {
5515 CC[0+bs*0] = CC[0+bs*3];
5516 CC[1+bs*0] = CC[1+bs*3];
5517 CC[2+bs*0] = CC[2+bs*3];
5518 CC[3+bs*0] = CC[3+bs*3];
5519
5520 D0 += 3*bs;
5521 }
5522 }
5523
5524 n1 = 4<n1 ? 4 : n1;
5525 int kn = n1 - n0;
5526
5527 if(offsetD==0)
5528 {
5529 if(kn<=0)
5530 return;
5531
5532 if(m0<=0 & m1>0) D0[0+bs*0] = CC[0+bs*0];
5533 if(m0<=1 & m1>1) D0[1+bs*0] = CC[1+bs*0];
5534 if(m0<=2 & m1>2) D0[2+bs*0] = CC[2+bs*0];
5535 if(m0<=3 & m1>3) D0[3+bs*0] = CC[3+bs*0];
5536
5537 if(kn<=1)
5538 return;
5539
5540 if(m0<=0 & m1>0) D0[0+bs*1] = CC[0+bs*1];
5541 if(m0<=1 & m1>1) D0[1+bs*1] = CC[1+bs*1];
5542 if(m0<=2 & m1>2) D0[2+bs*1] = CC[2+bs*1];
5543 if(m0<=3 & m1>3) D0[3+bs*1] = CC[3+bs*1];
5544
5545 if(kn<=2)
5546 return;
5547
5548 if(m0<=0 & m1>0) D0[0+bs*2] = CC[0+bs*2];
5549 if(m0<=1 & m1>1) D0[1+bs*2] = CC[1+bs*2];
5550 if(m0<=2 & m1>2) D0[2+bs*2] = CC[2+bs*2];
5551 if(m0<=3 & m1>3) D0[3+bs*2] = CC[3+bs*2];
5552
5553 if(kn<=3)
5554 return;
5555
5556 if(m0<=0 & m1>0) D0[0+bs*3] = CC[0+bs*3];
5557 if(m0<=1 & m1>1) D0[1+bs*3] = CC[1+bs*3];
5558 if(m0<=2 & m1>2) D0[2+bs*3] = CC[2+bs*3];
5559 if(m0<=3 & m1>3) D0[3+bs*3] = CC[3+bs*3];
5560 }
5561 else if(offsetD==1)
5562 {
5563 D1 = D0 + sdd*bs;
5564
5565 if(kn<=0)
5566 return;
5567
5568 if(m0<=0 & m1>0) D0[1+bs*0] = CC[0+bs*0];
5569 if(m0<=1 & m1>1) D0[2+bs*0] = CC[1+bs*0];
5570 if(m0<=2 & m1>2) D0[3+bs*0] = CC[2+bs*0];
5571 if(m0<=3 & m1>3) D1[0+bs*0] = CC[3+bs*0];
5572
5573 if(kn<=1)
5574 return;
5575
5576 if(m0<=0 & m1>0) D0[1+bs*1] = CC[0+bs*1];
5577 if(m0<=1 & m1>1) D0[2+bs*1] = CC[1+bs*1];
5578 if(m0<=2 & m1>2) D0[3+bs*1] = CC[2+bs*1];
5579 if(m0<=3 & m1>3) D1[0+bs*1] = CC[3+bs*1];
5580
5581 if(kn<=2)
5582 return;
5583
5584 if(m0<=0 & m1>0) D0[1+bs*2] = CC[0+bs*2];
5585 if(m0<=1 & m1>1) D0[2+bs*2] = CC[1+bs*2];
5586 if(m0<=2 & m1>2) D0[3+bs*2] = CC[2+bs*2];
5587 if(m0<=3 & m1>3) D1[0+bs*2] = CC[3+bs*2];
5588
5589 if(kn<=3)
5590 return;
5591
5592 if(m0<=0 & m1>0) D0[1+bs*3] = CC[0+bs*3];
5593 if(m0<=1 & m1>1) D0[2+bs*3] = CC[1+bs*3];
5594 if(m0<=2 & m1>2) D0[3+bs*3] = CC[2+bs*3];
5595 if(m0<=3 & m1>3) D1[0+bs*3] = CC[3+bs*3];
5596 }
5597 else if(offsetD==2)
5598 {
5599 D1 = D0 + sdd*bs;
5600
5601 if(kn<=0)
5602 return;
5603
5604 if(m0<=0 & m1>0) D0[2+bs*0] = CC[0+bs*0];
5605 if(m0<=1 & m1>1) D0[3+bs*0] = CC[1+bs*0];
5606 if(m0<=2 & m1>2) D1[0+bs*0] = CC[2+bs*0];
5607 if(m0<=3 & m1>3) D1[1+bs*0] = CC[3+bs*0];
5608
5609 if(kn<=1)
5610 return;
5611
5612 if(m0<=0 & m1>0) D0[2+bs*1] = CC[0+bs*1];
5613 if(m0<=1 & m1>1) D0[3+bs*1] = CC[1+bs*1];
5614 if(m0<=2 & m1>2) D1[0+bs*1] = CC[2+bs*1];
5615 if(m0<=3 & m1>3) D1[1+bs*1] = CC[3+bs*1];
5616
5617 if(kn<=2)
5618 return;
5619
5620 if(m0<=0 & m1>0) D0[2+bs*2] = CC[0+bs*2];
5621 if(m0<=1 & m1>1) D0[3+bs*2] = CC[1+bs*2];
5622 if(m0<=2 & m1>2) D1[0+bs*2] = CC[2+bs*2];
5623 if(m0<=3 & m1>3) D1[1+bs*2] = CC[3+bs*2];
5624
5625 if(kn<=3)
5626 return;
5627
5628 if(m0<=0 & m1>0) D0[2+bs*3] = CC[0+bs*3];
5629 if(m0<=1 & m1>1) D0[3+bs*3] = CC[1+bs*3];
5630 if(m0<=2 & m1>2) D1[0+bs*3] = CC[2+bs*3];
5631 if(m0<=3 & m1>3) D1[1+bs*3] = CC[3+bs*3];
5632 }
5633 else //if(offsetD==3)
5634 {
5635 D1 = D0 + sdd*bs;
5636
5637 if(kn<=0)
5638 return;
5639
5640 if(m0<=0 & m1>0) D0[3+bs*0] = CC[0+bs*0];
5641 if(m0<=1 & m1>1) D1[0+bs*0] = CC[1+bs*0];
5642 if(m0<=2 & m1>2) D1[1+bs*0] = CC[2+bs*0];
5643 if(m0<=3 & m1>3) D1[2+bs*0] = CC[3+bs*0];
5644
5645 if(kn<=1)
5646 return;
5647
5648 if(m0<=0 & m1>0) D0[3+bs*1] = CC[0+bs*1];
5649 if(m0<=1 & m1>1) D1[0+bs*1] = CC[1+bs*1];
5650 if(m0<=2 & m1>2) D1[1+bs*1] = CC[2+bs*1];
5651 if(m0<=3 & m1>3) D1[2+bs*1] = CC[3+bs*1];
5652
5653 if(kn<=2)
5654 return;
5655
5656 if(m0<=0 & m1>0) D0[3+bs*2] = CC[0+bs*2];
5657 if(m0<=1 & m1>1) D1[0+bs*2] = CC[1+bs*2];
5658 if(m0<=2 & m1>2) D1[1+bs*2] = CC[2+bs*2];
5659 if(m0<=3 & m1>3) D1[2+bs*2] = CC[3+bs*2];
5660
5661 if(kn<=3)
5662 return;
5663
5664 if(m0<=0 & m1>0) D0[3+bs*3] = CC[0+bs*3];
5665 if(m0<=1 & m1>1) D1[0+bs*3] = CC[1+bs*3];
5666 if(m0<=2 & m1>2) D1[1+bs*3] = CC[2+bs*3];
5667 if(m0<=3 & m1>3) D1[2+bs*3] = CC[3+bs*3];
5668 }
5669
5670 return;
5671
5672 }
5673 #endif
5674
5675
5676
5677 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR)
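// Cholesky factorization of a 4x4 diagonal block: D = chol(C - A * B^T), lower
// factor only; the reciprocals of the diagonal of D are returned in inv_diag_D
// (0.0 is stored when a pivot is not strictly positive).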
5678 void kernel_dpotrf_nt_l_4x4_lib4(int kmax, double *A, double *B, double *C, double *D, double *inv_diag_D)
5679 {
5680
5681 const int bs = 4;
5682
5683 double
5684 a_0, a_1, a_2, a_3,
5685 b_0, b_1, b_2, b_3,
5686 tmp;
5687
5688 #if defined(TARGET_GENERIC)
5689 double CC[16] = {0};
5690 #else
5691 ALIGNED( double CC[16], 64 ) = {0};
5692 #endif
5693
5694 int k;
5695
5696 double alpha1 = -1.0;
5697 double beta1 = 1.0;
5698
5699 kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, &beta1, C, CC);
5700
5701 if(CC[0+bs*0]>0)
5702 {
5703 CC[0+bs*0] = sqrt(CC[0+bs*0]);
5704 tmp = 1.0/CC[0+bs*0];
5705 }
5706 else
5707 {
5708 CC[0+bs*0] = 0.0;
5709 tmp = 0.0;
5710 }
5711 CC[1+bs*0] *= tmp;
5712 CC[2+bs*0] *= tmp;
5713 CC[3+bs*0] *= tmp;
5714 inv_diag_D[0] = tmp;
5715
5716 CC[1+bs*1] -= CC[1+bs*0] * CC[1+bs*0];
5717 CC[2+bs*1] -= CC[2+bs*0] * CC[1+bs*0];
5718 CC[3+bs*1] -= CC[3+bs*0] * CC[1+bs*0];
5719 if(CC[1+bs*1]>0)
5720 {
5721 CC[1+bs*1] = sqrt(CC[1+bs*1]);
5722 tmp = 1.0/CC[1+bs*1];
5723 }
5724 else
5725 {
5726 CC[1+bs*1] = 0.0;
5727 tmp = 0.0;
5728 }
5729 CC[2+bs*1] *= tmp;
5730 CC[3+bs*1] *= tmp;
5731 inv_diag_D[1] = tmp;
5732
5733 CC[2+bs*2] -= CC[2+bs*0] * CC[2+bs*0];
5734 CC[3+bs*2] -= CC[3+bs*0] * CC[2+bs*0];
5735 CC[2+bs*2] -= CC[2+bs*1] * CC[2+bs*1];
5736 CC[3+bs*2] -= CC[3+bs*1] * CC[2+bs*1];
5737 if(CC[2+bs*2]>0)
5738 {
5739 CC[2+bs*2] = sqrt(CC[2+bs*2]);
5740 tmp = 1.0/CC[2+bs*2];
5741 }
5742 else
5743 {
5744 CC[2+bs*2] = 0.0;
5745 tmp = 0.0;
5746 }
5747 CC[3+bs*2] *= tmp;
5748 inv_diag_D[2] = tmp;
5749
5750 CC[3+bs*3] -= CC[3+bs*0] * CC[3+bs*0];
5751 CC[3+bs*3] -= CC[3+bs*1] * CC[3+bs*1];
5752 CC[3+bs*3] -= CC[3+bs*2] * CC[3+bs*2];
5753 if(CC[3+bs*3]>0)
5754 {
5755 CC[3+bs*3] = sqrt(CC[3+bs*3]);
5756 tmp = 1.0/CC[3+bs*3];
5757 }
5758 else
5759 {
5760 CC[3+bs*3] = 0.0;
5761 tmp = 0.0;
5762 }
5763 inv_diag_D[3] = tmp;
5764
5765 D[0+bs*0] = CC[0+bs*0];
5766 D[1+bs*0] = CC[1+bs*0];
5767 D[2+bs*0] = CC[2+bs*0];
5768 D[3+bs*0] = CC[3+bs*0];
5769
5770 D[1+bs*1] = CC[1+bs*1];
5771 D[2+bs*1] = CC[2+bs*1];
5772 D[3+bs*1] = CC[3+bs*1];
5773
5774 D[2+bs*2] = CC[2+bs*2];
5775 D[3+bs*2] = CC[3+bs*2];
5776
5777 D[3+bs*3] = CC[3+bs*3];
5778
5779 return;
5780
5781 }
5782 #endif
5783
5784
5785
5786 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9)
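// Variable-size variant of kernel_dpotrf_nt_l_4x4_lib4: the factorization is
// truncated to kn columns and only km rows / kn columns of D are stored.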
5787 void kernel_dpotrf_nt_l_4x4_vs_lib4(int kmax, double *A, double *B, double *C, double *D, double *inv_diag_D, int km, int kn)
5788 {
5789
5790 const int bs = 4;
5791
5792 double tmp;
5793
5794 #if defined(TARGET_GENERIC)
5795 double CC[16] = {0};
5796 #else
5797 ALIGNED( double CC[16], 64 ) = {0};
5798 #endif
5799
5800 double alpha1 = -1.0;
5801 double beta1 = 1.0;
5802
5803 kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, &beta1, C, CC);
5804
5805 if(CC[0+bs*0]>0)
5806 {
5807 CC[0+bs*0] = sqrt(CC[0+bs*0]);
5808 tmp = 1.0/CC[0+bs*0];
5809 }
5810 else
5811 {
5812 CC[0+bs*0] = 0.0;
5813 tmp = 0.0;
5814 }
5815 CC[1+bs*0] *= tmp;
5816 CC[2+bs*0] *= tmp;
5817 CC[3+bs*0] *= tmp;
5818 inv_diag_D[0] = tmp;
5819
5820 if(kn==1)
5821 goto store;
5822
5823 CC[1+bs*1] -= CC[1+bs*0] * CC[1+bs*0];
5824 CC[2+bs*1] -= CC[2+bs*0] * CC[1+bs*0];
5825 CC[3+bs*1] -= CC[3+bs*0] * CC[1+bs*0];
5826 if(CC[1+bs*1]>0)
5827 {
5828 CC[1+bs*1] = sqrt(CC[1+bs*1]);
5829 tmp = 1.0/CC[1+bs*1];
5830 }
5831 else
5832 {
5833 CC[1+bs*1] = 0.0;
5834 tmp = 0.0;
5835 }
5836 CC[2+bs*1] *= tmp;
5837 CC[3+bs*1] *= tmp;
5838 inv_diag_D[1] = tmp;
5839
5840 if(kn==2)
5841 goto store;
5842
5843 CC[2+bs*2] -= CC[2+bs*0] * CC[2+bs*0];
5844 CC[3+bs*2] -= CC[3+bs*0] * CC[2+bs*0];
5845 CC[2+bs*2] -= CC[2+bs*1] * CC[2+bs*1];
5846 CC[3+bs*2] -= CC[3+bs*1] * CC[2+bs*1];
5847 if(CC[2+bs*2]>0)
5848 {
5849 CC[2+bs*2] = sqrt(CC[2+bs*2]);
5850 tmp = 1.0/CC[2+bs*2];
5851 }
5852 else
5853 {
5854 CC[2+bs*2] = 0.0;
5855 tmp = 0.0;
5856 }
5857 CC[3+bs*2] *= tmp;
5858 inv_diag_D[2] = tmp;
5859
5860 if(kn==3)
5861 goto store;
5862
5863 CC[3+bs*3] -= CC[3+bs*0] * CC[3+bs*0];
5864 CC[3+bs*3] -= CC[3+bs*1] * CC[3+bs*1];
5865 CC[3+bs*3] -= CC[3+bs*2] * CC[3+bs*2];
5866 if(CC[3+bs*3]>0)
5867 {
5868 CC[3+bs*3] = sqrt(CC[3+bs*3]);
5869 tmp = 1.0/CC[3+bs*3];
5870 }
5871 else
5872 {
5873 CC[3+bs*3] = 0.0;
5874 tmp = 0.0;
5875 }
5876 inv_diag_D[3] = tmp;
5877
5878
5879 store:
5880
5881 if(km>=4)
5882 {
5883 D[0+bs*0] = CC[0+bs*0];
5884 D[1+bs*0] = CC[1+bs*0];
5885 D[2+bs*0] = CC[2+bs*0];
5886 D[3+bs*0] = CC[3+bs*0];
5887
5888 if(kn==1)
5889 return;
5890
5891 D[1+bs*1] = CC[1+bs*1];
5892 D[2+bs*1] = CC[2+bs*1];
5893 D[3+bs*1] = CC[3+bs*1];
5894
5895 if(kn==2)
5896 return;
5897
5898 D[2+bs*2] = CC[2+bs*2];
5899 D[3+bs*2] = CC[3+bs*2];
5900
5901 if(kn==3)
5902 return;
5903
5904 D[3+bs*3] = CC[3+bs*3];
5905 }
5906 else if(km>=3)
5907 {
5908 D[0+bs*0] = CC[0+bs*0];
5909 D[1+bs*0] = CC[1+bs*0];
5910 D[2+bs*0] = CC[2+bs*0];
5911
5912 if(kn==1)
5913 return;
5914
5915 D[1+bs*1] = CC[1+bs*1];
5916 D[2+bs*1] = CC[2+bs*1];
5917
5918 if(kn==2)
5919 return;
5920
5921 D[2+bs*2] = CC[2+bs*2];
5922 }
5923 else if(km>=2)
5924 {
5925 D[0+bs*0] = CC[0+bs*0];
5926 D[1+bs*0] = CC[1+bs*0];
5927
5928 if(kn==1)
5929 return;
5930
5931 D[1+bs*1] = CC[1+bs*1];
5932 }
5933 else //if(km>=1)
5934 {
5935 D[0+bs*0] = CC[0+bs*0];
5936 }
5937
5938 return;
5939
5940 }
5941 #endif
5942
5943
5944
5945 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
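// Fused syrk + Cholesky kernel: assuming the standard syrk kernel semantics
// (D = C + Ap * Bp^T on the lower part), this computes
// D = chol(C + Ap * Bp^T - Am * Bm^T), lower factor, by chaining the 4x4 syrk
// and potrf kernels through D.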
5946 void kernel_dsyrk_dpotrf_nt_l_4x4_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D)
5947 {
5948 double alpha = 1.0;
5949 double beta = 1.0;
5950 kernel_dsyrk_nt_l_4x4_lib4(kp, &alpha, Ap, Bp, &beta, C, D);
5951 kernel_dpotrf_nt_l_4x4_lib4(km_, Am, Bm, D, D, inv_diag_D);
5952 }
5953 #endif
5954
5955
5956
5957 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
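// Variable-size variant of the fused syrk + Cholesky kernel (km rows and kn
// columns of the result are computed and stored).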
5958 void kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D, int km, int kn)
5959 {
5960 double alpha = 1.0;
5961 double beta = 1.0;
5962 kernel_dsyrk_nt_l_4x4_vs_lib4(kp, &alpha, Ap, Bp, &beta, C, D, km, kn);
5963 kernel_dpotrf_nt_l_4x4_vs_lib4(km_, Am, Bm, D, D, inv_diag_D, km, kn);
5964 }
5965 #endif
5966
5967
5968
5969 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA)
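// Triangular solve kernel: D = (beta[0]*C - A * B^T) * E^{-T}, with E lower
// triangular; the reciprocals of the diagonal of E are passed in inv_diag_E.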
5970 void kernel_dtrsm_nt_rl_inv_4x4_lib4(int kmax, double *A, double *B, double *beta, double *C, double *D, double *E, double *inv_diag_E)
5971 {
5972
5973 const int bs = 4;
5974
5975 double tmp;
5976
5977 #if defined(TARGET_GENERIC)
5978 double CC[16] = {0};
5979 #else
5980 ALIGNED( double CC[16], 64 ) = {0};
5981 #endif
5982
5983 double alpha1 = -1.0;
5984
5985 kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, C, CC);
5986
5987 tmp = inv_diag_E[0];
5988 CC[0+bs*0] *= tmp;
5989 CC[1+bs*0] *= tmp;
5990 CC[2+bs*0] *= tmp;
5991 CC[3+bs*0] *= tmp;
5992
5993 tmp = E[1+bs*0];
5994 CC[0+bs*1] -= CC[0+bs*0] * tmp;
5995 CC[1+bs*1] -= CC[1+bs*0] * tmp;
5996 CC[2+bs*1] -= CC[2+bs*0] * tmp;
5997 CC[3+bs*1] -= CC[3+bs*0] * tmp;
5998 tmp = inv_diag_E[1];
5999 CC[0+bs*1] *= tmp;
6000 CC[1+bs*1] *= tmp;
6001 CC[2+bs*1] *= tmp;
6002 CC[3+bs*1] *= tmp;
6003
6004 tmp = E[2+bs*0];
6005 CC[0+bs*2] -= CC[0+bs*0] * tmp;
6006 CC[1+bs*2] -= CC[1+bs*0] * tmp;
6007 CC[2+bs*2] -= CC[2+bs*0] * tmp;
6008 CC[3+bs*2] -= CC[3+bs*0] * tmp;
6009 tmp = E[2+bs*1];
6010 CC[0+bs*2] -= CC[0+bs*1] * tmp;
6011 CC[1+bs*2] -= CC[1+bs*1] * tmp;
6012 CC[2+bs*2] -= CC[2+bs*1] * tmp;
6013 CC[3+bs*2] -= CC[3+bs*1] * tmp;
6014 tmp = inv_diag_E[2];
6015 CC[0+bs*2] *= tmp;
6016 CC[1+bs*2] *= tmp;
6017 CC[2+bs*2] *= tmp;
6018 CC[3+bs*2] *= tmp;
6019
6020 tmp = E[3+bs*0];
6021 CC[0+bs*3] -= CC[0+bs*0] * tmp;
6022 CC[1+bs*3] -= CC[1+bs*0] * tmp;
6023 CC[2+bs*3] -= CC[2+bs*0] * tmp;
6024 CC[3+bs*3] -= CC[3+bs*0] * tmp;
6025 tmp = E[3+bs*1];
6026 CC[0+bs*3] -= CC[0+bs*1] * tmp;
6027 CC[1+bs*3] -= CC[1+bs*1] * tmp;
6028 CC[2+bs*3] -= CC[2+bs*1] * tmp;
6029 CC[3+bs*3] -= CC[3+bs*1] * tmp;
6030 tmp = E[3+bs*2];
6031 CC[0+bs*3] -= CC[0+bs*2] * tmp;
6032 CC[1+bs*3] -= CC[1+bs*2] * tmp;
6033 CC[2+bs*3] -= CC[2+bs*2] * tmp;
6034 CC[3+bs*3] -= CC[3+bs*2] * tmp;
6035 tmp = inv_diag_E[3];
6036 CC[0+bs*3] *= tmp;
6037 CC[1+bs*3] *= tmp;
6038 CC[2+bs*3] *= tmp;
6039 CC[3+bs*3] *= tmp;
6040
6041 D[0+bs*0] = CC[0+bs*0];
6042 D[1+bs*0] = CC[1+bs*0];
6043 D[2+bs*0] = CC[2+bs*0];
6044 D[3+bs*0] = CC[3+bs*0];
6045
6046 D[0+bs*1] = CC[0+bs*1];
6047 D[1+bs*1] = CC[1+bs*1];
6048 D[2+bs*1] = CC[2+bs*1];
6049 D[3+bs*1] = CC[3+bs*1];
6050
6051 D[0+bs*2] = CC[0+bs*2];
6052 D[1+bs*2] = CC[1+bs*2];
6053 D[2+bs*2] = CC[2+bs*2];
6054 D[3+bs*2] = CC[3+bs*2];
6055
6056 D[0+bs*3] = CC[0+bs*3];
6057 D[1+bs*3] = CC[1+bs*3];
6058 D[2+bs*3] = CC[2+bs*3];
6059 D[3+bs*3] = CC[3+bs*3];
6060
6061 return;
6062
6063 }
6064 #endif
6065
6066
6067
6068 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9)
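// Variable-size variant of kernel_dtrsm_nt_rl_inv_4x4_lib4: only km rows and
// kn columns of the 4x4 result are computed and stored.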
6069 void kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(int kmax, double *A, double *B, double *beta, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
6070 {
6071
6072 const int bs = 4;
6073
6074 double tmp;
6075
6076 #if defined(TARGET_GENERIC)
6077 double CC[16] = {0};
6078 #else
6079 ALIGNED( double CC[16], 64 ) = {0};
6080 #endif
6081
6082 double alpha1 = -1.0;
6083
6084 kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, C, CC);
6085
6086 tmp = inv_diag_E[0];
6087 CC[0+bs*0] *= tmp;
6088 CC[1+bs*0] *= tmp;
6089 CC[2+bs*0] *= tmp;
6090 CC[3+bs*0] *= tmp;
6091
6092 if(kn==1)
6093 goto store;
6094
6095 tmp = E[1+bs*0];
6096 CC[0+bs*1] -= CC[0+bs*0] * tmp;
6097 CC[1+bs*1] -= CC[1+bs*0] * tmp;
6098 CC[2+bs*1] -= CC[2+bs*0] * tmp;
6099 CC[3+bs*1] -= CC[3+bs*0] * tmp;
6100 tmp = inv_diag_E[1];
6101 CC[0+bs*1] *= tmp;
6102 CC[1+bs*1] *= tmp;
6103 CC[2+bs*1] *= tmp;
6104 CC[3+bs*1] *= tmp;
6105
6106 if(kn==2)
6107 goto store;
6108
6109 tmp = E[2+bs*0];
6110 CC[0+bs*2] -= CC[0+bs*0] * tmp;
6111 CC[1+bs*2] -= CC[1+bs*0] * tmp;
6112 CC[2+bs*2] -= CC[2+bs*0] * tmp;
6113 CC[3+bs*2] -= CC[3+bs*0] * tmp;
6114 tmp = E[2+bs*1];
6115 CC[0+bs*2] -= CC[0+bs*1] * tmp;
6116 CC[1+bs*2] -= CC[1+bs*1] * tmp;
6117 CC[2+bs*2] -= CC[2+bs*1] * tmp;
6118 CC[3+bs*2] -= CC[3+bs*1] * tmp;
6119 tmp = inv_diag_E[2];
6120 CC[0+bs*2] *= tmp;
6121 CC[1+bs*2] *= tmp;
6122 CC[2+bs*2] *= tmp;
6123 CC[3+bs*2] *= tmp;
6124
6125 if(kn==3)
6126 goto store;
6127
6128 tmp = E[3+bs*0];
6129 CC[0+bs*3] -= CC[0+bs*0] * tmp;
6130 CC[1+bs*3] -= CC[1+bs*0] * tmp;
6131 CC[2+bs*3] -= CC[2+bs*0] * tmp;
6132 CC[3+bs*3] -= CC[3+bs*0] * tmp;
6133 tmp = E[3+bs*1];
6134 CC[0+bs*3] -= CC[0+bs*1] * tmp;
6135 CC[1+bs*3] -= CC[1+bs*1] * tmp;
6136 CC[2+bs*3] -= CC[2+bs*1] * tmp;
6137 CC[3+bs*3] -= CC[3+bs*1] * tmp;
6138 tmp = E[3+bs*2];
6139 CC[0+bs*3] -= CC[0+bs*2] * tmp;
6140 CC[1+bs*3] -= CC[1+bs*2] * tmp;
6141 CC[2+bs*3] -= CC[2+bs*2] * tmp;
6142 CC[3+bs*3] -= CC[3+bs*2] * tmp;
6143 tmp = inv_diag_E[3];
6144 CC[0+bs*3] *= tmp;
6145 CC[1+bs*3] *= tmp;
6146 CC[2+bs*3] *= tmp;
6147 CC[3+bs*3] *= tmp;
6148
6149 store:
6150
6151 if(km>=4)
6152 {
6153 D[0+bs*0] = CC[0+bs*0];
6154 D[1+bs*0] = CC[1+bs*0];
6155 D[2+bs*0] = CC[2+bs*0];
6156 D[3+bs*0] = CC[3+bs*0];
6157
6158 if(kn==1)
6159 return;
6160
6161 D[0+bs*1] = CC[0+bs*1];
6162 D[1+bs*1] = CC[1+bs*1];
6163 D[2+bs*1] = CC[2+bs*1];
6164 D[3+bs*1] = CC[3+bs*1];
6165
6166 if(kn==2)
6167 return;
6168
6169 D[0+bs*2] = CC[0+bs*2];
6170 D[1+bs*2] = CC[1+bs*2];
6171 D[2+bs*2] = CC[2+bs*2];
6172 D[3+bs*2] = CC[3+bs*2];
6173
6174 if(kn==3)
6175 return;
6176
6177 D[0+bs*3] = CC[0+bs*3];
6178 D[1+bs*3] = CC[1+bs*3];
6179 D[2+bs*3] = CC[2+bs*3];
6180 D[3+bs*3] = CC[3+bs*3];
6181 }
6182 else if(km>=3)
6183 {
6184 D[0+bs*0] = CC[0+bs*0];
6185 D[1+bs*0] = CC[1+bs*0];
6186 D[2+bs*0] = CC[2+bs*0];
6187
6188 if(kn==1)
6189 return;
6190
6191 D[0+bs*1] = CC[0+bs*1];
6192 D[1+bs*1] = CC[1+bs*1];
6193 D[2+bs*1] = CC[2+bs*1];
6194
6195 if(kn==2)
6196 return;
6197
6198 D[0+bs*2] = CC[0+bs*2];
6199 D[1+bs*2] = CC[1+bs*2];
6200 D[2+bs*2] = CC[2+bs*2];
6201
6202 if(kn==3)
6203 return;
6204
6205 D[0+bs*3] = CC[0+bs*3];
6206 D[1+bs*3] = CC[1+bs*3];
6207 D[2+bs*3] = CC[2+bs*3];
6208 }
6209 else if(km>=2)
6210 {
6211 D[0+bs*0] = CC[0+bs*0];
6212 D[1+bs*0] = CC[1+bs*0];
6213
6214 if(kn==1)
6215 return;
6216
6217 D[0+bs*1] = CC[0+bs*1];
6218 D[1+bs*1] = CC[1+bs*1];
6219
6220 if(kn==2)
6221 return;
6222
6223 D[0+bs*2] = CC[0+bs*2];
6224 D[1+bs*2] = CC[1+bs*2];
6225
6226 if(kn==3)
6227 return;
6228
6229 D[0+bs*3] = CC[0+bs*3];
6230 D[1+bs*3] = CC[1+bs*3];
6231 }
6232 else //if(km>=1)
6233 {
6234 D[0+bs*0] = CC[0+bs*0];
6235
6236 if(kn==1)
6237 return;
6238
6239 D[0+bs*1] = CC[0+bs*1];
6240
6241 if(kn==2)
6242 return;
6243
6244 D[0+bs*2] = CC[0+bs*2];
6245
6246 if(kn==3)
6247 return;
6248
6249 D[0+bs*3] = CC[0+bs*3];
6250 }
6251
6252 return;
6253
6254 }
6255 #endif
6256
6257
6258
6259 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
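// Fused gemm + trsm kernel: D = (C + Ap * Bp^T - Am * Bm^T) * E^{-T}, with E
// lower triangular, obtained by chaining the 4x4 gemm and trsm kernels through D.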
6260 void kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E)
6261 {
6262 double alpha = 1.0;
6263 double beta = 1.0;
6264 kernel_dgemm_nt_4x4_lib4(kp, &alpha, Ap, Bp, &beta, C, D);
6265 kernel_dtrsm_nt_rl_inv_4x4_lib4(km_, Am, Bm, &beta, D, D, E, inv_diag_E);
6266 }
6267 #endif
6268
6269
6270
6271 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
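// Variable-size variant of the fused gemm + trsm kernel (km rows, kn columns).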
6272 void kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
6273 {
6274 double alpha = 1.0;
6275 double beta = 1.0;
6276 kernel_dgemm_nt_4x4_vs_lib4(kp, &alpha, Ap, Bp, &beta, C, D, km, kn);
6277 kernel_dtrsm_nt_rl_inv_4x4_vs_lib4(km_, Am, Bm, &beta, D, D, E, inv_diag_E, km, kn);
6278 }
6279 #endif
6280
6281
6282
6283 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9)
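// Triangular solve kernel: D = (beta[0]*C - A * B^T) * E^{-T}, with E lower
// triangular with unit diagonal, so no scaling by inverse diagonal entries is needed.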
6284 void kernel_dtrsm_nt_rl_one_4x4_lib4(int kmax, double *A, double *B, double *beta, double *C, double *D, double *E)
6285 {
6286
6287 const int bs = 4;
6288
6289 double tmp;
6290
6291 #if defined(TARGET_GENERIC)
6292 double CC[16] = {0};
6293 #else
6294 ALIGNED( double CC[16], 64 ) = {0};
6295 #endif
6296
6297 double alpha1 = -1.0;
6298
6299 kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, C, CC);
6300
6301 tmp = E[1+bs*0];
6302 CC[0+bs*1] -= CC[0+bs*0] * tmp;
6303 CC[1+bs*1] -= CC[1+bs*0] * tmp;
6304 CC[2+bs*1] -= CC[2+bs*0] * tmp;
6305 CC[3+bs*1] -= CC[3+bs*0] * tmp;
6306
6307 tmp = E[2+bs*0];
6308 CC[0+bs*2] -= CC[0+bs*0] * tmp;
6309 CC[1+bs*2] -= CC[1+bs*0] * tmp;
6310 CC[2+bs*2] -= CC[2+bs*0] * tmp;
6311 CC[3+bs*2] -= CC[3+bs*0] * tmp;
6312 tmp = E[2+bs*1];
6313 CC[0+bs*2] -= CC[0+bs*1] * tmp;
6314 CC[1+bs*2] -= CC[1+bs*1] * tmp;
6315 CC[2+bs*2] -= CC[2+bs*1] * tmp;
6316 CC[3+bs*2] -= CC[3+bs*1] * tmp;
6317
6318 tmp = E[3+bs*0];
6319 CC[0+bs*3] -= CC[0+bs*0] * tmp;
6320 CC[1+bs*3] -= CC[1+bs*0] * tmp;
6321 CC[2+bs*3] -= CC[2+bs*0] * tmp;
6322 CC[3+bs*3] -= CC[3+bs*0] * tmp;
6323 tmp = E[3+bs*1];
6324 CC[0+bs*3] -= CC[0+bs*1] * tmp;
6325 CC[1+bs*3] -= CC[1+bs*1] * tmp;
6326 CC[2+bs*3] -= CC[2+bs*1] * tmp;
6327 CC[3+bs*3] -= CC[3+bs*1] * tmp;
6328 tmp = E[3+bs*2];
6329 CC[0+bs*3] -= CC[0+bs*2] * tmp;
6330 CC[1+bs*3] -= CC[1+bs*2] * tmp;
6331 CC[2+bs*3] -= CC[2+bs*2] * tmp;
6332 CC[3+bs*3] -= CC[3+bs*2] * tmp;
6333
6334 D[0+bs*0] = CC[0+bs*0];
6335 D[1+bs*0] = CC[1+bs*0];
6336 D[2+bs*0] = CC[2+bs*0];
6337 D[3+bs*0] = CC[3+bs*0];
6338
6339 D[0+bs*1] = CC[0+bs*1];
6340 D[1+bs*1] = CC[1+bs*1];
6341 D[2+bs*1] = CC[2+bs*1];
6342 D[3+bs*1] = CC[3+bs*1];
6343
6344 D[0+bs*2] = CC[0+bs*2];
6345 D[1+bs*2] = CC[1+bs*2];
6346 D[2+bs*2] = CC[2+bs*2];
6347 D[3+bs*2] = CC[3+bs*2];
6348
6349 D[0+bs*3] = CC[0+bs*3];
6350 D[1+bs*3] = CC[1+bs*3];
6351 D[2+bs*3] = CC[2+bs*3];
6352 D[3+bs*3] = CC[3+bs*3];
6353
6354 return;
6355
6356 }
6357 #endif
6358
6359
6360
6361 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9)
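// Variable-size variant of kernel_dtrsm_nt_rl_one_4x4_lib4 (km rows, kn columns).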
6362 void kernel_dtrsm_nt_rl_one_4x4_vs_lib4(int kmax, double *A, double *B, double *beta, double *C, double *D, double *E, int km, int kn)
6363 {
6364
6365 const int bs = 4;
6366
6367 double tmp;
6368
6369 #if defined(TARGET_GENERIC)
6370 double CC[16] = {0};
6371 #else
6372 ALIGNED( double CC[16], 64 ) = {0};
6373 #endif
6374
6375 double alpha1 = -1.0;
6376
6377 kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, C, CC);
6378
6379 if(kn==1)
6380 goto store;
6381
6382 tmp = E[1+bs*0];
6383 CC[0+bs*1] -= CC[0+bs*0] * tmp;
6384 CC[1+bs*1] -= CC[1+bs*0] * tmp;
6385 CC[2+bs*1] -= CC[2+bs*0] * tmp;
6386 CC[3+bs*1] -= CC[3+bs*0] * tmp;
6387
6388 if(kn==2)
6389 goto store;
6390
6391 tmp = E[2+bs*0];
6392 CC[0+bs*2] -= CC[0+bs*0] * tmp;
6393 CC[1+bs*2] -= CC[1+bs*0] * tmp;
6394 CC[2+bs*2] -= CC[2+bs*0] * tmp;
6395 CC[3+bs*2] -= CC[3+bs*0] * tmp;
6396 tmp = E[2+bs*1];
6397 CC[0+bs*2] -= CC[0+bs*1] * tmp;
6398 CC[1+bs*2] -= CC[1+bs*1] * tmp;
6399 CC[2+bs*2] -= CC[2+bs*1] * tmp;
6400 CC[3+bs*2] -= CC[3+bs*1] * tmp;
6401
6402 if(kn==3)
6403 goto store;
6404
6405 tmp = E[3+bs*0];
6406 CC[0+bs*3] -= CC[0+bs*0] * tmp;
6407 CC[1+bs*3] -= CC[1+bs*0] * tmp;
6408 CC[2+bs*3] -= CC[2+bs*0] * tmp;
6409 CC[3+bs*3] -= CC[3+bs*0] * tmp;
6410 tmp = E[3+bs*1];
6411 CC[0+bs*3] -= CC[0+bs*1] * tmp;
6412 CC[1+bs*3] -= CC[1+bs*1] * tmp;
6413 CC[2+bs*3] -= CC[2+bs*1] * tmp;
6414 CC[3+bs*3] -= CC[3+bs*1] * tmp;
6415 tmp = E[3+bs*2];
6416 CC[0+bs*3] -= CC[0+bs*2] * tmp;
6417 CC[1+bs*3] -= CC[1+bs*2] * tmp;
6418 CC[2+bs*3] -= CC[2+bs*2] * tmp;
6419 CC[3+bs*3] -= CC[3+bs*2] * tmp;
6420
6421 store:
6422
6423 if(km>=4)
6424 {
6425 D[0+bs*0] = CC[0+bs*0];
6426 D[1+bs*0] = CC[1+bs*0];
6427 D[2+bs*0] = CC[2+bs*0];
6428 D[3+bs*0] = CC[3+bs*0];
6429
6430 if(kn==1)
6431 return;
6432
6433 D[0+bs*1] = CC[0+bs*1];
6434 D[1+bs*1] = CC[1+bs*1];
6435 D[2+bs*1] = CC[2+bs*1];
6436 D[3+bs*1] = CC[3+bs*1];
6437
6438 if(kn==2)
6439 return;
6440
6441 D[0+bs*2] = CC[0+bs*2];
6442 D[1+bs*2] = CC[1+bs*2];
6443 D[2+bs*2] = CC[2+bs*2];
6444 D[3+bs*2] = CC[3+bs*2];
6445
6446 if(kn==3)
6447 return;
6448
6449 D[0+bs*3] = CC[0+bs*3];
6450 D[1+bs*3] = CC[1+bs*3];
6451 D[2+bs*3] = CC[2+bs*3];
6452 D[3+bs*3] = CC[3+bs*3];
6453 }
6454 else if(km>=3)
6455 {
6456 D[0+bs*0] = CC[0+bs*0];
6457 D[1+bs*0] = CC[1+bs*0];
6458 D[2+bs*0] = CC[2+bs*0];
6459
6460 if(kn==1)
6461 return;
6462
6463 D[0+bs*1] = CC[0+bs*1];
6464 D[1+bs*1] = CC[1+bs*1];
6465 D[2+bs*1] = CC[2+bs*1];
6466
6467 if(kn==2)
6468 return;
6469
6470 D[0+bs*2] = CC[0+bs*2];
6471 D[1+bs*2] = CC[1+bs*2];
6472 D[2+bs*2] = CC[2+bs*2];
6473
6474 if(kn==3)
6475 return;
6476
6477 D[0+bs*3] = CC[0+bs*3];
6478 D[1+bs*3] = CC[1+bs*3];
6479 D[2+bs*3] = CC[2+bs*3];
6480 }
6481 else if(km>=2)
6482 {
6483 D[0+bs*0] = CC[0+bs*0];
6484 D[1+bs*0] = CC[1+bs*0];
6485
6486 if(kn==1)
6487 return;
6488
6489 D[0+bs*1] = CC[0+bs*1];
6490 D[1+bs*1] = CC[1+bs*1];
6491
6492 if(kn==2)
6493 return;
6494
6495 D[0+bs*2] = CC[0+bs*2];
6496 D[1+bs*2] = CC[1+bs*2];
6497
6498 if(kn==3)
6499 return;
6500
6501 D[0+bs*3] = CC[0+bs*3];
6502 D[1+bs*3] = CC[1+bs*3];
6503 }
6504 else //if(km>=1)
6505 {
6506 D[0+bs*0] = CC[0+bs*0];
6507
6508 if(kn==1)
6509 return;
6510
6511 D[0+bs*1] = CC[0+bs*1];
6512
6513 if(kn==2)
6514 return;
6515
6516 D[0+bs*2] = CC[0+bs*2];
6517
6518 if(kn==3)
6519 return;
6520
6521 D[0+bs*3] = CC[0+bs*3];
6522 }
6523
6524 return;
6525
6526 }
6527 #endif
6528
6529
6530
6531 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
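// Triangular solve kernel: D = (beta[0]*C - A * B^T) * E^{-T}, with E upper
// triangular; the reciprocals of the diagonal of E are passed in inv_diag_E.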
6532 void kernel_dtrsm_nt_ru_inv_4x4_lib4(int kmax, double *A, double *B, double *beta, double *C, double *D, double *E, double *inv_diag_E)
6533 {
6534
6535 const int bs = 4;
6536
6537 double tmp;
6538
6539 #if defined(TARGET_GENERIC)
6540 double CC[16] = {0};
6541 #else
6542 ALIGNED( double CC[16], 64 ) = {0};
6543 #endif
6544
6545 double alpha1 = -1.0;
6546
6547 kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, C, CC);
6548
6549 tmp = inv_diag_E[3];
6550 CC[0+bs*3] *= tmp;
6551 CC[1+bs*3] *= tmp;
6552 CC[2+bs*3] *= tmp;
6553 CC[3+bs*3] *= tmp;
6554 tmp = E[2+bs*3];
6555 CC[0+bs*2] -= CC[0+bs*3] * tmp;
6556 CC[1+bs*2] -= CC[1+bs*3] * tmp;
6557 CC[2+bs*2] -= CC[2+bs*3] * tmp;
6558 CC[3+bs*2] -= CC[3+bs*3] * tmp;
6559 tmp = E[1+bs*3];
6560 CC[0+bs*1] -= CC[0+bs*3] * tmp;
6561 CC[1+bs*1] -= CC[1+bs*3] * tmp;
6562 CC[2+bs*1] -= CC[2+bs*3] * tmp;
6563 CC[3+bs*1] -= CC[3+bs*3] * tmp;
6564 tmp = E[0+bs*3];
6565 CC[0+bs*0] -= CC[0+bs*3] * tmp;
6566 CC[1+bs*0] -= CC[1+bs*3] * tmp;
6567 CC[2+bs*0] -= CC[2+bs*3] * tmp;
6568 CC[3+bs*0] -= CC[3+bs*3] * tmp;
6569
6570 tmp = inv_diag_E[2];
6571 CC[0+bs*2] *= tmp;
6572 CC[1+bs*2] *= tmp;
6573 CC[2+bs*2] *= tmp;
6574 CC[3+bs*2] *= tmp;
6575 tmp = E[1+bs*2];
6576 CC[0+bs*1] -= CC[0+bs*2] * tmp;
6577 CC[1+bs*1] -= CC[1+bs*2] * tmp;
6578 CC[2+bs*1] -= CC[2+bs*2] * tmp;
6579 CC[3+bs*1] -= CC[3+bs*2] * tmp;
6580 tmp = E[0+bs*2];
6581 CC[0+bs*0] -= CC[0+bs*2] * tmp;
6582 CC[1+bs*0] -= CC[1+bs*2] * tmp;
6583 CC[2+bs*0] -= CC[2+bs*2] * tmp;
6584 CC[3+bs*0] -= CC[3+bs*2] * tmp;
6585
6586 tmp = inv_diag_E[1];
6587 CC[0+bs*1] *= tmp;
6588 CC[1+bs*1] *= tmp;
6589 CC[2+bs*1] *= tmp;
6590 CC[3+bs*1] *= tmp;
6591 tmp = E[0+bs*1];
6592 CC[0+bs*0] -= CC[0+bs*1] * tmp;
6593 CC[1+bs*0] -= CC[1+bs*1] * tmp;
6594 CC[2+bs*0] -= CC[2+bs*1] * tmp;
6595 CC[3+bs*0] -= CC[3+bs*1] * tmp;
6596
6597 tmp = inv_diag_E[0];
6598 CC[0+bs*0] *= tmp;
6599 CC[1+bs*0] *= tmp;
6600 CC[2+bs*0] *= tmp;
6601 CC[3+bs*0] *= tmp;
6602
6603 D[0+bs*0] = CC[0+bs*0];
6604 D[1+bs*0] = CC[1+bs*0];
6605 D[2+bs*0] = CC[2+bs*0];
6606 D[3+bs*0] = CC[3+bs*0];
6607
6608 D[0+bs*1] = CC[0+bs*1];
6609 D[1+bs*1] = CC[1+bs*1];
6610 D[2+bs*1] = CC[2+bs*1];
6611 D[3+bs*1] = CC[3+bs*1];
6612
6613 D[0+bs*2] = CC[0+bs*2];
6614 D[1+bs*2] = CC[1+bs*2];
6615 D[2+bs*2] = CC[2+bs*2];
6616 D[3+bs*2] = CC[3+bs*2];
6617
6618 D[0+bs*3] = CC[0+bs*3];
6619 D[1+bs*3] = CC[1+bs*3];
6620 D[2+bs*3] = CC[2+bs*3];
6621 D[3+bs*3] = CC[3+bs*3];
6622
6623 return;
6624
6625 }
6626 #endif
6627
6628
6629
6630 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
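// Variable-size variant of kernel_dtrsm_nt_ru_inv_4x4_lib4: only km rows and
// kn columns of the 4x4 result are computed and stored.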
6631 void kernel_dtrsm_nt_ru_inv_4x4_vs_lib4(int kmax, double *A, double *B, double *beta, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
6632 {
6633
6634 const int bs = 4;
6635
6636 double tmp;
6637
6638 #if defined(TARGET_GENERIC)
6639 double CC[16] = {0};
6640 #else
6641 ALIGNED( double CC[16], 64 ) = {0};
6642 #endif
6643
6644 double alpha1 = -1.0;
6645
6646 kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, C, CC);
6647
6648 if(kn>3)
6649 {
6650 tmp = inv_diag_E[3];
6651 CC[0+bs*3] *= tmp;
6652 CC[1+bs*3] *= tmp;
6653 CC[2+bs*3] *= tmp;
6654 CC[3+bs*3] *= tmp;
6655 tmp = E[2+bs*3];
6656 CC[0+bs*2] -= CC[0+bs*3] * tmp;
6657 CC[1+bs*2] -= CC[1+bs*3] * tmp;
6658 CC[2+bs*2] -= CC[2+bs*3] * tmp;
6659 CC[3+bs*2] -= CC[3+bs*3] * tmp;
6660 tmp = E[1+bs*3];
6661 CC[0+bs*1] -= CC[0+bs*3] * tmp;
6662 CC[1+bs*1] -= CC[1+bs*3] * tmp;
6663 CC[2+bs*1] -= CC[2+bs*3] * tmp;
6664 CC[3+bs*1] -= CC[3+bs*3] * tmp;
6665 tmp = E[0+bs*3];
6666 CC[0+bs*0] -= CC[0+bs*3] * tmp;
6667 CC[1+bs*0] -= CC[1+bs*3] * tmp;
6668 CC[2+bs*0] -= CC[2+bs*3] * tmp;
6669 CC[3+bs*0] -= CC[3+bs*3] * tmp;
6670 }
6671
6672 if(kn>2)
6673 {
6674 tmp = inv_diag_E[2];
6675 CC[0+bs*2] *= tmp;
6676 CC[1+bs*2] *= tmp;
6677 CC[2+bs*2] *= tmp;
6678 CC[3+bs*2] *= tmp;
6679 tmp = E[1+bs*2];
6680 CC[0+bs*1] -= CC[0+bs*2] * tmp;
6681 CC[1+bs*1] -= CC[1+bs*2] * tmp;
6682 CC[2+bs*1] -= CC[2+bs*2] * tmp;
6683 CC[3+bs*1] -= CC[3+bs*2] * tmp;
6684 tmp = E[0+bs*2];
6685 CC[0+bs*0] -= CC[0+bs*2] * tmp;
6686 CC[1+bs*0] -= CC[1+bs*2] * tmp;
6687 CC[2+bs*0] -= CC[2+bs*2] * tmp;
6688 CC[3+bs*0] -= CC[3+bs*2] * tmp;
6689 }
6690
6691 if(kn>1)
6692 {
6693 tmp = inv_diag_E[1];
6694 CC[0+bs*1] *= tmp;
6695 CC[1+bs*1] *= tmp;
6696 CC[2+bs*1] *= tmp;
6697 CC[3+bs*1] *= tmp;
6698 tmp = E[0+bs*1];
6699 CC[0+bs*0] -= CC[0+bs*1] * tmp;
6700 CC[1+bs*0] -= CC[1+bs*1] * tmp;
6701 CC[2+bs*0] -= CC[2+bs*1] * tmp;
6702 CC[3+bs*0] -= CC[3+bs*1] * tmp;
6703 }
6704
6705 tmp = inv_diag_E[0];
6706 CC[0+bs*0] *= tmp;
6707 CC[1+bs*0] *= tmp;
6708 CC[2+bs*0] *= tmp;
6709 CC[3+bs*0] *= tmp;
6710
6711
6712 store:
6713
6714 if(km>=4)
6715 {
6716 D[0+bs*0] = CC[0+bs*0];
6717 D[1+bs*0] = CC[1+bs*0];
6718 D[2+bs*0] = CC[2+bs*0];
6719 D[3+bs*0] = CC[3+bs*0];
6720
6721 if(kn==1)
6722 return;
6723
6724 D[0+bs*1] = CC[0+bs*1];
6725 D[1+bs*1] = CC[1+bs*1];
6726 D[2+bs*1] = CC[2+bs*1];
6727 D[3+bs*1] = CC[3+bs*1];
6728
6729 if(kn==2)
6730 return;
6731
6732 D[0+bs*2] = CC[0+bs*2];
6733 D[1+bs*2] = CC[1+bs*2];
6734 D[2+bs*2] = CC[2+bs*2];
6735 D[3+bs*2] = CC[3+bs*2];
6736
6737 if(kn==3)
6738 return;
6739
6740 D[0+bs*3] = CC[0+bs*3];
6741 D[1+bs*3] = CC[1+bs*3];
6742 D[2+bs*3] = CC[2+bs*3];
6743 D[3+bs*3] = CC[3+bs*3];
6744 }
6745 else if(km>=3)
6746 {
6747 D[0+bs*0] = CC[0+bs*0];
6748 D[1+bs*0] = CC[1+bs*0];
6749 D[2+bs*0] = CC[2+bs*0];
6750
6751 if(kn==1)
6752 return;
6753
6754 D[0+bs*1] = CC[0+bs*1];
6755 D[1+bs*1] = CC[1+bs*1];
6756 D[2+bs*1] = CC[2+bs*1];
6757
6758 if(kn==2)
6759 return;
6760
6761 D[0+bs*2] = CC[0+bs*2];
6762 D[1+bs*2] = CC[1+bs*2];
6763 D[2+bs*2] = CC[2+bs*2];
6764
6765 if(kn==3)
6766 return;
6767
6768 D[0+bs*3] = CC[0+bs*3];
6769 D[1+bs*3] = CC[1+bs*3];
6770 D[2+bs*3] = CC[2+bs*3];
6771 }
6772 else if(km>=2)
6773 {
6774 D[0+bs*0] = CC[0+bs*0];
6775 D[1+bs*0] = CC[1+bs*0];
6776
6777 if(kn==1)
6778 return;
6779
6780 D[0+bs*1] = CC[0+bs*1];
6781 D[1+bs*1] = CC[1+bs*1];
6782
6783 if(kn==2)
6784 return;
6785
6786 D[0+bs*2] = CC[0+bs*2];
6787 D[1+bs*2] = CC[1+bs*2];
6788
6789 if(kn==3)
6790 return;
6791
6792 D[0+bs*3] = CC[0+bs*3];
6793 D[1+bs*3] = CC[1+bs*3];
6794 }
6795 else //if(km>=1)
6796 {
6797 D[0+bs*0] = CC[0+bs*0];
6798
6799 if(kn==1)
6800 return;
6801
6802 D[0+bs*1] = CC[0+bs*1];
6803
6804 if(kn==2)
6805 return;
6806
6807 D[0+bs*2] = CC[0+bs*2];
6808
6809 if(kn==3)
6810 return;
6811
6812 D[0+bs*3] = CC[0+bs*3];
6813 }
6814
6815 return;
6816
6817 }
6818 #endif
6819
6820
6821
6822 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
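// Triangular solve kernel: D = (beta[0]*C - A * B^T) * E^{-T}, with E upper
// triangular with unit diagonal, so no scaling by inverse diagonal entries is needed.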
6823 void kernel_dtrsm_nt_ru_one_4x4_lib4(int kmax, double *A, double *B, double *beta, double *C, double *D, double *E)
6824 {
6825
6826 const int bs = 4;
6827
6828 double tmp;
6829
6830 #if defined(TARGET_GENERIC)
6831 double CC[16] = {0};
6832 #else
6833 ALIGNED( double CC[16], 64 ) = {0};
6834 #endif
6835
6836 double alpha1 = -1.0;
6837
6838 kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, C, CC);
6839
6840 tmp = E[2+bs*3];
6841 CC[0+bs*2] -= CC[0+bs*3] * tmp;
6842 CC[1+bs*2] -= CC[1+bs*3] * tmp;
6843 CC[2+bs*2] -= CC[2+bs*3] * tmp;
6844 CC[3+bs*2] -= CC[3+bs*3] * tmp;
6845 tmp = E[1+bs*3];
6846 CC[0+bs*1] -= CC[0+bs*3] * tmp;
6847 CC[1+bs*1] -= CC[1+bs*3] * tmp;
6848 CC[2+bs*1] -= CC[2+bs*3] * tmp;
6849 CC[3+bs*1] -= CC[3+bs*3] * tmp;
6850 tmp = E[0+bs*3];
6851 CC[0+bs*0] -= CC[0+bs*3] * tmp;
6852 CC[1+bs*0] -= CC[1+bs*3] * tmp;
6853 CC[2+bs*0] -= CC[2+bs*3] * tmp;
6854 CC[3+bs*0] -= CC[3+bs*3] * tmp;
6855
6856 tmp = E[1+bs*2];
6857 CC[0+bs*1] -= CC[0+bs*2] * tmp;
6858 CC[1+bs*1] -= CC[1+bs*2] * tmp;
6859 CC[2+bs*1] -= CC[2+bs*2] * tmp;
6860 CC[3+bs*1] -= CC[3+bs*2] * tmp;
6861 tmp = E[0+bs*2];
6862 CC[0+bs*0] -= CC[0+bs*2] * tmp;
6863 CC[1+bs*0] -= CC[1+bs*2] * tmp;
6864 CC[2+bs*0] -= CC[2+bs*2] * tmp;
6865 CC[3+bs*0] -= CC[3+bs*2] * tmp;
6866
6867 tmp = E[0+bs*1];
6868 CC[0+bs*0] -= CC[0+bs*1] * tmp;
6869 CC[1+bs*0] -= CC[1+bs*1] * tmp;
6870 CC[2+bs*0] -= CC[2+bs*1] * tmp;
6871 CC[3+bs*0] -= CC[3+bs*1] * tmp;
6872
6873
6874 D[0+bs*0] = CC[0+bs*0];
6875 D[1+bs*0] = CC[1+bs*0];
6876 D[2+bs*0] = CC[2+bs*0];
6877 D[3+bs*0] = CC[3+bs*0];
6878
6879 D[0+bs*1] = CC[0+bs*1];
6880 D[1+bs*1] = CC[1+bs*1];
6881 D[2+bs*1] = CC[2+bs*1];
6882 D[3+bs*1] = CC[3+bs*1];
6883
6884 D[0+bs*2] = CC[0+bs*2];
6885 D[1+bs*2] = CC[1+bs*2];
6886 D[2+bs*2] = CC[2+bs*2];
6887 D[3+bs*2] = CC[3+bs*2];
6888
6889 D[0+bs*3] = CC[0+bs*3];
6890 D[1+bs*3] = CC[1+bs*3];
6891 D[2+bs*3] = CC[2+bs*3];
6892 D[3+bs*3] = CC[3+bs*3];
6893
6894 return;
6895
6896 }
6897 #endif
6898
6899
6900
6901 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
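// Variable-size variant of kernel_dtrsm_nt_ru_one_4x4_lib4 (km rows, kn columns).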
6902 void kernel_dtrsm_nt_ru_one_4x4_vs_lib4(int kmax, double *A, double *B, double *beta, double *C, double *D, double *E, int km, int kn)
6903 {
6904
6905 const int bs = 4;
6906
6907 double tmp;
6908
6909 #if defined(TARGET_GENERIC)
6910 double CC[16] = {0};
6911 #else
6912 ALIGNED( double CC[16], 64 ) = {0};
6913 #endif
6914
6915 double alpha1 = -1.0;
6916
6917 kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, C, CC);
6918
6919 if(kn>3)
6920 {
6921 tmp = E[2+bs*3];
6922 CC[0+bs*2] -= CC[0+bs*3] * tmp;
6923 CC[1+bs*2] -= CC[1+bs*3] * tmp;
6924 CC[2+bs*2] -= CC[2+bs*3] * tmp;
6925 CC[3+bs*2] -= CC[3+bs*3] * tmp;
6926 tmp = E[1+bs*3];
6927 CC[0+bs*1] -= CC[0+bs*3] * tmp;
6928 CC[1+bs*1] -= CC[1+bs*3] * tmp;
6929 CC[2+bs*1] -= CC[2+bs*3] * tmp;
6930 CC[3+bs*1] -= CC[3+bs*3] * tmp;
6931 tmp = E[0+bs*3];
6932 CC[0+bs*0] -= CC[0+bs*3] * tmp;
6933 CC[1+bs*0] -= CC[1+bs*3] * tmp;
6934 CC[2+bs*0] -= CC[2+bs*3] * tmp;
6935 CC[3+bs*0] -= CC[3+bs*3] * tmp;
6936 }
6937
6938 if(kn>2)
6939 {
6940 tmp = E[1+bs*2];
6941 CC[0+bs*1] -= CC[0+bs*2] * tmp;
6942 CC[1+bs*1] -= CC[1+bs*2] * tmp;
6943 CC[2+bs*1] -= CC[2+bs*2] * tmp;
6944 CC[3+bs*1] -= CC[3+bs*2] * tmp;
6945 tmp = E[0+bs*2];
6946 CC[0+bs*0] -= CC[0+bs*2] * tmp;
6947 CC[1+bs*0] -= CC[1+bs*2] * tmp;
6948 CC[2+bs*0] -= CC[2+bs*2] * tmp;
6949 CC[3+bs*0] -= CC[3+bs*2] * tmp;
6950 }
6951
6952 if(kn>1)
6953 {
6954 tmp = E[0+bs*1];
6955 CC[0+bs*0] -= CC[0+bs*1] * tmp;
6956 CC[1+bs*0] -= CC[1+bs*1] * tmp;
6957 CC[2+bs*0] -= CC[2+bs*1] * tmp;
6958 CC[3+bs*0] -= CC[3+bs*1] * tmp;
6959 }
6960
6961
6962 store:
6963
6964 if(km>=4)
6965 {
6966 D[0+bs*0] = CC[0+bs*0];
6967 D[1+bs*0] = CC[1+bs*0];
6968 D[2+bs*0] = CC[2+bs*0];
6969 D[3+bs*0] = CC[3+bs*0];
6970
6971 if(kn==1)
6972 return;
6973
6974 D[0+bs*1] = CC[0+bs*1];
6975 D[1+bs*1] = CC[1+bs*1];
6976 D[2+bs*1] = CC[2+bs*1];
6977 D[3+bs*1] = CC[3+bs*1];
6978
6979 if(kn==2)
6980 return;
6981
6982 D[0+bs*2] = CC[0+bs*2];
6983 D[1+bs*2] = CC[1+bs*2];
6984 D[2+bs*2] = CC[2+bs*2];
6985 D[3+bs*2] = CC[3+bs*2];
6986
6987 if(kn==3)
6988 return;
6989
6990 D[0+bs*3] = CC[0+bs*3];
6991 D[1+bs*3] = CC[1+bs*3];
6992 D[2+bs*3] = CC[2+bs*3];
6993 D[3+bs*3] = CC[3+bs*3];
6994 }
6995 else if(km>=3)
6996 {
6997 D[0+bs*0] = CC[0+bs*0];
6998 D[1+bs*0] = CC[1+bs*0];
6999 D[2+bs*0] = CC[2+bs*0];
7000
7001 if(kn==1)
7002 return;
7003
7004 D[0+bs*1] = CC[0+bs*1];
7005 D[1+bs*1] = CC[1+bs*1];
7006 D[2+bs*1] = CC[2+bs*1];
7007
7008 if(kn==2)
7009 return;
7010
7011 D[0+bs*2] = CC[0+bs*2];
7012 D[1+bs*2] = CC[1+bs*2];
7013 D[2+bs*2] = CC[2+bs*2];
7014
7015 if(kn==3)
7016 return;
7017
7018 D[0+bs*3] = CC[0+bs*3];
7019 D[1+bs*3] = CC[1+bs*3];
7020 D[2+bs*3] = CC[2+bs*3];
7021 }
7022 else if(km>=2)
7023 {
7024 D[0+bs*0] = CC[0+bs*0];
7025 D[1+bs*0] = CC[1+bs*0];
7026
7027 if(kn==1)
7028 return;
7029
7030 D[0+bs*1] = CC[0+bs*1];
7031 D[1+bs*1] = CC[1+bs*1];
7032
7033 if(kn==2)
7034 return;
7035
7036 D[0+bs*2] = CC[0+bs*2];
7037 D[1+bs*2] = CC[1+bs*2];
7038
7039 if(kn==3)
7040 return;
7041
7042 D[0+bs*3] = CC[0+bs*3];
7043 D[1+bs*3] = CC[1+bs*3];
7044 }
7045 else //if(km>=1)
7046 {
7047 D[0+bs*0] = CC[0+bs*0];
7048
7049 if(kn==1)
7050 return;
7051
7052 D[0+bs*1] = CC[0+bs*1];
7053
7054 if(kn==2)
7055 return;
7056
7057 D[0+bs*2] = CC[0+bs*2];
7058
7059 if(kn==3)
7060 return;
7061
7062 D[0+bs*3] = CC[0+bs*3];
7063 }
7064
7065 return;
7066
7067 }
7068 #endif
7069
7070
7071
7072 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
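// LU factorization without pivoting of a 4x4 block: D = lu(C - A * B), with B
// stored in the lib4 panel-major format (panel stride sdb). The unit-lower and
// upper factors are stored packed in D; the reciprocals of the diagonal of U
// are returned in inv_diag_D.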
7073 void kernel_dgetrf_nn_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D)
7074 {
7075
7076 const int bs = 4;
7077
7078 int k;
7079
7080 double tmp;
7081
7082 #if defined(TARGET_GENERIC)
7083 double CC[16] = {0};
7084 #else
7085 ALIGNED( double CC[16], 64 ) = {0};
7086 #endif
7087
7088 double alpha1 = -1.0;
7089 double beta1 = 1.0;
7090
7091 kernel_dgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, &beta1, C, CC);
7092
7093 // factorization
7094
7095 // first column
7096 tmp = 1.0 / CC[0+bs*0];
7097 CC[1+bs*0] *= tmp;
7098 CC[2+bs*0] *= tmp;
7099 CC[3+bs*0] *= tmp;
7100
7101 inv_diag_D[0] = tmp;
7102
7103 // second column
7104 CC[1+bs*1] -= CC[1+bs*0] * CC[0+bs*1];
7105 CC[2+bs*1] -= CC[2+bs*0] * CC[0+bs*1];
7106 CC[3+bs*1] -= CC[3+bs*0] * CC[0+bs*1];
7107
7108 tmp = 1.0 / CC[1+bs*1];
7109 CC[2+bs*1] *= tmp;
7110 CC[3+bs*1] *= tmp;
7111
7112 inv_diag_D[1] = tmp;
7113
7114 // third column
7115 CC[1+bs*2] -= CC[1+bs*0] * CC[0+bs*2];
7116 CC[2+bs*2] -= CC[2+bs*0] * CC[0+bs*2];
7117 CC[3+bs*2] -= CC[3+bs*0] * CC[0+bs*2];
7118
7119 CC[2+bs*2] -= CC[2+bs*1] * CC[1+bs*2];
7120 CC[3+bs*2] -= CC[3+bs*1] * CC[1+bs*2];
7121
7122 tmp = 1.0 / CC[2+bs*2];
7123 CC[3+bs*2] *= tmp;
7124
7125 inv_diag_D[2] = tmp;
7126
7127 // fourth column
7128 CC[1+bs*3] -= CC[1+bs*0] * CC[0+bs*3];
7129 CC[2+bs*3] -= CC[2+bs*0] * CC[0+bs*3];
7130 CC[3+bs*3] -= CC[3+bs*0] * CC[0+bs*3];
7131
7132 CC[2+bs*3] -= CC[2+bs*1] * CC[1+bs*3];
7133 CC[3+bs*3] -= CC[3+bs*1] * CC[1+bs*3];
7134
7135 CC[3+bs*3] -= CC[3+bs*2] * CC[2+bs*3];
7136
7137 tmp = 1.0 / CC[3+bs*3];
7138
7139 inv_diag_D[3] = tmp;
7140
7141 D[0+bs*0] = CC[0+bs*0];
7142 D[1+bs*0] = CC[1+bs*0];
7143 D[2+bs*0] = CC[2+bs*0];
7144 D[3+bs*0] = CC[3+bs*0];
7145
7146 D[0+bs*1] = CC[0+bs*1];
7147 D[1+bs*1] = CC[1+bs*1];
7148 D[2+bs*1] = CC[2+bs*1];
7149 D[3+bs*1] = CC[3+bs*1];
7150
7151 D[0+bs*2] = CC[0+bs*2];
7152 D[1+bs*2] = CC[1+bs*2];
7153 D[2+bs*2] = CC[2+bs*2];
7154 D[3+bs*2] = CC[3+bs*2];
7155
7156 D[0+bs*3] = CC[0+bs*3];
7157 D[1+bs*3] = CC[1+bs*3];
7158 D[2+bs*3] = CC[2+bs*3];
7159 D[3+bs*3] = CC[3+bs*3];
7160
7161 return;
7162
7163 }
7164 #endif
7165
7166
7167
7168 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
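// Variable-size variant of kernel_dgetrf_nn_4x4_lib4: the factorization is
// truncated to kn columns and only km rows / kn columns of D are stored.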
7169 void kernel_dgetrf_nn_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *inv_diag_D, int km, int kn)
7170 {
7171
7172 const int bs = 4;
7173
7174 int k;
7175
7176 double tmp;
7177
7178 #if defined(TARGET_GENERIC)
7179 double CC[16] = {0};
7180 #else
7181 ALIGNED( double CC[16], 64 ) = {0};
7182 #endif
7183
7184 double alpha1 = -1.0;
7185 double beta1 = 1.0;
7186
7187 kernel_dgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, &beta1, C, CC);
7188
7189 // factorization
7190
7191 // first column
7192 tmp = 1.0 / CC[0+bs*0];
7193 CC[1+bs*0] *= tmp;
7194 CC[2+bs*0] *= tmp;
7195 CC[3+bs*0] *= tmp;
7196
7197 inv_diag_D[0] = tmp;
7198
7199 if(kn==1)
7200 goto store;
7201
7202 // second column
7203 CC[1+bs*1] -= CC[1+bs*0] * CC[0+bs*1];
7204 CC[2+bs*1] -= CC[2+bs*0] * CC[0+bs*1];
7205 CC[3+bs*1] -= CC[3+bs*0] * CC[0+bs*1];
7206
7207 tmp = 1.0 / CC[1+bs*1];
7208 CC[2+bs*1] *= tmp;
7209 CC[3+bs*1] *= tmp;
7210
7211 inv_diag_D[1] = tmp;
7212
7213 if(kn==2)
7214 goto store;
7215
7216 // third column
7217 CC[1+bs*2] -= CC[1+bs*0] * CC[0+bs*2];
7218 CC[2+bs*2] -= CC[2+bs*0] * CC[0+bs*2];
7219 CC[3+bs*2] -= CC[3+bs*0] * CC[0+bs*2];
7220
7221 CC[2+bs*2] -= CC[2+bs*1] * CC[1+bs*2];
7222 CC[3+bs*2] -= CC[3+bs*1] * CC[1+bs*2];
7223
7224 tmp = 1.0 / CC[2+bs*2];
7225 CC[3+bs*2] *= tmp;
7226
7227 inv_diag_D[2] = tmp;
7228
7229 if(kn==3)
7230 goto store;
7231
7232 // fourth column
7233 CC[1+bs*3] -= CC[1+bs*0] * CC[0+bs*3];
7234 CC[2+bs*3] -= CC[2+bs*0] * CC[0+bs*3];
7235 CC[3+bs*3] -= CC[3+bs*0] * CC[0+bs*3];
7236
7237 CC[2+bs*3] -= CC[2+bs*1] * CC[1+bs*3];
7238 CC[3+bs*3] -= CC[3+bs*1] * CC[1+bs*3];
7239
7240 CC[3+bs*3] -= CC[3+bs*2] * CC[2+bs*3];
7241
7242 tmp = 1.0 / CC[3+bs*3];
7243
7244 inv_diag_D[3] = tmp;
7245
7246 store:
7247
7248 if(km>=4)
7249 {
7250 D[0+bs*0] = CC[0+bs*0];
7251 D[1+bs*0] = CC[1+bs*0];
7252 D[2+bs*0] = CC[2+bs*0];
7253 D[3+bs*0] = CC[3+bs*0];
7254
7255 if(kn==1)
7256 return;
7257
7258 D[0+bs*1] = CC[0+bs*1];
7259 D[1+bs*1] = CC[1+bs*1];
7260 D[2+bs*1] = CC[2+bs*1];
7261 D[3+bs*1] = CC[3+bs*1];
7262
7263 if(kn==2)
7264 return;
7265
7266 D[0+bs*2] = CC[0+bs*2];
7267 D[1+bs*2] = CC[1+bs*2];
7268 D[2+bs*2] = CC[2+bs*2];
7269 D[3+bs*2] = CC[3+bs*2];
7270
7271 if(kn==3)
7272 return;
7273
7274 D[0+bs*3] = CC[0+bs*3];
7275 D[1+bs*3] = CC[1+bs*3];
7276 D[2+bs*3] = CC[2+bs*3];
7277 D[3+bs*3] = CC[3+bs*3];
7278 }
7279 else if(km>=3)
7280 {
7281 D[0+bs*0] = CC[0+bs*0];
7282 D[1+bs*0] = CC[1+bs*0];
7283 D[2+bs*0] = CC[2+bs*0];
7284
7285 if(kn==1)
7286 return;
7287
7288 D[0+bs*1] = CC[0+bs*1];
7289 D[1+bs*1] = CC[1+bs*1];
7290 D[2+bs*1] = CC[2+bs*1];
7291
7292 if(kn==2)
7293 return;
7294
7295 D[0+bs*2] = CC[0+bs*2];
7296 D[1+bs*2] = CC[1+bs*2];
7297 D[2+bs*2] = CC[2+bs*2];
7298
7299 if(kn==3)
7300 return;
7301
7302 D[0+bs*3] = CC[0+bs*3];
7303 D[1+bs*3] = CC[1+bs*3];
7304 D[2+bs*3] = CC[2+bs*3];
7305 }
7306 else if(km>=2)
7307 {
7308 D[0+bs*0] = CC[0+bs*0];
7309 D[1+bs*0] = CC[1+bs*0];
7310
7311 if(kn==1)
7312 return;
7313
7314 D[0+bs*1] = CC[0+bs*1];
7315 D[1+bs*1] = CC[1+bs*1];
7316
7317 if(kn==2)
7318 return;
7319
7320 D[0+bs*2] = CC[0+bs*2];
7321 D[1+bs*2] = CC[1+bs*2];
7322
7323 if(kn==3)
7324 return;
7325
7326 D[0+bs*3] = CC[0+bs*3];
7327 D[1+bs*3] = CC[1+bs*3];
7328 }
7329 else //if(km>=1)
7330 {
7331 D[0+bs*0] = CC[0+bs*0];
7332
7333 if(kn==1)
7334 return;
7335
7336 D[0+bs*1] = CC[0+bs*1];
7337
7338 if(kn==2)
7339 return;
7340
7341 D[0+bs*2] = CC[0+bs*2];
7342
7343 if(kn==3)
7344 return;
7345
7346 D[0+bs*3] = CC[0+bs*3];
7347 }
7348
7349 return;
7350
7351 }
7352 #endif
7353
7354
7355
7356 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
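// LU factorization without pivoting of a 4x4 block: D = lu(C - A * B^T). The
// unit-lower and upper factors are stored packed in D; the reciprocals of the
// diagonal of U are returned in inv_diag_D.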
7357 void kernel_dgetrf_nt_4x4_lib4(int kmax, double *A, double *B, double *C, double *D, double *inv_diag_D)
7358 {
7359
7360 const int bs = 4;
7361
7362 int k;
7363
7364 double tmp;
7365
7366 #if defined(TARGET_GENERIC)
7367 double CC[16] = {0};
7368 #else
7369 ALIGNED( double CC[16], 64 ) = {0};
7370 #endif
7371
7372 double alpha1 = -1.0;
7373 double beta1 = 1.0;
7374
7375 kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, &beta1, C, CC);
7376
7377 // factorization
7378
7379 // first column
7380 tmp = 1.0 / CC[0+bs*0];
7381 CC[1+bs*0] *= tmp;
7382 CC[2+bs*0] *= tmp;
7383 CC[3+bs*0] *= tmp;
7384
7385 inv_diag_D[0] = tmp;
7386
7387 // second column
7388 CC[1+bs*1] -= CC[1+bs*0] * CC[0+bs*1];
7389 CC[2+bs*1] -= CC[2+bs*0] * CC[0+bs*1];
7390 CC[3+bs*1] -= CC[3+bs*0] * CC[0+bs*1];
7391
7392 tmp = 1.0 / CC[1+bs*1];
7393 CC[2+bs*1] *= tmp;
7394 CC[3+bs*1] *= tmp;
7395
7396 inv_diag_D[1] = tmp;
7397
7398 // third column
7399 CC[1+bs*2] -= CC[1+bs*0] * CC[0+bs*2];
7400 CC[2+bs*2] -= CC[2+bs*0] * CC[0+bs*2];
7401 CC[3+bs*2] -= CC[3+bs*0] * CC[0+bs*2];
7402
7403 CC[2+bs*2] -= CC[2+bs*1] * CC[1+bs*2];
7404 CC[3+bs*2] -= CC[3+bs*1] * CC[1+bs*2];
7405
7406 tmp = 1.0 / CC[2+bs*2];
7407 CC[3+bs*2] *= tmp;
7408
7409 inv_diag_D[2] = tmp;
7410
7411 // fourth column
7412 CC[1+bs*3] -= CC[1+bs*0] * CC[0+bs*3];
7413 CC[2+bs*3] -= CC[2+bs*0] * CC[0+bs*3];
7414 CC[3+bs*3] -= CC[3+bs*0] * CC[0+bs*3];
7415
7416 CC[2+bs*3] -= CC[2+bs*1] * CC[1+bs*3];
7417 CC[3+bs*3] -= CC[3+bs*1] * CC[1+bs*3];
7418
7419 CC[3+bs*3] -= CC[3+bs*2] * CC[2+bs*3];
7420
7421 tmp = 1.0 / CC[3+bs*3];
7422
7423 inv_diag_D[3] = tmp;
7424
7425 D[0+bs*0] = CC[0+bs*0];
7426 D[1+bs*0] = CC[1+bs*0];
7427 D[2+bs*0] = CC[2+bs*0];
7428 D[3+bs*0] = CC[3+bs*0];
7429
7430 D[0+bs*1] = CC[0+bs*1];
7431 D[1+bs*1] = CC[1+bs*1];
7432 D[2+bs*1] = CC[2+bs*1];
7433 D[3+bs*1] = CC[3+bs*1];
7434
7435 D[0+bs*2] = CC[0+bs*2];
7436 D[1+bs*2] = CC[1+bs*2];
7437 D[2+bs*2] = CC[2+bs*2];
7438 D[3+bs*2] = CC[3+bs*2];
7439
7440 D[0+bs*3] = CC[0+bs*3];
7441 D[1+bs*3] = CC[1+bs*3];
7442 D[2+bs*3] = CC[2+bs*3];
7443 D[3+bs*3] = CC[3+bs*3];
7444
7445 return;
7446
7447 }
7448 #endif
7449
7450
7451
7452 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
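// Variable-size variant of kernel_dgetrf_nt_4x4_lib4: the factorization stops
// after kn columns, and only the top-left km x kn corner of the result is
// stored into D (1 <= km, kn <= 4).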
7453 void kernel_dgetrf_nt_4x4_vs_lib4(int kmax, double *A, double *B, double *C, double *D, double *inv_diag_D, int km, int kn)
7454 {
7455
7456 const int bs = 4;
7457
7458 int k;
7459
7460 double tmp;
7461
7462 #if defined(TARGET_GENERIC)
7463 double CC[16] = {0};
7464 #else
7465 ALIGNED( double CC[16], 64 ) = {0};
7466 #endif
7467
7468 double alpha1 = -1.0;
7469 double beta1 = 1.0;
7470
7471 kernel_dgemm_nt_4x4_lib4(kmax, &alpha1, A, B, &beta1, C, CC);
7472
7473 // factorization
7474
7475 // first column
7476 tmp = 1.0 / CC[0+bs*0];
7477 CC[1+bs*0] *= tmp;
7478 CC[2+bs*0] *= tmp;
7479 CC[3+bs*0] *= tmp;
7480
7481 inv_diag_D[0] = tmp;
7482
7483 if(kn==1)
7484 goto store;
7485
7486 // second column
7487 CC[1+bs*1] -= CC[1+bs*0] * CC[0+bs*1];
7488 CC[2+bs*1] -= CC[2+bs*0] * CC[0+bs*1];
7489 CC[3+bs*1] -= CC[3+bs*0] * CC[0+bs*1];
7490
7491 tmp = 1.0 / CC[1+bs*1];
7492 CC[2+bs*1] *= tmp;
7493 CC[3+bs*1] *= tmp;
7494
7495 inv_diag_D[1] = tmp;
7496
7497 if(kn==2)
7498 goto store;
7499
7500 // third column
7501 CC[1+bs*2] -= CC[1+bs*0] * CC[0+bs*2];
7502 CC[2+bs*2] -= CC[2+bs*0] * CC[0+bs*2];
7503 CC[3+bs*2] -= CC[3+bs*0] * CC[0+bs*2];
7504
7505 CC[2+bs*2] -= CC[2+bs*1] * CC[1+bs*2];
7506 CC[3+bs*2] -= CC[3+bs*1] * CC[1+bs*2];
7507
7508 tmp = 1.0 / CC[2+bs*2];
7509 CC[3+bs*2] *= tmp;
7510
7511 inv_diag_D[2] = tmp;
7512
7513 if(kn==3)
7514 goto store;
7515
7516 // fourth column
7517 CC[1+bs*3] -= CC[1+bs*0] * CC[0+bs*3];
7518 CC[2+bs*3] -= CC[2+bs*0] * CC[0+bs*3];
7519 CC[3+bs*3] -= CC[3+bs*0] * CC[0+bs*3];
7520
7521 CC[2+bs*3] -= CC[2+bs*1] * CC[1+bs*3];
7522 CC[3+bs*3] -= CC[3+bs*1] * CC[1+bs*3];
7523
7524 CC[3+bs*3] -= CC[3+bs*2] * CC[2+bs*3];
7525
7526 tmp = 1.0 / CC[3+bs*3];
7527
7528 inv_diag_D[3] = tmp;
7529
7530 store:
7531
7532 if(km>=4)
7533 {
7534 D[0+bs*0] = CC[0+bs*0];
7535 D[1+bs*0] = CC[1+bs*0];
7536 D[2+bs*0] = CC[2+bs*0];
7537 D[3+bs*0] = CC[3+bs*0];
7538
7539 if(kn==1)
7540 return;
7541
7542 D[0+bs*1] = CC[0+bs*1];
7543 D[1+bs*1] = CC[1+bs*1];
7544 D[2+bs*1] = CC[2+bs*1];
7545 D[3+bs*1] = CC[3+bs*1];
7546
7547 if(kn==2)
7548 return;
7549
7550 D[0+bs*2] = CC[0+bs*2];
7551 D[1+bs*2] = CC[1+bs*2];
7552 D[2+bs*2] = CC[2+bs*2];
7553 D[3+bs*2] = CC[3+bs*2];
7554
7555 if(kn==3)
7556 return;
7557
7558 D[0+bs*3] = CC[0+bs*3];
7559 D[1+bs*3] = CC[1+bs*3];
7560 D[2+bs*3] = CC[2+bs*3];
7561 D[3+bs*3] = CC[3+bs*3];
7562 }
7563 else if(km>=3)
7564 {
7565 D[0+bs*0] = CC[0+bs*0];
7566 D[1+bs*0] = CC[1+bs*0];
7567 D[2+bs*0] = CC[2+bs*0];
7568
7569 if(kn==1)
7570 return;
7571
7572 D[0+bs*1] = CC[0+bs*1];
7573 D[1+bs*1] = CC[1+bs*1];
7574 D[2+bs*1] = CC[2+bs*1];
7575
7576 if(kn==2)
7577 return;
7578
7579 D[0+bs*2] = CC[0+bs*2];
7580 D[1+bs*2] = CC[1+bs*2];
7581 D[2+bs*2] = CC[2+bs*2];
7582
7583 if(kn==3)
7584 return;
7585
7586 D[0+bs*3] = CC[0+bs*3];
7587 D[1+bs*3] = CC[1+bs*3];
7588 D[2+bs*3] = CC[2+bs*3];
7589 }
7590 else if(km>=2)
7591 {
7592 D[0+bs*0] = CC[0+bs*0];
7593 D[1+bs*0] = CC[1+bs*0];
7594
7595 if(kn==1)
7596 return;
7597
7598 D[0+bs*1] = CC[0+bs*1];
7599 D[1+bs*1] = CC[1+bs*1];
7600
7601 if(kn==2)
7602 return;
7603
7604 D[0+bs*2] = CC[0+bs*2];
7605 D[1+bs*2] = CC[1+bs*2];
7606
7607 if(kn==3)
7608 return;
7609
7610 D[0+bs*3] = CC[0+bs*3];
7611 D[1+bs*3] = CC[1+bs*3];
7612 }
7613 else //if(km>=1)
7614 {
7615 D[0+bs*0] = CC[0+bs*0];
7616
7617 if(kn==1)
7618 return;
7619
7620 D[0+bs*1] = CC[0+bs*1];
7621
7622 if(kn==2)
7623 return;
7624
7625 D[0+bs*2] = CC[0+bs*2];
7626
7627 if(kn==3)
7628 return;
7629
7630 D[0+bs*3] = CC[0+bs*3];
7631 }
7632
7633 return;
7634
7635 }
7636 #endif
7637
7638
7639
7640 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
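// Triangular-solve kernel: D = E^{-1} * (beta * C - A * B), where E is lower
// triangular and enters from the left, with its diagonal supplied already
// inverted in inv_diag_E. The update beta*C - A*B is formed first via
// kernel_dgemm_nn_4x4_lib4, then a forward substitution is applied to each of
// the four columns of the 4x4 block.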
7641 void kernel_dtrsm_nn_ll_inv_4x4_lib4(int kmax, double *A, double *B, int sdb, double *beta, double *C, double *D, double *E, double *inv_diag_E)
7642 {
7643
7644 const int bs = 4;
7645
7646 int k;
7647
7648 double
7649 tmp,
7650 e_0, e_1, e_2, e_3;
7651
7652 #if defined(TARGET_GENERIC)
7653 double CC[16] = {0};
7654 #else
7655 ALIGNED( double CC[16], 64 ) = {0};
7656 #endif
7657
7658 double alpha1 = -1.0;
7659
7660 kernel_dgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, beta, C, CC);
7661
7662 // solution
7663
7664 e_0 = inv_diag_E[0];
7665 e_1 = E[1+bs*0];
7666 e_2 = E[2+bs*0];
7667 e_3 = E[3+bs*0];
7668 CC[0+bs*0] *= e_0;
7669 CC[1+bs*0] -= e_1 * CC[0+bs*0];
7670 CC[2+bs*0] -= e_2 * CC[0+bs*0];
7671 CC[3+bs*0] -= e_3 * CC[0+bs*0];
7672 CC[0+bs*1] *= e_0;
7673 CC[1+bs*1] -= e_1 * CC[0+bs*1];
7674 CC[2+bs*1] -= e_2 * CC[0+bs*1];
7675 CC[3+bs*1] -= e_3 * CC[0+bs*1];
7676 CC[0+bs*2] *= e_0;
7677 CC[1+bs*2] -= e_1 * CC[0+bs*2];
7678 CC[2+bs*2] -= e_2 * CC[0+bs*2];
7679 CC[3+bs*2] -= e_3 * CC[0+bs*2];
7680 CC[0+bs*3] *= e_0;
7681 CC[1+bs*3] -= e_1 * CC[0+bs*3];
7682 CC[2+bs*3] -= e_2 * CC[0+bs*3];
7683 CC[3+bs*3] -= e_3 * CC[0+bs*3];
7684
7685 e_1 = inv_diag_E[1];
7686 e_2 = E[2+bs*1];
7687 e_3 = E[3+bs*1];
7688 CC[1+bs*0] *= e_1;
7689 CC[2+bs*0] -= e_2 * CC[1+bs*0];
7690 CC[3+bs*0] -= e_3 * CC[1+bs*0];
7691 CC[1+bs*1] *= e_1;
7692 CC[2+bs*1] -= e_2 * CC[1+bs*1];
7693 CC[3+bs*1] -= e_3 * CC[1+bs*1];
7694 CC[1+bs*2] *= e_1;
7695 CC[2+bs*2] -= e_2 * CC[1+bs*2];
7696 CC[3+bs*2] -= e_3 * CC[1+bs*2];
7697 CC[1+bs*3] *= e_1;
7698 CC[2+bs*3] -= e_2 * CC[1+bs*3];
7699 CC[3+bs*3] -= e_3 * CC[1+bs*3];
7700
7701 e_2 = inv_diag_E[2];
7702 e_3 = E[3+bs*2];
7703 CC[2+bs*0] *= e_2;
7704 CC[3+bs*0] -= e_3 * CC[2+bs*0];
7705 CC[2+bs*1] *= e_2;
7706 CC[3+bs*1] -= e_3 * CC[2+bs*1];
7707 CC[2+bs*2] *= e_2;
7708 CC[3+bs*2] -= e_3 * CC[2+bs*2];
7709 CC[2+bs*3] *= e_2;
7710 CC[3+bs*3] -= e_3 * CC[2+bs*3];
7711
7712 e_3 = inv_diag_E[3];
7713 CC[3+bs*0] *= e_3;
7714 CC[3+bs*1] *= e_3;
7715 CC[3+bs*2] *= e_3;
7716 CC[3+bs*3] *= e_3;
7717
7718 D[0+bs*0] = CC[0+bs*0];
7719 D[1+bs*0] = CC[1+bs*0];
7720 D[2+bs*0] = CC[2+bs*0];
7721 D[3+bs*0] = CC[3+bs*0];
7722
7723 D[0+bs*1] = CC[0+bs*1];
7724 D[1+bs*1] = CC[1+bs*1];
7725 D[2+bs*1] = CC[2+bs*1];
7726 D[3+bs*1] = CC[3+bs*1];
7727
7728 D[0+bs*2] = CC[0+bs*2];
7729 D[1+bs*2] = CC[1+bs*2];
7730 D[2+bs*2] = CC[2+bs*2];
7731 D[3+bs*2] = CC[3+bs*2];
7732
7733 D[0+bs*3] = CC[0+bs*3];
7734 D[1+bs*3] = CC[1+bs*3];
7735 D[2+bs*3] = CC[2+bs*3];
7736 D[3+bs*3] = CC[3+bs*3];
7737
7738 return;
7739
7740 }
7741 #endif
7742
7743
7744
7745 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
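// Variable-size variant of kernel_dtrsm_nn_ll_inv_4x4_lib4: the forward
// substitution only computes the first km rows, and only the km x kn corner
// of the result is stored into D.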
7746 void kernel_dtrsm_nn_ll_inv_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *beta, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
7747 {
7748
7749 const int bs = 4;
7750
7751 int k;
7752
7753 double
7754 tmp,
7755 e_0, e_1, e_2, e_3;
7756
7757 #if defined(TARGET_GENERIC)
7758 double CC[16] = {0};
7759 #else
7760 ALIGNED( double CC[16], 64 ) = {0};
7761 #endif
7762
7763 double alpha1 = -1.0;
7764
7765 kernel_dgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, beta, C, CC);
7766
7767 // solution
7768
7769 e_0 = inv_diag_E[0];
7770 CC[0+bs*0] *= e_0;
7771 CC[0+bs*1] *= e_0;
7772 CC[0+bs*2] *= e_0;
7773 CC[0+bs*3] *= e_0;
7774
7775 if(km==1)
7776 goto store;
7777
7778 e_0 = E[1+bs*0];
7779 CC[1+bs*0] -= e_0 * CC[0+bs*0];
7780 CC[1+bs*1] -= e_0 * CC[0+bs*1];
7781 CC[1+bs*2] -= e_0 * CC[0+bs*2];
7782 CC[1+bs*3] -= e_0 * CC[0+bs*3];
7783 e_1 = inv_diag_E[1];
7784 CC[1+bs*0] *= e_1;
7785 CC[1+bs*1] *= e_1;
7786 CC[1+bs*2] *= e_1;
7787 CC[1+bs*3] *= e_1;
7788
7789 if(km==2)
7790 goto store;
7791
7792 e_0 = E[2+bs*0];
7793 CC[2+bs*0] -= e_0 * CC[0+bs*0];
7794 CC[2+bs*1] -= e_0 * CC[0+bs*1];
7795 CC[2+bs*2] -= e_0 * CC[0+bs*2];
7796 CC[2+bs*3] -= e_0 * CC[0+bs*3];
7797 e_1 = E[2+bs*1];
7798 CC[2+bs*0] -= e_1 * CC[1+bs*0];
7799 CC[2+bs*1] -= e_1 * CC[1+bs*1];
7800 CC[2+bs*2] -= e_1 * CC[1+bs*2];
7801 CC[2+bs*3] -= e_1 * CC[1+bs*3];
7802 e_2 = inv_diag_E[2];
7803 CC[2+bs*0] *= e_2;
7804 CC[2+bs*1] *= e_2;
7805 CC[2+bs*2] *= e_2;
7806 CC[2+bs*3] *= e_2;
7807
7808 if(km==3)
7809 goto store;
7810
7811 e_0 = E[3+bs*0];
7812 CC[3+bs*0] -= e_0 * CC[0+bs*0];
7813 CC[3+bs*1] -= e_0 * CC[0+bs*1];
7814 CC[3+bs*2] -= e_0 * CC[0+bs*2];
7815 CC[3+bs*3] -= e_0 * CC[0+bs*3];
7816 e_1 = E[3+bs*1];
7817 CC[3+bs*0] -= e_1 * CC[1+bs*0];
7818 CC[3+bs*1] -= e_1 * CC[1+bs*1];
7819 CC[3+bs*2] -= e_1 * CC[1+bs*2];
7820 CC[3+bs*3] -= e_1 * CC[1+bs*3];
7821 e_2 = E[3+bs*2];
7822 CC[3+bs*0] -= e_2 * CC[2+bs*0];
7823 CC[3+bs*1] -= e_2 * CC[2+bs*1];
7824 CC[3+bs*2] -= e_2 * CC[2+bs*2];
7825 CC[3+bs*3] -= e_2 * CC[2+bs*3];
7826 e_3 = inv_diag_E[3];
7827 CC[3+bs*0] *= e_3;
7828 CC[3+bs*1] *= e_3;
7829 CC[3+bs*2] *= e_3;
7830 CC[3+bs*3] *= e_3;
7831
7832 store:
7833
7834 if(km>=4)
7835 {
7836 D[0+bs*0] = CC[0+bs*0];
7837 D[1+bs*0] = CC[1+bs*0];
7838 D[2+bs*0] = CC[2+bs*0];
7839 D[3+bs*0] = CC[3+bs*0];
7840
7841 if(kn==1)
7842 return;
7843
7844 D[0+bs*1] = CC[0+bs*1];
7845 D[1+bs*1] = CC[1+bs*1];
7846 D[2+bs*1] = CC[2+bs*1];
7847 D[3+bs*1] = CC[3+bs*1];
7848
7849 if(kn==2)
7850 return;
7851
7852 D[0+bs*2] = CC[0+bs*2];
7853 D[1+bs*2] = CC[1+bs*2];
7854 D[2+bs*2] = CC[2+bs*2];
7855 D[3+bs*2] = CC[3+bs*2];
7856
7857 if(kn==3)
7858 return;
7859
7860 D[0+bs*3] = CC[0+bs*3];
7861 D[1+bs*3] = CC[1+bs*3];
7862 D[2+bs*3] = CC[2+bs*3];
7863 D[3+bs*3] = CC[3+bs*3];
7864 }
7865 else if(km>=3)
7866 {
7867 D[0+bs*0] = CC[0+bs*0];
7868 D[1+bs*0] = CC[1+bs*0];
7869 D[2+bs*0] = CC[2+bs*0];
7870
7871 if(kn==1)
7872 return;
7873
7874 D[0+bs*1] = CC[0+bs*1];
7875 D[1+bs*1] = CC[1+bs*1];
7876 D[2+bs*1] = CC[2+bs*1];
7877
7878 if(kn==2)
7879 return;
7880
7881 D[0+bs*2] = CC[0+bs*2];
7882 D[1+bs*2] = CC[1+bs*2];
7883 D[2+bs*2] = CC[2+bs*2];
7884
7885 if(kn==3)
7886 return;
7887
7888 D[0+bs*3] = CC[0+bs*3];
7889 D[1+bs*3] = CC[1+bs*3];
7890 D[2+bs*3] = CC[2+bs*3];
7891 }
7892 else if(km>=2)
7893 {
7894 D[0+bs*0] = CC[0+bs*0];
7895 D[1+bs*0] = CC[1+bs*0];
7896
7897 if(kn==1)
7898 return;
7899
7900 D[0+bs*1] = CC[0+bs*1];
7901 D[1+bs*1] = CC[1+bs*1];
7902
7903 if(kn==2)
7904 return;
7905
7906 D[0+bs*2] = CC[0+bs*2];
7907 D[1+bs*2] = CC[1+bs*2];
7908
7909 if(kn==3)
7910 return;
7911
7912 D[0+bs*3] = CC[0+bs*3];
7913 D[1+bs*3] = CC[1+bs*3];
7914 }
7915 else //if(km>=1)
7916 {
7917 D[0+bs*0] = CC[0+bs*0];
7918
7919 if(kn==1)
7920 return;
7921
7922 D[0+bs*1] = CC[0+bs*1];
7923
7924 if(kn==2)
7925 return;
7926
7927 D[0+bs*2] = CC[0+bs*2];
7928
7929 if(kn==3)
7930 return;
7931
7932 D[0+bs*3] = CC[0+bs*3];
7933 }
7934
7935 return;
7936
7937 }
7938 #endif
7939
7940
7941
7942 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9)
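// Triangular-solve kernel: D = E^{-1} * (beta * C - A * B), where E is unit
// lower triangular (implicit ones on the diagonal, so no diagonal scaling is
// performed); only the strictly lower part of E is used in the forward
// substitution.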
7943 void kernel_dtrsm_nn_ll_one_4x4_lib4(int kmax, double *A, double *B, int sdb, double *beta, double *C, double *D, double *E)
7944 {
7945
7946 const int bs = 4;
7947
7948 int k;
7949
7950 double
7951 tmp,
7952 e_1, e_2, e_3;
7953
7954 #if defined(TARGET_GENERIC)
7955 double CC[16] = {0};
7956 #else
7957 ALIGNED( double CC[16], 64 ) = {0};
7958 #endif
7959
7960 double alpha1 = -1.0;
7961
7962 kernel_dgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, beta, C, CC);
7963
7964 // solution
7965
7966 e_1 = E[1+bs*0];
7967 e_2 = E[2+bs*0];
7968 e_3 = E[3+bs*0];
7969 CC[1+bs*0] -= e_1 * CC[0+bs*0];
7970 CC[2+bs*0] -= e_2 * CC[0+bs*0];
7971 CC[3+bs*0] -= e_3 * CC[0+bs*0];
7972 CC[1+bs*1] -= e_1 * CC[0+bs*1];
7973 CC[2+bs*1] -= e_2 * CC[0+bs*1];
7974 CC[3+bs*1] -= e_3 * CC[0+bs*1];
7975 CC[1+bs*2] -= e_1 * CC[0+bs*2];
7976 CC[2+bs*2] -= e_2 * CC[0+bs*2];
7977 CC[3+bs*2] -= e_3 * CC[0+bs*2];
7978 CC[1+bs*3] -= e_1 * CC[0+bs*3];
7979 CC[2+bs*3] -= e_2 * CC[0+bs*3];
7980 CC[3+bs*3] -= e_3 * CC[0+bs*3];
7981
7982 e_2 = E[2+bs*1];
7983 e_3 = E[3+bs*1];
7984 CC[2+bs*0] -= e_2 * CC[1+bs*0];
7985 CC[3+bs*0] -= e_3 * CC[1+bs*0];
7986 CC[2+bs*1] -= e_2 * CC[1+bs*1];
7987 CC[3+bs*1] -= e_3 * CC[1+bs*1];
7988 CC[2+bs*2] -= e_2 * CC[1+bs*2];
7989 CC[3+bs*2] -= e_3 * CC[1+bs*2];
7990 CC[2+bs*3] -= e_2 * CC[1+bs*3];
7991 CC[3+bs*3] -= e_3 * CC[1+bs*3];
7992
7993 e_3 = E[3+bs*2];
7994 CC[3+bs*0] -= e_3 * CC[2+bs*0];
7995 CC[3+bs*1] -= e_3 * CC[2+bs*1];
7996 CC[3+bs*2] -= e_3 * CC[2+bs*2];
7997 CC[3+bs*3] -= e_3 * CC[2+bs*3];
7998
7999 D[0+bs*0] = CC[0+bs*0];
8000 D[1+bs*0] = CC[1+bs*0];
8001 D[2+bs*0] = CC[2+bs*0];
8002 D[3+bs*0] = CC[3+bs*0];
8003
8004 D[0+bs*1] = CC[0+bs*1];
8005 D[1+bs*1] = CC[1+bs*1];
8006 D[2+bs*1] = CC[2+bs*1];
8007 D[3+bs*1] = CC[3+bs*1];
8008
8009 D[0+bs*2] = CC[0+bs*2];
8010 D[1+bs*2] = CC[1+bs*2];
8011 D[2+bs*2] = CC[2+bs*2];
8012 D[3+bs*2] = CC[3+bs*2];
8013
8014 D[0+bs*3] = CC[0+bs*3];
8015 D[1+bs*3] = CC[1+bs*3];
8016 D[2+bs*3] = CC[2+bs*3];
8017 D[3+bs*3] = CC[3+bs*3];
8018
8019 return;
8020
8021 }
8022 #endif
8023
8024
8025
8026 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9)
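// Variable-size variant of kernel_dtrsm_nn_ll_one_4x4_lib4: km limits the rows
// touched by the substitution, and the km x kn corner of the result is stored
// into D.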
8027 void kernel_dtrsm_nn_ll_one_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *beta, double *C, double *D, double *E, int km, int kn)
8028 {
8029
8030 const int bs = 4;
8031
8032 int k;
8033
8034 double
8035 tmp,
8036 e_0, e_1, e_2, e_3;
8037
8038 #if defined(TARGET_GENERIC)
8039 double CC[16] = {0};
8040 #else
8041 ALIGNED( double CC[16], 64 ) = {0};
8042 #endif
8043
8044 double alpha1 = -1.0;
8045
8046 kernel_dgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, beta, C, CC);
8047
8048 // solution
8049
8050 if(km==1)
8051 goto store;
8052
8053 e_0 = E[1+bs*0];
8054 CC[1+bs*0] -= e_0 * CC[0+bs*0];
8055 CC[1+bs*1] -= e_0 * CC[0+bs*1];
8056 CC[1+bs*2] -= e_0 * CC[0+bs*2];
8057 CC[1+bs*3] -= e_0 * CC[0+bs*3];
8058
8059 if(km==2)
8060 goto store;
8061
8062 e_0 = E[2+bs*0];
8063 CC[2+bs*0] -= e_0 * CC[0+bs*0];
8064 CC[2+bs*1] -= e_0 * CC[0+bs*1];
8065 CC[2+bs*2] -= e_0 * CC[0+bs*2];
8066 CC[2+bs*3] -= e_0 * CC[0+bs*3];
8067 e_1 = E[2+bs*1];
8068 CC[2+bs*0] -= e_1 * CC[1+bs*0];
8069 CC[2+bs*1] -= e_1 * CC[1+bs*1];
8070 CC[2+bs*2] -= e_1 * CC[1+bs*2];
8071 CC[2+bs*3] -= e_1 * CC[1+bs*3];
8072
8073 if(km==3)
8074 goto store;
8075
8076 e_0 = E[3+bs*0];
8077 CC[3+bs*0] -= e_0 * CC[0+bs*0];
8078 CC[3+bs*1] -= e_0 * CC[0+bs*1];
8079 CC[3+bs*2] -= e_0 * CC[0+bs*2];
8080 CC[3+bs*3] -= e_0 * CC[0+bs*3];
8081 e_1 = E[3+bs*1];
8082 CC[3+bs*0] -= e_1 * CC[1+bs*0];
8083 CC[3+bs*1] -= e_1 * CC[1+bs*1];
8084 CC[3+bs*2] -= e_1 * CC[1+bs*2];
8085 CC[3+bs*3] -= e_1 * CC[1+bs*3];
8086 e_2 = E[3+bs*2];
8087 CC[3+bs*0] -= e_2 * CC[2+bs*0];
8088 CC[3+bs*1] -= e_2 * CC[2+bs*1];
8089 CC[3+bs*2] -= e_2 * CC[2+bs*2];
8090 CC[3+bs*3] -= e_2 * CC[2+bs*3];
8091
8092 store:
8093
8094 if(km>=4)
8095 {
8096 D[0+bs*0] = CC[0+bs*0];
8097 D[1+bs*0] = CC[1+bs*0];
8098 D[2+bs*0] = CC[2+bs*0];
8099 D[3+bs*0] = CC[3+bs*0];
8100
8101 if(kn==1)
8102 return;
8103
8104 D[0+bs*1] = CC[0+bs*1];
8105 D[1+bs*1] = CC[1+bs*1];
8106 D[2+bs*1] = CC[2+bs*1];
8107 D[3+bs*1] = CC[3+bs*1];
8108
8109 if(kn==2)
8110 return;
8111
8112 D[0+bs*2] = CC[0+bs*2];
8113 D[1+bs*2] = CC[1+bs*2];
8114 D[2+bs*2] = CC[2+bs*2];
8115 D[3+bs*2] = CC[3+bs*2];
8116
8117 if(kn==3)
8118 return;
8119
8120 D[0+bs*3] = CC[0+bs*3];
8121 D[1+bs*3] = CC[1+bs*3];
8122 D[2+bs*3] = CC[2+bs*3];
8123 D[3+bs*3] = CC[3+bs*3];
8124 }
8125 else if(km>=3)
8126 {
8127 D[0+bs*0] = CC[0+bs*0];
8128 D[1+bs*0] = CC[1+bs*0];
8129 D[2+bs*0] = CC[2+bs*0];
8130
8131 if(kn==1)
8132 return;
8133
8134 D[0+bs*1] = CC[0+bs*1];
8135 D[1+bs*1] = CC[1+bs*1];
8136 D[2+bs*1] = CC[2+bs*1];
8137
8138 if(kn==2)
8139 return;
8140
8141 D[0+bs*2] = CC[0+bs*2];
8142 D[1+bs*2] = CC[1+bs*2];
8143 D[2+bs*2] = CC[2+bs*2];
8144
8145 if(kn==3)
8146 return;
8147
8148 D[0+bs*3] = CC[0+bs*3];
8149 D[1+bs*3] = CC[1+bs*3];
8150 D[2+bs*3] = CC[2+bs*3];
8151 }
8152 else if(km>=2)
8153 {
8154 D[0+bs*0] = CC[0+bs*0];
8155 D[1+bs*0] = CC[1+bs*0];
8156
8157 if(kn==1)
8158 return;
8159
8160 D[0+bs*1] = CC[0+bs*1];
8161 D[1+bs*1] = CC[1+bs*1];
8162
8163 if(kn==2)
8164 return;
8165
8166 D[0+bs*2] = CC[0+bs*2];
8167 D[1+bs*2] = CC[1+bs*2];
8168
8169 if(kn==3)
8170 return;
8171
8172 D[0+bs*3] = CC[0+bs*3];
8173 D[1+bs*3] = CC[1+bs*3];
8174 }
8175 else //if(km>=1)
8176 {
8177 D[0+bs*0] = CC[0+bs*0];
8178
8179 if(kn==1)
8180 return;
8181
8182 D[0+bs*1] = CC[0+bs*1];
8183
8184 if(kn==2)
8185 return;
8186
8187 D[0+bs*2] = CC[0+bs*2];
8188
8189 if(kn==3)
8190 return;
8191
8192 D[0+bs*3] = CC[0+bs*3];
8193 }
8194
8195 return;
8196
8197 }
8198 #endif
8199
8200
8201
8202 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
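// Triangular-solve kernel: D = (beta * C - A * B) * E^{-1}, where E is upper
// triangular and enters from the right, with its diagonal supplied already
// inverted in inv_diag_E; the solve proceeds column by column of the 4x4 block.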
8203 void kernel_dtrsm_nn_ru_inv_4x4_lib4(int kmax, double *A, double *B, int sdb, double *beta, double *C, double *D, double *E, double *inv_diag_E)
8204 {
8205
8206 const int bs = 4;
8207
8208 int k;
8209
8210 double
8211 tmp,
8212 e_00, e_01, e_02, e_03,
8213 e_11, e_12, e_13,
8214 e_22, e_23,
8215 e_33;
8216
8217 #if defined(TARGET_GENERIC)
8218 double CC[16] = {0};
8219 #else
8220 ALIGNED( double CC[16], 64 ) = {0};
8221 #endif
8222
8223 double alpha1 = -1.0;
8224
8225 kernel_dgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, beta, C, CC);
8226
8227 // solve
8228
8229 e_00 = inv_diag_E[0];
8230 CC[0+bs*0] *= e_00;
8231 CC[1+bs*0] *= e_00;
8232 CC[2+bs*0] *= e_00;
8233 CC[3+bs*0] *= e_00;
8234
8235 e_01 = E[0+bs*1];
8236 e_11 = inv_diag_E[1];
8237 CC[0+bs*1] -= CC[0+bs*0] * e_01;
8238 CC[1+bs*1] -= CC[1+bs*0] * e_01;
8239 CC[2+bs*1] -= CC[2+bs*0] * e_01;
8240 CC[3+bs*1] -= CC[3+bs*0] * e_01;
8241 CC[0+bs*1] *= e_11;
8242 CC[1+bs*1] *= e_11;
8243 CC[2+bs*1] *= e_11;
8244 CC[3+bs*1] *= e_11;
8245
8246 e_02 = E[0+bs*2];
8247 e_12 = E[1+bs*2];
8248 e_22 = inv_diag_E[2];
8249 CC[0+bs*2] -= CC[0+bs*0] * e_02;
8250 CC[1+bs*2] -= CC[1+bs*0] * e_02;
8251 CC[2+bs*2] -= CC[2+bs*0] * e_02;
8252 CC[3+bs*2] -= CC[3+bs*0] * e_02;
8253 CC[0+bs*2] -= CC[0+bs*1] * e_12;
8254 CC[1+bs*2] -= CC[1+bs*1] * e_12;
8255 CC[2+bs*2] -= CC[2+bs*1] * e_12;
8256 CC[3+bs*2] -= CC[3+bs*1] * e_12;
8257 CC[0+bs*2] *= e_22;
8258 CC[1+bs*2] *= e_22;
8259 CC[2+bs*2] *= e_22;
8260 CC[3+bs*2] *= e_22;
8261
8262 e_03 = E[0+bs*3];
8263 e_13 = E[1+bs*3];
8264 e_23 = E[2+bs*3];
8265 e_33 = inv_diag_E[3];
8266 CC[0+bs*3] -= CC[0+bs*0] * e_03;
8267 CC[1+bs*3] -= CC[1+bs*0] * e_03;
8268 CC[2+bs*3] -= CC[2+bs*0] * e_03;
8269 CC[3+bs*3] -= CC[3+bs*0] * e_03;
8270 CC[0+bs*3] -= CC[0+bs*1] * e_13;
8271 CC[1+bs*3] -= CC[1+bs*1] * e_13;
8272 CC[2+bs*3] -= CC[2+bs*1] * e_13;
8273 CC[3+bs*3] -= CC[3+bs*1] * e_13;
8274 CC[0+bs*3] -= CC[0+bs*2] * e_23;
8275 CC[1+bs*3] -= CC[1+bs*2] * e_23;
8276 CC[2+bs*3] -= CC[2+bs*2] * e_23;
8277 CC[3+bs*3] -= CC[3+bs*2] * e_23;
8278 CC[0+bs*3] *= e_33;
8279 CC[1+bs*3] *= e_33;
8280 CC[2+bs*3] *= e_33;
8281 CC[3+bs*3] *= e_33;
8282
8283 D[0+bs*0] = CC[0+bs*0];
8284 D[1+bs*0] = CC[1+bs*0];
8285 D[2+bs*0] = CC[2+bs*0];
8286 D[3+bs*0] = CC[3+bs*0];
8287
8288 D[0+bs*1] = CC[0+bs*1];
8289 D[1+bs*1] = CC[1+bs*1];
8290 D[2+bs*1] = CC[2+bs*1];
8291 D[3+bs*1] = CC[3+bs*1];
8292
8293 D[0+bs*2] = CC[0+bs*2];
8294 D[1+bs*2] = CC[1+bs*2];
8295 D[2+bs*2] = CC[2+bs*2];
8296 D[3+bs*2] = CC[3+bs*2];
8297
8298 D[0+bs*3] = CC[0+bs*3];
8299 D[1+bs*3] = CC[1+bs*3];
8300 D[2+bs*3] = CC[2+bs*3];
8301 D[3+bs*3] = CC[3+bs*3];
8302
8303 return;
8304
8305 }
8306 #endif
8307
8308
8309
8310 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
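// Variable-size variant of kernel_dtrsm_nn_ru_inv_4x4_lib4: kn limits the
// columns processed by the solve, and the km x kn corner of the result is
// stored into D.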
8311 void kernel_dtrsm_nn_ru_inv_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *beta, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
8312 {
8313
8314 const int bs = 4;
8315
8316 int k;
8317
8318 double
8319 tmp,
8320 e_00, e_01, e_02, e_03,
8321 e_11, e_12, e_13,
8322 e_22, e_23,
8323 e_33;
8324
8325 #if defined(TARGET_GENERIC)
8326 double CC[16] = {0};
8327 #else
8328 ALIGNED( double CC[16], 64 ) = {0};
8329 #endif
8330
8331 double alpha1 = -1.0;
8332
8333 kernel_dgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, beta, C, CC);
8334
8335 // solve
8336
8337 e_00 = inv_diag_E[0];
8338 CC[0+bs*0] *= e_00;
8339 CC[1+bs*0] *= e_00;
8340 CC[2+bs*0] *= e_00;
8341 CC[3+bs*0] *= e_00;
8342
8343 if(kn==1)
8344 goto store;
8345
8346 e_01 = E[0+bs*1];
8347 e_11 = inv_diag_E[1];
8348 CC[0+bs*1] -= CC[0+bs*0] * e_01;
8349 CC[1+bs*1] -= CC[1+bs*0] * e_01;
8350 CC[2+bs*1] -= CC[2+bs*0] * e_01;
8351 CC[3+bs*1] -= CC[3+bs*0] * e_01;
8352 CC[0+bs*1] *= e_11;
8353 CC[1+bs*1] *= e_11;
8354 CC[2+bs*1] *= e_11;
8355 CC[3+bs*1] *= e_11;
8356
8357 if(kn==2)
8358 goto store;
8359
8360 e_02 = E[0+bs*2];
8361 e_12 = E[1+bs*2];
8362 e_22 = inv_diag_E[2];
8363 CC[0+bs*2] -= CC[0+bs*0] * e_02;
8364 CC[1+bs*2] -= CC[1+bs*0] * e_02;
8365 CC[2+bs*2] -= CC[2+bs*0] * e_02;
8366 CC[3+bs*2] -= CC[3+bs*0] * e_02;
8367 CC[0+bs*2] -= CC[0+bs*1] * e_12;
8368 CC[1+bs*2] -= CC[1+bs*1] * e_12;
8369 CC[2+bs*2] -= CC[2+bs*1] * e_12;
8370 CC[3+bs*2] -= CC[3+bs*1] * e_12;
8371 CC[0+bs*2] *= e_22;
8372 CC[1+bs*2] *= e_22;
8373 CC[2+bs*2] *= e_22;
8374 CC[3+bs*2] *= e_22;
8375
8376 if(kn==3)
8377 goto store;
8378
8379 e_03 = E[0+bs*3];
8380 e_13 = E[1+bs*3];
8381 e_23 = E[2+bs*3];
8382 e_33 = inv_diag_E[3];
8383 CC[0+bs*3] -= CC[0+bs*0] * e_03;
8384 CC[1+bs*3] -= CC[1+bs*0] * e_03;
8385 CC[2+bs*3] -= CC[2+bs*0] * e_03;
8386 CC[3+bs*3] -= CC[3+bs*0] * e_03;
8387 CC[0+bs*3] -= CC[0+bs*1] * e_13;
8388 CC[1+bs*3] -= CC[1+bs*1] * e_13;
8389 CC[2+bs*3] -= CC[2+bs*1] * e_13;
8390 CC[3+bs*3] -= CC[3+bs*1] * e_13;
8391 CC[0+bs*3] -= CC[0+bs*2] * e_23;
8392 CC[1+bs*3] -= CC[1+bs*2] * e_23;
8393 CC[2+bs*3] -= CC[2+bs*2] * e_23;
8394 CC[3+bs*3] -= CC[3+bs*2] * e_23;
8395 CC[0+bs*3] *= e_33;
8396 CC[1+bs*3] *= e_33;
8397 CC[2+bs*3] *= e_33;
8398 CC[3+bs*3] *= e_33;
8399
8400 store:
8401
8402 if(km>=4)
8403 {
8404 D[0+bs*0] = CC[0+bs*0];
8405 D[1+bs*0] = CC[1+bs*0];
8406 D[2+bs*0] = CC[2+bs*0];
8407 D[3+bs*0] = CC[3+bs*0];
8408
8409 if(kn==1)
8410 return;
8411
8412 D[0+bs*1] = CC[0+bs*1];
8413 D[1+bs*1] = CC[1+bs*1];
8414 D[2+bs*1] = CC[2+bs*1];
8415 D[3+bs*1] = CC[3+bs*1];
8416
8417 if(kn==2)
8418 return;
8419
8420 D[0+bs*2] = CC[0+bs*2];
8421 D[1+bs*2] = CC[1+bs*2];
8422 D[2+bs*2] = CC[2+bs*2];
8423 D[3+bs*2] = CC[3+bs*2];
8424
8425 if(kn==3)
8426 return;
8427
8428 D[0+bs*3] = CC[0+bs*3];
8429 D[1+bs*3] = CC[1+bs*3];
8430 D[2+bs*3] = CC[2+bs*3];
8431 D[3+bs*3] = CC[3+bs*3];
8432 }
8433 else if(km>=3)
8434 {
8435 D[0+bs*0] = CC[0+bs*0];
8436 D[1+bs*0] = CC[1+bs*0];
8437 D[2+bs*0] = CC[2+bs*0];
8438
8439 if(kn==1)
8440 return;
8441
8442 D[0+bs*1] = CC[0+bs*1];
8443 D[1+bs*1] = CC[1+bs*1];
8444 D[2+bs*1] = CC[2+bs*1];
8445
8446 if(kn==2)
8447 return;
8448
8449 D[0+bs*2] = CC[0+bs*2];
8450 D[1+bs*2] = CC[1+bs*2];
8451 D[2+bs*2] = CC[2+bs*2];
8452
8453 if(kn==3)
8454 return;
8455
8456 D[0+bs*3] = CC[0+bs*3];
8457 D[1+bs*3] = CC[1+bs*3];
8458 D[2+bs*3] = CC[2+bs*3];
8459 }
8460 else if(km>=2)
8461 {
8462 D[0+bs*0] = CC[0+bs*0];
8463 D[1+bs*0] = CC[1+bs*0];
8464
8465 if(kn==1)
8466 return;
8467
8468 D[0+bs*1] = CC[0+bs*1];
8469 D[1+bs*1] = CC[1+bs*1];
8470
8471 if(kn==2)
8472 return;
8473
8474 D[0+bs*2] = CC[0+bs*2];
8475 D[1+bs*2] = CC[1+bs*2];
8476
8477 if(kn==3)
8478 return;
8479
8480 D[0+bs*3] = CC[0+bs*3];
8481 D[1+bs*3] = CC[1+bs*3];
8482 }
8483 else //if(km>=1)
8484 {
8485 D[0+bs*0] = CC[0+bs*0];
8486
8487 if(kn==1)
8488 return;
8489
8490 D[0+bs*1] = CC[0+bs*1];
8491
8492 if(kn==2)
8493 return;
8494
8495 D[0+bs*2] = CC[0+bs*2];
8496
8497 if(kn==3)
8498 return;
8499
8500 D[0+bs*3] = CC[0+bs*3];
8501 }
8502
8503 return;
8504
8505 }
8506 #endif
8507
8508
8509
8510 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
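// Triangular-solve kernel: D = E^{-1} * (C - A * B), where E is upper
// triangular and enters from the left, with its diagonal supplied already
// inverted in inv_diag_E; a backward substitution runs from the last row of
// the 4x4 block up to the first.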
8511 void kernel_dtrsm_nn_lu_inv_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E)
8512 {
8513
8514 const int bs = 4;
8515
8516 int k;
8517
8518 double
8519 tmp,
8520 a_0, a_1, a_2, a_3,
8521 b_0, b_1, b_2, b_3,
8522 e_00, e_01, e_02, e_03,
8523 e_11, e_12, e_13,
8524 e_22, e_23,
8525 e_33;
8526
8527 #if defined(TARGET_GENERIC)
8528 double CC[16] = {0};
8529 #else
8530 ALIGNED( double CC[16], 64 ) = {0};
8531 #endif
8532
8533 double alpha1 = -1.0;
8534 double beta1 = 1.0;
8535
8536 kernel_dgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, &beta1, C, CC);
8537
8538 // solve
8539
8540 e_03 = E[0+bs*3];
8541 e_13 = E[1+bs*3];
8542 e_23 = E[2+bs*3];
8543 e_33 = inv_diag_E[3];
8544 CC[3+bs*0] *= e_33;
8545 CC[3+bs*1] *= e_33;
8546 CC[3+bs*2] *= e_33;
8547 CC[3+bs*3] *= e_33;
8548 CC[0+bs*0] -= e_03 * CC[3+bs*0];
8549 CC[0+bs*1] -= e_03 * CC[3+bs*1];
8550 CC[0+bs*2] -= e_03 * CC[3+bs*2];
8551 CC[0+bs*3] -= e_03 * CC[3+bs*3];
8552 CC[1+bs*0] -= e_13 * CC[3+bs*0];
8553 CC[1+bs*1] -= e_13 * CC[3+bs*1];
8554 CC[1+bs*2] -= e_13 * CC[3+bs*2];
8555 CC[1+bs*3] -= e_13 * CC[3+bs*3];
8556 CC[2+bs*0] -= e_23 * CC[3+bs*0];
8557 CC[2+bs*1] -= e_23 * CC[3+bs*1];
8558 CC[2+bs*2] -= e_23 * CC[3+bs*2];
8559 CC[2+bs*3] -= e_23 * CC[3+bs*3];
8560
8561 e_02 = E[0+bs*2];
8562 e_12 = E[1+bs*2];
8563 e_22 = inv_diag_E[2];
8564 CC[2+bs*0] *= e_22;
8565 CC[2+bs*1] *= e_22;
8566 CC[2+bs*2] *= e_22;
8567 CC[2+bs*3] *= e_22;
8568 CC[0+bs*0] -= e_02 * CC[2+bs*0];
8569 CC[0+bs*1] -= e_02 * CC[2+bs*1];
8570 CC[0+bs*2] -= e_02 * CC[2+bs*2];
8571 CC[0+bs*3] -= e_02 * CC[2+bs*3];
8572 CC[1+bs*0] -= e_12 * CC[2+bs*0];
8573 CC[1+bs*1] -= e_12 * CC[2+bs*1];
8574 CC[1+bs*2] -= e_12 * CC[2+bs*2];
8575 CC[1+bs*3] -= e_12 * CC[2+bs*3];
8576
8577 e_01 = E[0+bs*1];
8578 e_11 = inv_diag_E[1];
8579 CC[1+bs*0] *= e_11;
8580 CC[1+bs*1] *= e_11;
8581 CC[1+bs*2] *= e_11;
8582 CC[1+bs*3] *= e_11;
8583 CC[0+bs*0] -= e_01 * CC[1+bs*0];
8584 CC[0+bs*1] -= e_01 * CC[1+bs*1];
8585 CC[0+bs*2] -= e_01 * CC[1+bs*2];
8586 CC[0+bs*3] -= e_01 * CC[1+bs*3];
8587
8588 e_00 = inv_diag_E[0];
8589 CC[0+bs*0] *= e_00;
8590 CC[0+bs*1] *= e_00;
8591 CC[0+bs*2] *= e_00;
8592 CC[0+bs*3] *= e_00;
8593
8594 D[0+bs*0] = CC[0+bs*0];
8595 D[1+bs*0] = CC[1+bs*0];
8596 D[2+bs*0] = CC[2+bs*0];
8597 D[3+bs*0] = CC[3+bs*0];
8598
8599 D[0+bs*1] = CC[0+bs*1];
8600 D[1+bs*1] = CC[1+bs*1];
8601 D[2+bs*1] = CC[2+bs*1];
8602 D[3+bs*1] = CC[3+bs*1];
8603
8604 D[0+bs*2] = CC[0+bs*2];
8605 D[1+bs*2] = CC[1+bs*2];
8606 D[2+bs*2] = CC[2+bs*2];
8607 D[3+bs*2] = CC[3+bs*2];
8608
8609 D[0+bs*3] = CC[0+bs*3];
8610 D[1+bs*3] = CC[1+bs*3];
8611 D[2+bs*3] = CC[2+bs*3];
8612 D[3+bs*3] = CC[3+bs*3];
8613
8614 return;
8615
8616 }
8617 #endif
8618
8619
8620
8621 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
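// Variable-size variant of kernel_dtrsm_nn_lu_inv_4x4_lib4: elimination steps
// involving rows outside the first km are skipped, and the km x kn corner of
// the result is stored into D.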
8622 void kernel_dtrsm_nn_lu_inv_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, double *inv_diag_E, int km, int kn)
8623 {
8624
8625 const int bs = 4;
8626
8627 int k;
8628
8629 double
8630 tmp,
8631 a_0, a_1, a_2, a_3,
8632 b_0, b_1, b_2, b_3,
8633 e_00, e_01, e_02, e_03,
8634 e_11, e_12, e_13,
8635 e_22, e_23,
8636 e_33;
8637
8638 #if defined(TARGET_GENERIC)
8639 double CC[16] = {0};
8640 #else
8641 ALIGNED( double CC[16], 64 ) = {0};
8642 #endif
8643
8644 double alpha1 = -1.0;
8645 double beta1 = 1.0;
8646
8647 kernel_dgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, &beta1, C, CC);
8648
8649 // solve
8650
8651 if(km>3)
8652 {
8653 e_03 = E[0+bs*3];
8654 e_13 = E[1+bs*3];
8655 e_23 = E[2+bs*3];
8656 e_33 = inv_diag_E[3];
8657 CC[3+bs*0] *= e_33;
8658 CC[3+bs*1] *= e_33;
8659 CC[3+bs*2] *= e_33;
8660 CC[3+bs*3] *= e_33;
8661 CC[0+bs*0] -= e_03 * CC[3+bs*0];
8662 CC[0+bs*1] -= e_03 * CC[3+bs*1];
8663 CC[0+bs*2] -= e_03 * CC[3+bs*2];
8664 CC[0+bs*3] -= e_03 * CC[3+bs*3];
8665 CC[1+bs*0] -= e_13 * CC[3+bs*0];
8666 CC[1+bs*1] -= e_13 * CC[3+bs*1];
8667 CC[1+bs*2] -= e_13 * CC[3+bs*2];
8668 CC[1+bs*3] -= e_13 * CC[3+bs*3];
8669 CC[2+bs*0] -= e_23 * CC[3+bs*0];
8670 CC[2+bs*1] -= e_23 * CC[3+bs*1];
8671 CC[2+bs*2] -= e_23 * CC[3+bs*2];
8672 CC[2+bs*3] -= e_23 * CC[3+bs*3];
8673 }
8674
8675 if(km>2)
8676 {
8677 e_02 = E[0+bs*2];
8678 e_12 = E[1+bs*2];
8679 e_22 = inv_diag_E[2];
8680 CC[2+bs*0] *= e_22;
8681 CC[2+bs*1] *= e_22;
8682 CC[2+bs*2] *= e_22;
8683 CC[2+bs*3] *= e_22;
8684 CC[0+bs*0] -= e_02 * CC[2+bs*0];
8685 CC[0+bs*1] -= e_02 * CC[2+bs*1];
8686 CC[0+bs*2] -= e_02 * CC[2+bs*2];
8687 CC[0+bs*3] -= e_02 * CC[2+bs*3];
8688 CC[1+bs*0] -= e_12 * CC[2+bs*0];
8689 CC[1+bs*1] -= e_12 * CC[2+bs*1];
8690 CC[1+bs*2] -= e_12 * CC[2+bs*2];
8691 CC[1+bs*3] -= e_12 * CC[2+bs*3];
8692 }
8693
8694 if(km>1)
8695 {
8696 e_01 = E[0+bs*1];
8697 e_11 = inv_diag_E[1];
8698 CC[1+bs*0] *= e_11;
8699 CC[1+bs*1] *= e_11;
8700 CC[1+bs*2] *= e_11;
8701 CC[1+bs*3] *= e_11;
8702 CC[0+bs*0] -= e_01 * CC[1+bs*0];
8703 CC[0+bs*1] -= e_01 * CC[1+bs*1];
8704 CC[0+bs*2] -= e_01 * CC[1+bs*2];
8705 CC[0+bs*3] -= e_01 * CC[1+bs*3];
8706 }
8707
8708 e_00 = inv_diag_E[0];
8709 CC[0+bs*0] *= e_00;
8710 CC[0+bs*1] *= e_00;
8711 CC[0+bs*2] *= e_00;
8712 CC[0+bs*3] *= e_00;
8713
8714 store:
8715
8716 if(km>=4)
8717 {
8718 D[0+bs*0] = CC[0+bs*0];
8719 D[1+bs*0] = CC[1+bs*0];
8720 D[2+bs*0] = CC[2+bs*0];
8721 D[3+bs*0] = CC[3+bs*0];
8722
8723 if(kn==1)
8724 return;
8725
8726 D[0+bs*1] = CC[0+bs*1];
8727 D[1+bs*1] = CC[1+bs*1];
8728 D[2+bs*1] = CC[2+bs*1];
8729 D[3+bs*1] = CC[3+bs*1];
8730
8731 if(kn==2)
8732 return;
8733
8734 D[0+bs*2] = CC[0+bs*2];
8735 D[1+bs*2] = CC[1+bs*2];
8736 D[2+bs*2] = CC[2+bs*2];
8737 D[3+bs*2] = CC[3+bs*2];
8738
8739 if(kn==3)
8740 return;
8741
8742 D[0+bs*3] = CC[0+bs*3];
8743 D[1+bs*3] = CC[1+bs*3];
8744 D[2+bs*3] = CC[2+bs*3];
8745 D[3+bs*3] = CC[3+bs*3];
8746 }
8747 else if(km>=3)
8748 {
8749 D[0+bs*0] = CC[0+bs*0];
8750 D[1+bs*0] = CC[1+bs*0];
8751 D[2+bs*0] = CC[2+bs*0];
8752
8753 if(kn==1)
8754 return;
8755
8756 D[0+bs*1] = CC[0+bs*1];
8757 D[1+bs*1] = CC[1+bs*1];
8758 D[2+bs*1] = CC[2+bs*1];
8759
8760 if(kn==2)
8761 return;
8762
8763 D[0+bs*2] = CC[0+bs*2];
8764 D[1+bs*2] = CC[1+bs*2];
8765 D[2+bs*2] = CC[2+bs*2];
8766
8767 if(kn==3)
8768 return;
8769
8770 D[0+bs*3] = CC[0+bs*3];
8771 D[1+bs*3] = CC[1+bs*3];
8772 D[2+bs*3] = CC[2+bs*3];
8773 }
8774 else if(km>=2)
8775 {
8776 D[0+bs*0] = CC[0+bs*0];
8777 D[1+bs*0] = CC[1+bs*0];
8778
8779 if(kn==1)
8780 return;
8781
8782 D[0+bs*1] = CC[0+bs*1];
8783 D[1+bs*1] = CC[1+bs*1];
8784
8785 if(kn==2)
8786 return;
8787
8788 D[0+bs*2] = CC[0+bs*2];
8789 D[1+bs*2] = CC[1+bs*2];
8790
8791 if(kn==3)
8792 return;
8793
8794 D[0+bs*3] = CC[0+bs*3];
8795 D[1+bs*3] = CC[1+bs*3];
8796 }
8797 else //if(km>=1)
8798 {
8799 D[0+bs*0] = CC[0+bs*0];
8800
8801 if(kn==1)
8802 return;
8803
8804 D[0+bs*1] = CC[0+bs*1];
8805
8806 if(kn==2)
8807 return;
8808
8809 D[0+bs*2] = CC[0+bs*2];
8810
8811 if(kn==3)
8812 return;
8813
8814 D[0+bs*3] = CC[0+bs*3];
8815 }
8816
8817 return;
8818
8819 }
8820 #endif
8821
8822
8823
8824
8825
8826 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
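// Triangular-solve kernel: D = E^{-1} * (C - A * B), where E is unit upper
// triangular (implicit ones on the diagonal); the backward substitution only
// subtracts the strictly upper part of E, with no diagonal scaling.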
8827 void kernel_dtrsm_nn_lu_one_4x4_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E)
8828 {
8829
8830 const int bs = 4;
8831
8832 int k;
8833
8834 double
8835 tmp,
8836 a_0, a_1, a_2, a_3,
8837 b_0, b_1, b_2, b_3,
8838 e_01, e_02, e_03,
8839 e_12, e_13,
8840 e_23;
8841
8842 #if defined(TARGET_GENERIC)
8843 double CC[16] = {0};
8844 #else
8845 ALIGNED( double CC[16], 64 ) = {0};
8846 #endif
8847
8848 double alpha1 = -1.0;
8849 double beta1 = 1.0;
8850
8851 kernel_dgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, &beta1, C, CC);
8852
8853 // solve
8854
8855 e_03 = E[0+bs*3];
8856 e_13 = E[1+bs*3];
8857 e_23 = E[2+bs*3];
8858 CC[0+bs*0] -= e_03 * CC[3+bs*0];
8859 CC[0+bs*1] -= e_03 * CC[3+bs*1];
8860 CC[0+bs*2] -= e_03 * CC[3+bs*2];
8861 CC[0+bs*3] -= e_03 * CC[3+bs*3];
8862 CC[1+bs*0] -= e_13 * CC[3+bs*0];
8863 CC[1+bs*1] -= e_13 * CC[3+bs*1];
8864 CC[1+bs*2] -= e_13 * CC[3+bs*2];
8865 CC[1+bs*3] -= e_13 * CC[3+bs*3];
8866 CC[2+bs*0] -= e_23 * CC[3+bs*0];
8867 CC[2+bs*1] -= e_23 * CC[3+bs*1];
8868 CC[2+bs*2] -= e_23 * CC[3+bs*2];
8869 CC[2+bs*3] -= e_23 * CC[3+bs*3];
8870
8871 e_02 = E[0+bs*2];
8872 e_12 = E[1+bs*2];
8873 CC[0+bs*0] -= e_02 * CC[2+bs*0];
8874 CC[0+bs*1] -= e_02 * CC[2+bs*1];
8875 CC[0+bs*2] -= e_02 * CC[2+bs*2];
8876 CC[0+bs*3] -= e_02 * CC[2+bs*3];
8877 CC[1+bs*0] -= e_12 * CC[2+bs*0];
8878 CC[1+bs*1] -= e_12 * CC[2+bs*1];
8879 CC[1+bs*2] -= e_12 * CC[2+bs*2];
8880 CC[1+bs*3] -= e_12 * CC[2+bs*3];
8881
8882 e_01 = E[0+bs*1];
8883 CC[0+bs*0] -= e_01 * CC[1+bs*0];
8884 CC[0+bs*1] -= e_01 * CC[1+bs*1];
8885 CC[0+bs*2] -= e_01 * CC[1+bs*2];
8886 CC[0+bs*3] -= e_01 * CC[1+bs*3];
8887
8888 D[0+bs*0] = CC[0+bs*0];
8889 D[1+bs*0] = CC[1+bs*0];
8890 D[2+bs*0] = CC[2+bs*0];
8891 D[3+bs*0] = CC[3+bs*0];
8892
8893 D[0+bs*1] = CC[0+bs*1];
8894 D[1+bs*1] = CC[1+bs*1];
8895 D[2+bs*1] = CC[2+bs*1];
8896 D[3+bs*1] = CC[3+bs*1];
8897
8898 D[0+bs*2] = CC[0+bs*2];
8899 D[1+bs*2] = CC[1+bs*2];
8900 D[2+bs*2] = CC[2+bs*2];
8901 D[3+bs*2] = CC[3+bs*2];
8902
8903 D[0+bs*3] = CC[0+bs*3];
8904 D[1+bs*3] = CC[1+bs*3];
8905 D[2+bs*3] = CC[2+bs*3];
8906 D[3+bs*3] = CC[3+bs*3];
8907
8908 return;
8909
8910 }
8911 #endif
8912
8913
8914
8915 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
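// Variable-size variant of kernel_dtrsm_nn_lu_one_4x4_lib4: elimination steps
// involving rows outside the first km are skipped, and the km x kn corner of
// the result is stored into D.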
8916 void kernel_dtrsm_nn_lu_one_4x4_vs_lib4(int kmax, double *A, double *B, int sdb, double *C, double *D, double *E, int km, int kn)
8917 {
8918
8919 const int bs = 4;
8920
8921 int k;
8922
8923 double
8924 tmp,
8925 a_0, a_1, a_2, a_3,
8926 b_0, b_1, b_2, b_3,
8927 e_01, e_02, e_03,
8928 e_12, e_13,
8929 e_23;
8930
8931 #if defined(TARGET_GENERIC)
8932 double CC[16] = {0};
8933 #else
8934 ALIGNED( double CC[16], 64 ) = {0};
8935 #endif
8936
8937 double alpha1 = -1.0;
8938 double beta1 = 1.0;
8939
8940 kernel_dgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, &beta1, C, CC);
8941
8942 // solve
8943
8944 if(km>3)
8945 {
8946 e_03 = E[0+bs*3];
8947 e_13 = E[1+bs*3];
8948 e_23 = E[2+bs*3];
8949 CC[0+bs*0] -= e_03 * CC[3+bs*0];
8950 CC[0+bs*1] -= e_03 * CC[3+bs*1];
8951 CC[0+bs*2] -= e_03 * CC[3+bs*2];
8952 CC[0+bs*3] -= e_03 * CC[3+bs*3];
8953 CC[1+bs*0] -= e_13 * CC[3+bs*0];
8954 CC[1+bs*1] -= e_13 * CC[3+bs*1];
8955 CC[1+bs*2] -= e_13 * CC[3+bs*2];
8956 CC[1+bs*3] -= e_13 * CC[3+bs*3];
8957 CC[2+bs*0] -= e_23 * CC[3+bs*0];
8958 CC[2+bs*1] -= e_23 * CC[3+bs*1];
8959 CC[2+bs*2] -= e_23 * CC[3+bs*2];
8960 CC[2+bs*3] -= e_23 * CC[3+bs*3];
8961 }
8962
8963 if(km>2)
8964 {
8965 e_02 = E[0+bs*2];
8966 e_12 = E[1+bs*2];
8967 CC[0+bs*0] -= e_02 * CC[2+bs*0];
8968 CC[0+bs*1] -= e_02 * CC[2+bs*1];
8969 CC[0+bs*2] -= e_02 * CC[2+bs*2];
8970 CC[0+bs*3] -= e_02 * CC[2+bs*3];
8971 CC[1+bs*0] -= e_12 * CC[2+bs*0];
8972 CC[1+bs*1] -= e_12 * CC[2+bs*1];
8973 CC[1+bs*2] -= e_12 * CC[2+bs*2];
8974 CC[1+bs*3] -= e_12 * CC[2+bs*3];
8975 }
8976
8977 if(km>1)
8978 {
8979 e_01 = E[0+bs*1];
8980 CC[0+bs*0] -= e_01 * CC[1+bs*0];
8981 CC[0+bs*1] -= e_01 * CC[1+bs*1];
8982 CC[0+bs*2] -= e_01 * CC[1+bs*2];
8983 CC[0+bs*3] -= e_01 * CC[1+bs*3];
8984 }
8985
8986 store:
8987
8988 if(km>=4)
8989 {
8990 D[0+bs*0] = CC[0+bs*0];
8991 D[1+bs*0] = CC[1+bs*0];
8992 D[2+bs*0] = CC[2+bs*0];
8993 D[3+bs*0] = CC[3+bs*0];
8994
8995 if(kn==1)
8996 return;
8997
8998 D[0+bs*1] = CC[0+bs*1];
8999 D[1+bs*1] = CC[1+bs*1];
9000 D[2+bs*1] = CC[2+bs*1];
9001 D[3+bs*1] = CC[3+bs*1];
9002
9003 if(kn==2)
9004 return;
9005
9006 D[0+bs*2] = CC[0+bs*2];
9007 D[1+bs*2] = CC[1+bs*2];
9008 D[2+bs*2] = CC[2+bs*2];
9009 D[3+bs*2] = CC[3+bs*2];
9010
9011 if(kn==3)
9012 return;
9013
9014 D[0+bs*3] = CC[0+bs*3];
9015 D[1+bs*3] = CC[1+bs*3];
9016 D[2+bs*3] = CC[2+bs*3];
9017 D[3+bs*3] = CC[3+bs*3];
9018 }
9019 else if(km>=3)
9020 {
9021 D[0+bs*0] = CC[0+bs*0];
9022 D[1+bs*0] = CC[1+bs*0];
9023 D[2+bs*0] = CC[2+bs*0];
9024
9025 if(kn==1)
9026 return;
9027
9028 D[0+bs*1] = CC[0+bs*1];
9029 D[1+bs*1] = CC[1+bs*1];
9030 D[2+bs*1] = CC[2+bs*1];
9031
9032 if(kn==2)
9033 return;
9034
9035 D[0+bs*2] = CC[0+bs*2];
9036 D[1+bs*2] = CC[1+bs*2];
9037 D[2+bs*2] = CC[2+bs*2];
9038
9039 if(kn==3)
9040 return;
9041
9042 D[0+bs*3] = CC[0+bs*3];
9043 D[1+bs*3] = CC[1+bs*3];
9044 D[2+bs*3] = CC[2+bs*3];
9045 }
9046 else if(km>=2)
9047 {
9048 D[0+bs*0] = CC[0+bs*0];
9049 D[1+bs*0] = CC[1+bs*0];
9050
9051 if(kn==1)
9052 return;
9053
9054 D[0+bs*1] = CC[0+bs*1];
9055 D[1+bs*1] = CC[1+bs*1];
9056
9057 if(kn==2)
9058 return;
9059
9060 D[0+bs*2] = CC[0+bs*2];
9061 D[1+bs*2] = CC[1+bs*2];
9062
9063 if(kn==3)
9064 return;
9065
9066 D[0+bs*3] = CC[0+bs*3];
9067 D[1+bs*3] = CC[1+bs*3];
9068 }
9069 else //if(km>=1)
9070 {
9071 D[0+bs*0] = CC[0+bs*0];
9072
9073 if(kn==1)
9074 return;
9075
9076 D[0+bs*1] = CC[0+bs*1];
9077
9078 if(kn==2)
9079 return;
9080
9081 D[0+bs*2] = CC[0+bs*2];
9082
9083 if(kn==3)
9084 return;
9085
9086 D[0+bs*3] = CC[0+bs*3];
9087 }
9088
9089 return;
9090
9091 }
9092 #endif
9093
9094
9095
9096 #if defined(BLAS_API)
9097
9098 #include "kernel_dgemm_4x4_lib.c"
9099
9100 #endif
9101
9102