1 /**************************************************************************************************
2 * *
3 * This file is part of BLASFEO. *
4 * *
5 * BLASFEO -- BLAS For Embedded Optimization. *
6 * Copyright (C) 2019 by Gianluca Frison. *
7 * Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
8 * All rights reserved. *
9 * *
10 * The 2-Clause BSD License *
11 * *
12 * Redistribution and use in source and binary forms, with or without *
13 * modification, are permitted provided that the following conditions are met: *
14 * *
15 * 1. Redistributions of source code must retain the above copyright notice, this *
16 * list of conditions and the following disclaimer. *
17 * 2. Redistributions in binary form must reproduce the above copyright notice, *
18 * this list of conditions and the following disclaimer in the documentation *
19 * and/or other materials provided with the distribution. *
20 * *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND *
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED *
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE *
24 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR *
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
26 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; *
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND *
28 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT *
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
31 * *
32 * Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de *
33 * *
34 **************************************************************************************************/
35
36 #include <math.h>
37
38 #include <blasfeo_common.h>
39 #include <blasfeo_s_kernel.h>
40
41
42
43 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_AMD_BULLDOZER)
// 4x4 single-precision tile update: D = beta[0]*C + alpha[0]*acc, where
// acc[i+4*j] is the sum over k=0..kmax-1 of A[4*k+i] * B[4*k+j].
//
// kmax   number of consecutive 4-float groups read from A and from B
// alpha  scaling of the accumulated product, read as alpha[0]
// A, B   input buffers, read 4 floats per k and never written
// beta   scaling of C, read as beta[0]
// C, D   4x4 tiles addressed as [row + 4*col]; all 16 entries of C are read
//        and all 16 entries of D are written
//
// Each entry of C is read right before the matching entry of D is written,
// so passing the same tile as C and D works.
void kernel_sgemm_nt_4x4_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
	{

	const int bs = 4;

#if defined(TARGET_GENERIC)
	float acc[16] = {0};
#else
	ALIGNED( float acc[16], 64 ) = {0};
#endif

	float b_j;

	int kk, ii, jj;

	// one rank-1 update per k, applied in increasing k so that every
	// accumulator sees its products in the same order
	for(kk=0; kk<kmax; kk++)
		{
		for(jj=0; jj<bs; jj++)
			{
			b_j = B[jj];
			for(ii=0; ii<bs; ii++)
				{
				acc[ii+bs*jj] += A[ii] * b_j;
				}
			}
		A += bs;
		B += bs;
		}

	// scale and store, column by column
	for(jj=0; jj<bs; jj++)
		{
		for(ii=0; ii<bs; ii++)
			{
			D[ii+bs*jj] = beta[0]*C[ii+bs*jj] + alpha[0]*acc[ii+bs*jj];
			}
		}

	return;

	}
263 #endif
264
265
266
267 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9)
// Variable-size store variant of kernel_sgemm_nt_4x4_lib4.
// The full 4x4 result is computed into a local tile (so all 16 entries of C
// are still read), then only the top-left part of that tile is copied to D.
//
// km  number of rows stored: km>=4 stores 4, km==3 stores 3, km==2 stores 2,
//     any smaller value stores 1
// kn  number of columns stored: 1, 2 or 3 store that many columns, any other
//     value stores all 4
// The remaining arguments are forwarded unchanged to kernel_sgemm_nt_4x4_lib4.
void kernel_sgemm_nt_4x4_vs_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn)
	{

	const int bs = 4;

#if defined(TARGET_GENERIC)
	float tile[16] = {0};
#else
	ALIGNED( float tile[16], 64 ) = {0};
#endif

	int rows, cols, ii, jj;

	kernel_sgemm_nt_4x4_lib4(kmax, alpha, A, B, beta, C, tile);

	// row count: clamp km into 1..4
	rows = km>=4 ? 4 : (km>=3 ? 3 : (km>=2 ? 2 : 1));

	// column count: only kn equal to 1, 2 or 3 shortens the store
	cols = (kn>=1 && kn<=3) ? kn : 4;

	for(jj=0; jj<cols; jj++)
		{
		for(ii=0; ii<rows; ii++)
			{
			D[ii+bs*jj] = tile[ii+bs*jj];
			}
		}

	return;

	}
385 #endif
386
387
388
389 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Generalized variant of kernel_sgemm_nt_4x4_lib4: the C and D tiles may start
// at a row offset inside their 4-row panel, and only a window of the 4x4
// result is stored.
//
// kmax, alpha, A, B   forwarded to kernel_sgemm_nt_4x4_lib4
// beta                scaling of C, read as beta[0]
// offsetC, C0, sdc    tile row i is read from row offsetC+i of the panel at C0;
//                     rows that fall past row 3 are read from the panel at
//                     C0 + 4*sdc. offsetC values other than 0, 1, 2 are
//                     handled as 3. All 16 entries are read regardless of the
//                     store window.
// offsetD, D0, sdd    same addressing for the output tile (D0, D0 + 4*sdd)
// m0, m1              tile row i (0..3) is stored only if m0 <= i < m1
// n0, n1              for 0 <= n0, tile column j is stored only if
//                     n0 <= j < min(n1, 4); it lands in column j of D0
void kernel_sgemm_nt_4x4_gen_lib4(int kmax, float *alpha, float *A, float *B, float *beta, int offsetC, float *C0, int sdc, int offsetD, float *D0, int sdd, int m0, int m1, int n0, int n1)
	{

	const int bs = 4;

#if defined(TARGET_GENERIC)
	float CC[16] = {0};
#else
	ALIGNED( float CC[16], 64 ) = {0};
#endif

	float
		*src, *dst;

	int ii, jj, rr, offC, offD, shift, cols, kn;

	// CC = beta * C, gathering rows that straddle two panels
	offC = offsetC==0 ? 0 : (offsetC==1 ? 1 : (offsetC==2 ? 2 : 3));
	for(jj=0; jj<bs; jj++)
		{
		for(ii=0; ii<bs; ii++)
			{
			rr = offC + ii;
			// rows 0..3 live in the first panel, the rest in the next one
			src = rr<bs ? C0 + rr : C0 + sdc*bs + (rr-bs);
			CC[ii+bs*jj] = beta[0]*src[bs*jj];
			}
		}

	// CC already holds beta*C, so the inner kernel only has to add alpha*A*B^T
	float beta1 = 1.0;

	kernel_sgemm_nt_4x4_lib4(kmax, alpha, A, B, &beta1, CC, CC);

	// shift sol for cols: move column n0 of the result to column 0 and
	// advance D0 by the same number of columns
	if(n0>0)
		{
		shift = n0==1 ? 1 : (n0==2 ? 2 : 3);
		// source column is always to the right of the destination column,
		// so copying left to right never overwrites unread data
		for(jj=0; jj+shift<bs; jj++)
			{
			for(ii=0; ii<bs; ii++)
				{
				CC[ii+bs*jj] = CC[ii+bs*(jj+shift)];
				}
			}
		D0 += shift*bs;
		}

	n1 = 4<n1 ? 4 : n1;
	kn = n1 - n0;

	if(kn<=0)
		return;

	// never store more than the 4 columns held in CC
	cols = kn<bs ? kn : bs;

	offD = offsetD==0 ? 0 : (offsetD==1 ? 1 : (offsetD==2 ? 2 : 3));
	for(jj=0; jj<cols; jj++)
		{
		for(ii=0; ii<bs; ii++)
			{
			if(m0<=ii && m1>ii)
				{
				rr = offD + ii;
				dst = rr<bs ? D0 + rr : D0 + sdd*bs + (rr-bs);
				dst[bs*jj] = CC[ii+bs*jj];
				}
			}
		}

	return;

	}
701 #endif
702
703
704
705 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X64_AMD_BULLDOZER)
// 4x4 single-precision tile update: D = beta[0]*C + alpha[0]*acc, where
// acc[i+4*j] is the sum over k=0..kmax-1 of A[4*k+i] * (row k, column j of B).
//
// kmax     number of rows of B (and 4-float groups of A) to consume; for
//          kmax<=0 nothing is read from A or B and D = beta[0]*C
// alpha    scaling of the accumulated product, read as alpha[0]
// A        read 4 floats per k
// offsetB  row inside the first 4-row panel of B at which reading starts;
//          values other than 0, 1, 2 are handled as 3
// B        first panel of B; inside a panel, row r column j is at [r + 4*j]
// sdb      consecutive panels of B are 4*sdb floats apart
// beta     scaling of C, read as beta[0]
// C, D     4x4 tiles addressed as [row + 4*col]; each entry of C is read right
//          before the matching entry of D is written
void kernel_sgemm_nn_4x4_lib4(int kmax, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D)
	{

	const int bs = 4;

#if defined(TARGET_GENERIC)
	float CC[16] = {0};
#else
	ALIGNED( float CC[16], 64 ) = {0};
#endif

	float b_j;

	int k, kk, ii, jj, off, head;

	k = 0;

	// leading rows: finish the panel in which B starts
	if(offsetB!=0)
		{
		off = offsetB==1 ? 1 : (offsetB==2 ? 2 : 3);
		B += off;

		// the k<kmax test also guards the very first row, so kmax<=0
		// contributes nothing
		for(head=bs-off; head>0 && k<kmax; head--)
			{
			for(jj=0; jj<bs; jj++)
				{
				b_j = B[bs*jj];
				for(ii=0; ii<bs; ii++)
					{
					CC[ii+bs*jj] += A[ii] * b_j;
					}
				}
			A += bs;
			B += 1;
			k += 1;
			}

		// rows remain only if the whole panel was consumed; B is then one
		// past its last row, so this lands on row 0 of the next panel
		if(k<kmax)
			B += bs*(sdb-1);
		}

	// whole panels: 4 rows of B per iteration
	for(; k<kmax-3; k+=4)
		{
		for(kk=0; kk<bs; kk++)
			{
			for(jj=0; jj<bs; jj++)
				{
				b_j = B[kk+bs*jj];
				for(ii=0; ii<bs; ii++)
					{
					CC[ii+bs*jj] += A[ii+bs*kk] * b_j;
					}
				}
			}
		A += 16;
		B += 4*sdb;
		}

	// trailing rows: fewer than 4 left, all in the current panel
	for(; k<kmax; k++)
		{
		for(jj=0; jj<bs; jj++)
			{
			b_j = B[bs*jj];
			for(ii=0; ii<bs; ii++)
				{
				CC[ii+bs*jj] += A[ii] * b_j;
				}
			}
		A += 4;
		B += 1;
		}

	// scale and store, column by column
	for(jj=0; jj<bs; jj++)
		{
		for(ii=0; ii<bs; ii++)
			{
			D[ii+bs*jj] = beta[0]*C[ii+bs*jj] + alpha[0]*CC[ii+bs*jj];
			}
		}

	return;

	}
1167 #endif
1168
1169
1170
1171 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9)
// Variable-size store variant of kernel_sgemm_nn_4x4_lib4.
// The full 4x4 result is computed into a local tile (so all 16 entries of C
// are still read), then only the top-left part of that tile is copied to D.
//
// km  number of rows stored: km>=4 stores 4, km==3 stores 3, km==2 stores 2,
//     any smaller value stores 1
// kn  number of columns stored: 1, 2 or 3 store that many columns, any other
//     value stores all 4
// The remaining arguments are forwarded unchanged to kernel_sgemm_nn_4x4_lib4.
void kernel_sgemm_nn_4x4_vs_lib4(int kmax, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D, int km, int kn)
	{

	const int bs = 4;

#if defined(TARGET_GENERIC)
	float tile[16] = {0};
#else
	ALIGNED( float tile[16], 64 ) = {0};
#endif

	int rows, cols, ii, jj;

	kernel_sgemm_nn_4x4_lib4(kmax, alpha, A, offsetB, B, sdb, beta, C, tile);

	// row count: clamp km into 1..4
	rows = km>=4 ? 4 : (km>=3 ? 3 : (km>=2 ? 2 : 1));

	// column count: only kn equal to 1, 2 or 3 shortens the store
	cols = (kn>=1 && kn<=3) ? kn : 4;

	for(jj=0; jj<cols; jj++)
		{
		for(ii=0; ii<rows; ii++)
			{
			D[ii+bs*jj] = tile[ii+bs*jj];
			}
		}

	return;

	}
1289 #endif
1290
1291
1292
#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
/*
 * "gen" variant of the 4x4 sgemm nn kernel: C and D may start at any row of a
 * 4-row panel, and only a sub-window of the 4x4 result is written back.
 *
 * Steps:
 *   1. a local 4x4 tile CC is filled with beta[0]*C;
 *   2. kernel_sgemm_nn_4x4_lib4 is called with beta=1 and CC as both its C and
 *      D operands (presumably accumulating alpha*A*B on top of CC);
 *   3. the tile columns are shifted left by n0 (values above 3 shift as 3) and
 *      D0 is advanced by the same number of columns;
 *   4. min(n1,4)-n0 columns are stored, and within each column only the rows
 *      ii with m0<=ii<m1.
 *
 * Addressing: element (i,j) of a panel is at [i+bs*j] with bs=4. A tile that
 * starts at row offsetC (resp. offsetD) of the first panel continues in the
 * panel located sdc*bs (resp. sdd*bs) floats further. Offsets other than
 * 0, 1, 2 are handled as 3.
 */
void kernel_sgemm_nn_4x4_gen_lib4(int kmax, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, int offsetC, float *C0, int sdc, int offsetD, float *D0, int sdd, int m0, int m1, int n0, int n1)
	{

	const int bs = 4;

#if defined(TARGET_GENERIC)
	float CC[16] = {0};
#else
	ALIGNED( float CC[16], 64 ) = {0};
#endif

	float *C1, *D1;
	float *src, *dst;
	float one = 1.0;
	int ii, jj, row;
	int offc, offd, shift, ncols;

	// row of the first panel of C where the tile starts
	offc = (offsetC==0 || offsetC==1 || offsetC==2) ? offsetC : 3;

	// the second panel is only touched when the tile straddles two panels
	C1 = C0;
	if(offc!=0)
		C1 = C0 + sdc*bs;

	// CC = beta * C
	for(jj=0; jj<bs; jj++)
		{
		for(ii=0; ii<bs; ii++)
			{
			row = ii + offc;
			src = row<bs ? C0 + row : C1 + (row-bs);
			CC[ii+bs*jj] = beta[0]*src[bs*jj];
			}
		}

	kernel_sgemm_nn_4x4_lib4(kmax, alpha, A, offsetB, B, sdb, &one, CC, CC);

	// shift sol for cols: drop the first n0 columns of the tile
	shift = 0;
	if(n0>0)
		shift = (n0==1 || n0==2) ? n0 : 3;

	if(shift>0)
		{
		// ascending order, so every source column is read before it is overwritten
		for(jj=0; jj+shift<bs; jj++)
			{
			for(ii=0; ii<bs; ii++)
				{
				CC[ii+bs*jj] = CC[ii+bs*(jj+shift)];
				}
			}
		D0 += shift*bs;
		}

	// number of columns to store
	ncols = (4<n1 ? 4 : n1) - n0;

	// row of the first panel of D where the tile starts
	offd = (offsetD==0 || offsetD==1 || offsetD==2) ? offsetD : 3;

	D1 = D0;
	if(offd!=0)
		D1 = D0 + sdd*bs;

	for(jj=0; jj<bs && jj<ncols; jj++)
		{
		for(ii=0; ii<bs; ii++)
			{
			// keep only rows in [m0, m1)
			if(m0>ii || m1<=ii)
				continue;
			row = ii + offd;
			dst = row<bs ? D0 + row : D1 + (row-bs);
			dst[bs*jj] = CC[ii+bs*jj];
			}
		}

	return;

	}
#endif
1604
1605
1606
#if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) //|| defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
/*
 * 4x4 ssyrk nt kernel, lower-triangular store.
 *
 * The full 4x4 tile is produced by kernel_sgemm_nt_4x4_lib4 into a local
 * buffer; only its entries with row >= column are then copied to D, so the
 * strictly upper part of D is left untouched. Element (i,j) is at [i+bs*j]
 * with bs=4.
 */
void kernel_ssyrk_nt_l_4x4_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
	{

	const int bs = 4;

#if defined(TARGET_GENERIC)
	float CC[16] = {0};
#else
	ALIGNED( float CC[16], 64 ) = {0};
#endif

	int ii, jj;

	kernel_sgemm_nt_4x4_lib4(kmax, alpha, A, B, beta, C, CC);

	// copy the lower triangle, column by column
	for(jj=0; jj<bs; jj++)
		{
		for(ii=jj; ii<bs; ii++)
			{
			D[ii+bs*jj] = CC[ii+bs*jj];
			}
		}

	return;

	}
#endif
1639
1640
1641
#if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) //|| defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
/*
 * Variable-size ("vs") 4x4 ssyrk nt kernel, lower-triangular store.
 *
 * The full 4x4 tile is produced by kernel_sgemm_nt_4x4_lib4 into a local
 * buffer; the store is then limited by km and kn:
 *   - km selects how many rows are stored: 4 if km>=4, 3 if km==3, 2 if km==2,
 *     and 1 for any smaller value;
 *   - only entries with row >= column inside that row range are written;
 *   - the function returns right after column kn-1 has been written when kn is
 *     1, 2 or 3; any other kn value does not cut the store short.
 * Element (i,j) is at [i+bs*j] with bs=4.
 */
void kernel_ssyrk_nt_l_4x4_vs_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn)
	{

	const int bs = 4;

#if defined(TARGET_GENERIC)
	float CC[16] = {0};
#else
	ALIGNED( float CC[16], 64 ) = {0};
#endif

	int ii, jj, nrows;

	kernel_sgemm_nt_4x4_lib4(kmax, alpha, A, B, beta, C, CC);

	// number of rows to store; anything below 2 stores a single row
	nrows = km>=4 ? 4 : (km>=3 ? 3 : (km>=2 ? 2 : 1));

	for(jj=0; jj<nrows; jj++)
		{
		for(ii=jj; ii<nrows; ii++)
			{
			D[ii+bs*jj] = CC[ii+bs*jj];
			}

		// stop once exactly kn columns have been written
		if(kn==jj+1)
			return;
		}

	return;

	}
#endif
1717
1718
1719
#if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
/*
 * "gen" variant of the 4x4 ssyrk nt kernel with lower-triangular store: C and
 * D may start at any row of a 4-row panel, and only a sub-window of the
 * triangle is written back.
 *
 * Steps:
 *   1. the lower triangle (row >= column) of a local 4x4 tile CC is filled
 *      with beta[0]*C; the strictly upper part stays zero;
 *   2. kernel_sgemm_nt_4x4_lib4 is called with beta=1 and CC as both its C and
 *      D operands (presumably accumulating alpha*A*B^T on top of CC);
 *   3. the tile columns are shifted left by n0 (values above 3 shift as 3) and
 *      D0 is advanced by the same number of columns;
 *   4. with r0 = max(m0,0), stored column jj holds tile rows r0+jj .. 3 that
 *      are also below m1; at most min(n1,4)-n0 columns are stored, and nothing
 *      is stored when m0>3.
 *
 * Addressing: element (i,j) of a panel is at [i+bs*j] with bs=4. A tile that
 * starts at row offsetC (resp. offsetD) of the first panel continues in the
 * panel located sdc*bs (resp. sdd*bs) floats further. Offsets other than
 * 0, 1, 2 are handled as 3.
 */
void kernel_ssyrk_nt_l_4x4_gen_lib4(int kmax, float *alpha, float *A, float *B, float *beta, int offsetC, float *C0, int sdc, int offsetD, float *D0, int sdd, int m0, int m1, int n0, int n1)
	{

	const int bs = 4;

#if defined(TARGET_GENERIC)
	float CC[16] = {0};
#else
	ALIGNED( float CC[16], 64 ) = {0};
#endif

	float *C1, *D1;
	float *src, *dst;
	float one = 1.0;
	int ii, jj, row;
	int offc, offd, shift, ncols, r0;

	// row of the first panel of C where the tile starts
	offc = (offsetC==0 || offsetC==1 || offsetC==2) ? offsetC : 3;

	// the second panel is only touched when the tile straddles two panels
	C1 = C0;
	if(offc!=0)
		C1 = C0 + sdc*bs;

	// lower triangle of CC = beta * C
	for(jj=0; jj<bs; jj++)
		{
		for(ii=jj; ii<bs; ii++)
			{
			row = ii + offc;
			src = row<bs ? C0 + row : C1 + (row-bs);
			CC[ii+bs*jj] = beta[0]*src[bs*jj];
			}
		}

	kernel_sgemm_nt_4x4_lib4(kmax, alpha, A, B, &one, CC, CC);

	// shift sol for cols: drop the first n0 columns of the tile
	shift = 0;
	if(n0>0)
		shift = (n0==1 || n0==2) ? n0 : 3;

	if(shift>0)
		{
		// ascending order, so every source column is read before it is overwritten
		for(jj=0; jj+shift<bs; jj++)
			{
			for(ii=0; ii<bs; ii++)
				{
				CC[ii+bs*jj] = CC[ii+bs*(jj+shift)];
				}
			}
		D0 += shift*bs;
		}

	// number of columns to store
	ncols = (4<n1 ? 4 : n1) - n0;

	// no row of the tile is selected
	if(m0>3)
		return;

	// first stored row of column 0; each following column starts one row lower
	r0 = m0<=0 ? 0 : m0;

	// row of the first panel of D where the tile starts
	offd = (offsetD==0 || offsetD==1 || offsetD==2) ? offsetD : 3;

	D1 = D0;
	if(offd!=0)
		D1 = D0 + sdd*bs;

	for(jj=0; jj+r0<bs && jj<ncols; jj++)
		{
		for(ii=r0+jj; ii<bs; ii++)
			{
			// rows at or beyond m1 are not stored
			if(m1<=ii)
				continue;
			row = ii + offd;
			dst = row<bs ? D0 + row : D1 + (row-bs);
			dst[bs*jj] = CC[ii+bs*jj];
			}
		}

	return;

	}
#endif
2152
2153
2154
#if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
/*
 * 4x4 strmm nt kernel (the name suggests B is the right, upper-triangular
 * factor - confirm against the caller).
 *
 * The first min(kmax,3) steps are done here: at step k only B[0..k] of the
 * current 4-float group of B is used, so step k updates columns 0..k of the
 * local tile CC with A[0..3]*B[j]. A and B both advance by 4 floats per step.
 * The remaining kmax-k steps are delegated to kernel_sgemm_nt_4x4_lib4, which
 * receives alpha as both of its scaling factors and CC as its C operand, and
 * writes the result to D.
 */
void kernel_strmm_nt_ru_4x4_lib4(int kmax, float *alpha, float *A, float *B, float *D)
	{

	const int bs = 4;

#if defined(TARGET_GENERIC)
	float CC[16] = {0};
#else
	ALIGNED( float CC[16], 64 ) = {0};
#endif

	int ii, jj, k;

	// triangular head: step k touches columns 0..k only
	for(k=0; k<3 && k<kmax; k++)
		{
		for(jj=0; jj<=k; jj++)
			{
			for(ii=0; ii<bs; ii++)
				{
				CC[ii+bs*jj] += A[ii] * B[jj];
				}
			}
		A += 4;
		B += 4;
		}

	// dense tail; alpha is also used as the factor applied to CC
	kernel_sgemm_nt_4x4_lib4(kmax-k, alpha, A, B, alpha, CC, D);

	return;

	}
#endif
2259
2260
2261
#if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
/*
 * Variable-size ("vs") 4x4 strmm nt kernel (the name suggests B is the right,
 * upper-triangular factor - confirm against the caller).
 *
 * The first min(kmax,3) steps are done here: at step k only B[0..k] of the
 * current 4-float group of B is used, so step k updates columns 0..k of the
 * local tile CC with A[0..3]*B[j]. The remaining kmax-k steps are delegated to
 * kernel_sgemm_nt_4x4_lib4, which receives alpha as both of its scaling
 * factors and CC as both its C and D operands.
 *
 * The store to D is then limited by km and kn:
 *   - km selects how many rows are stored: 4 if km>=4, 3 if km==3, 2 if km==2,
 *     and 1 for any smaller value;
 *   - the function returns right after column kn-1 has been written when kn is
 *     1, 2 or 3; any other kn value stores all four columns.
 * Element (i,j) is at [i+bs*j] with bs=4.
 */
void kernel_strmm_nt_ru_4x4_vs_lib4(int kmax, float *alpha, float *A, float *B, float *D, int km, int kn)
	{

	const int bs = 4;

#if defined(TARGET_GENERIC)
	float CC[16] = {0};
#else
	ALIGNED( float CC[16], 64 ) = {0};
#endif

	int ii, jj, k, nrows;

	// triangular head: step k touches columns 0..k only
	for(k=0; k<3 && k<kmax; k++)
		{
		for(jj=0; jj<=k; jj++)
			{
			for(ii=0; ii<bs; ii++)
				{
				CC[ii+bs*jj] += A[ii] * B[jj];
				}
			}
		A += 4;
		B += 4;
		}

	// dense tail, computed in place in the local tile
	kernel_sgemm_nt_4x4_lib4(kmax-k, alpha, A, B, alpha, CC, CC);

	// number of rows to store; anything below 2 stores a single row
	nrows = km>=4 ? 4 : (km>=3 ? 3 : (km>=2 ? 2 : 1));

	for(jj=0; jj<bs; jj++)
		{
		for(ii=0; ii<nrows; ii++)
			{
			D[ii+bs*jj] = CC[ii+bs*jj];
			}

		// stop once exactly kn columns have been written
		if(kn==jj+1)
			return;
		}

	return;

	}
#endif
2467
2468
2469
2470
2471 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
kernel_strmm_nn_rl_4x4_lib4(int kmax,float * alpha,float * A,int offsetB,float * B,int sdb,float * D)2472 void kernel_strmm_nn_rl_4x4_lib4(int kmax, float *alpha, float *A, int offsetB, float *B, int sdb, float *D)
2473 {
2474
2475 const int bs = 4;
2476
2477 float
2478 a_0, a_1, a_2, a_3,
2479 b_0, b_1, b_2, b_3;
2480
2481 #if defined(TARGET_GENERIC)
2482 float CC[16] = {0};
2483 #else
2484 ALIGNED( float CC[16], 64 ) = {0};
2485 #endif
2486
2487 float *D1;
2488
2489 int k;
2490
2491 B += offsetB;
2492
2493 k = 0;
2494
2495 if(offsetB==0)
2496 {
2497
2498 // k = 0
2499
2500 a_0 = A[0];
2501 a_1 = A[1];
2502 a_2 = A[2];
2503 a_3 = A[3];
2504
2505 b_0 = B[0];
2506 CC[0+bs*0] += a_0 * b_0;
2507 CC[1+bs*0] += a_1 * b_0;
2508 CC[2+bs*0] += a_2 * b_0;
2509 CC[3+bs*0] += a_3 * b_0;
2510
2511 A += 4;
2512 B += 1;
2513 k += 1;
2514
2515 if(k>=kmax)
2516 goto store;
2517
2518 // k = 1
2519
2520 a_0 = A[0];
2521 a_1 = A[1];
2522 a_2 = A[2];
2523 a_3 = A[3];
2524
2525 b_0 = B[0];
2526 CC[0+bs*0] += a_0 * b_0;
2527 CC[1+bs*0] += a_1 * b_0;
2528 CC[2+bs*0] += a_2 * b_0;
2529 CC[3+bs*0] += a_3 * b_0;
2530
2531 b_1 = B[4];
2532 CC[0+bs*1] += a_0 * b_1;
2533 CC[1+bs*1] += a_1 * b_1;
2534 CC[2+bs*1] += a_2 * b_1;
2535 CC[3+bs*1] += a_3 * b_1;
2536
2537 A += 4;
2538 B += 1;
2539 k += 1;
2540
2541 if(k>=kmax)
2542 goto store;
2543
2544 // k = 2
2545
2546 a_0 = A[0];
2547 a_1 = A[1];
2548 a_2 = A[2];
2549 a_3 = A[3];
2550
2551 b_0 = B[0];
2552 CC[0+bs*0] += a_0 * b_0;
2553 CC[1+bs*0] += a_1 * b_0;
2554 CC[2+bs*0] += a_2 * b_0;
2555 CC[3+bs*0] += a_3 * b_0;
2556
2557 b_1 = B[4];
2558 CC[0+bs*1] += a_0 * b_1;
2559 CC[1+bs*1] += a_1 * b_1;
2560 CC[2+bs*1] += a_2 * b_1;
2561 CC[3+bs*1] += a_3 * b_1;
2562
2563 b_2 = B[8];
2564 CC[0+bs*2] += a_0 * b_2;
2565 CC[1+bs*2] += a_1 * b_2;
2566 CC[2+bs*2] += a_2 * b_2;
2567 CC[3+bs*2] += a_3 * b_2;
2568
2569 A += 4;
2570 B += 1;
2571 k += 1;
2572
2573 if(k>=kmax)
2574 goto store;
2575
2576 // k = 3
2577
2578 a_0 = A[0];
2579 a_1 = A[1];
2580 a_2 = A[2];
2581 a_3 = A[3];
2582
2583 b_0 = B[0];
2584 CC[0+bs*0] += a_0 * b_0;
2585 CC[1+bs*0] += a_1 * b_0;
2586 CC[2+bs*0] += a_2 * b_0;
2587 CC[3+bs*0] += a_3 * b_0;
2588
2589 b_1 = B[4];
2590 CC[0+bs*1] += a_0 * b_1;
2591 CC[1+bs*1] += a_1 * b_1;
2592 CC[2+bs*1] += a_2 * b_1;
2593 CC[3+bs*1] += a_3 * b_1;
2594
2595 b_2 = B[8];
2596 CC[0+bs*2] += a_0 * b_2;
2597 CC[1+bs*2] += a_1 * b_2;
2598 CC[2+bs*2] += a_2 * b_2;
2599 CC[3+bs*2] += a_3 * b_2;
2600
2601 b_3 = B[12];
2602 CC[0+bs*3] += a_0 * b_3;
2603 CC[1+bs*3] += a_1 * b_3;
2604 CC[2+bs*3] += a_2 * b_3;
2605 CC[3+bs*3] += a_3 * b_3;
2606
2607 A += 4;
2608 B += 4*sdb-3;
2609 k += 1;
2610
2611 }
2612 else if(offsetB==1)
2613 {
2614
2615 // k = 0
2616
2617 a_0 = A[0];
2618 a_1 = A[1];
2619 a_2 = A[2];
2620 a_3 = A[3];
2621
2622 b_0 = B[0];
2623 CC[0+bs*0] += a_0 * b_0;
2624 CC[1+bs*0] += a_1 * b_0;
2625 CC[2+bs*0] += a_2 * b_0;
2626 CC[3+bs*0] += a_3 * b_0;
2627
2628 A += 4;
2629 B += 1;
2630 k += 1;
2631
2632 if(k>=kmax)
2633 goto store;
2634
2635 // k = 1
2636
2637 a_0 = A[0];
2638 a_1 = A[1];
2639 a_2 = A[2];
2640 a_3 = A[3];
2641
2642 b_0 = B[0];
2643 CC[0+bs*0] += a_0 * b_0;
2644 CC[1+bs*0] += a_1 * b_0;
2645 CC[2+bs*0] += a_2 * b_0;
2646 CC[3+bs*0] += a_3 * b_0;
2647
2648 b_1 = B[4];
2649 CC[0+bs*1] += a_0 * b_1;
2650 CC[1+bs*1] += a_1 * b_1;
2651 CC[2+bs*1] += a_2 * b_1;
2652 CC[3+bs*1] += a_3 * b_1;
2653
2654 A += 4;
2655 B += 1;
2656 k += 1;
2657
2658 if(k>=kmax)
2659 goto store;
2660
2661 // k = 2
2662
2663 a_0 = A[0];
2664 a_1 = A[1];
2665 a_2 = A[2];
2666 a_3 = A[3];
2667
2668 b_0 = B[0];
2669 CC[0+bs*0] += a_0 * b_0;
2670 CC[1+bs*0] += a_1 * b_0;
2671 CC[2+bs*0] += a_2 * b_0;
2672 CC[3+bs*0] += a_3 * b_0;
2673
2674 b_1 = B[4];
2675 CC[0+bs*1] += a_0 * b_1;
2676 CC[1+bs*1] += a_1 * b_1;
2677 CC[2+bs*1] += a_2 * b_1;
2678 CC[3+bs*1] += a_3 * b_1;
2679
2680 b_2 = B[8];
2681 CC[0+bs*2] += a_0 * b_2;
2682 CC[1+bs*2] += a_1 * b_2;
2683 CC[2+bs*2] += a_2 * b_2;
2684 CC[3+bs*2] += a_3 * b_2;
2685
2686 A += 4;
2687 B += 4*sdb-3;
2688 k += 1;
2689
2690 }
2691 else if(offsetB==2)
2692 {
2693
2694 // k = 0
2695
2696 a_0 = A[0];
2697 a_1 = A[1];
2698 a_2 = A[2];
2699 a_3 = A[3];
2700
2701 b_0 = B[0];
2702 CC[0+bs*0] += a_0 * b_0;
2703 CC[1+bs*0] += a_1 * b_0;
2704 CC[2+bs*0] += a_2 * b_0;
2705 CC[3+bs*0] += a_3 * b_0;
2706
2707 A += 4;
2708 B += 1;
2709 k += 1;
2710
2711 if(k>=kmax)
2712 goto store;
2713
2714 // k = 1
2715
2716 a_0 = A[0];
2717 a_1 = A[1];
2718 a_2 = A[2];
2719 a_3 = A[3];
2720
2721 b_0 = B[0];
2722 CC[0+bs*0] += a_0 * b_0;
2723 CC[1+bs*0] += a_1 * b_0;
2724 CC[2+bs*0] += a_2 * b_0;
2725 CC[3+bs*0] += a_3 * b_0;
2726
2727 b_1 = B[4];
2728 CC[0+bs*1] += a_0 * b_1;
2729 CC[1+bs*1] += a_1 * b_1;
2730 CC[2+bs*1] += a_2 * b_1;
2731 CC[3+bs*1] += a_3 * b_1;
2732
2733 A += 4;
2734 B += 4*sdb-3;
2735 k += 1;
2736
2737 if(k>=kmax)
2738 goto store;
2739
2740 // k = 2
2741
2742 a_0 = A[0];
2743 a_1 = A[1];
2744 a_2 = A[2];
2745 a_3 = A[3];
2746
2747 b_0 = B[0];
2748 CC[0+bs*0] += a_0 * b_0;
2749 CC[1+bs*0] += a_1 * b_0;
2750 CC[2+bs*0] += a_2 * b_0;
2751 CC[3+bs*0] += a_3 * b_0;
2752
2753 b_1 = B[4];
2754 CC[0+bs*1] += a_0 * b_1;
2755 CC[1+bs*1] += a_1 * b_1;
2756 CC[2+bs*1] += a_2 * b_1;
2757 CC[3+bs*1] += a_3 * b_1;
2758
2759 b_2 = B[8];
2760 CC[0+bs*2] += a_0 * b_2;
2761 CC[1+bs*2] += a_1 * b_2;
2762 CC[2+bs*2] += a_2 * b_2;
2763 CC[3+bs*2] += a_3 * b_2;
2764
2765 A += 4;
2766 B += 1;
2767 k += 1;
2768
2769 if(k>=kmax)
2770 goto store;
2771
2772 // k = 3
2773
2774 a_0 = A[0];
2775 a_1 = A[1];
2776 a_2 = A[2];
2777 a_3 = A[3];
2778
2779 b_0 = B[0];
2780 CC[0+bs*0] += a_0 * b_0;
2781 CC[1+bs*0] += a_1 * b_0;
2782 CC[2+bs*0] += a_2 * b_0;
2783 CC[3+bs*0] += a_3 * b_0;
2784
2785 b_1 = B[4];
2786 CC[0+bs*1] += a_0 * b_1;
2787 CC[1+bs*1] += a_1 * b_1;
2788 CC[2+bs*1] += a_2 * b_1;
2789 CC[3+bs*1] += a_3 * b_1;
2790
2791 b_2 = B[8];
2792 CC[0+bs*2] += a_0 * b_2;
2793 CC[1+bs*2] += a_1 * b_2;
2794 CC[2+bs*2] += a_2 * b_2;
2795 CC[3+bs*2] += a_3 * b_2;
2796
2797 b_3 = B[12];
2798 CC[0+bs*3] += a_0 * b_3;
2799 CC[1+bs*3] += a_1 * b_3;
2800 CC[2+bs*3] += a_2 * b_3;
2801 CC[3+bs*3] += a_3 * b_3;
2802
2803 A += 4;
2804 B += 1;
2805 k += 1;
2806
2807 if(k>=kmax)
2808 goto store;
2809
2810 // k = 4
2811
2812 a_0 = A[0];
2813 a_1 = A[1];
2814 a_2 = A[2];
2815 a_3 = A[3];
2816
2817 b_0 = B[0];
2818 CC[0+bs*0] += a_0 * b_0;
2819 CC[1+bs*0] += a_1 * b_0;
2820 CC[2+bs*0] += a_2 * b_0;
2821 CC[3+bs*0] += a_3 * b_0;
2822
2823 b_1 = B[4];
2824 CC[0+bs*1] += a_0 * b_1;
2825 CC[1+bs*1] += a_1 * b_1;
2826 CC[2+bs*1] += a_2 * b_1;
2827 CC[3+bs*1] += a_3 * b_1;
2828
2829 b_2 = B[8];
2830 CC[0+bs*2] += a_0 * b_2;
2831 CC[1+bs*2] += a_1 * b_2;
2832 CC[2+bs*2] += a_2 * b_2;
2833 CC[3+bs*2] += a_3 * b_2;
2834
2835 b_3 = B[12];
2836 CC[0+bs*3] += a_0 * b_3;
2837 CC[1+bs*3] += a_1 * b_3;
2838 CC[2+bs*3] += a_2 * b_3;
2839 CC[3+bs*3] += a_3 * b_3;
2840
2841 A += 4;
2842 B += 1;
2843 k += 1;
2844
2845 if(k>=kmax)
2846 goto store;
2847
2848 // k = 5
2849
2850 a_0 = A[0];
2851 a_1 = A[1];
2852 a_2 = A[2];
2853 a_3 = A[3];
2854
2855 b_0 = B[0];
2856 CC[0+bs*0] += a_0 * b_0;
2857 CC[1+bs*0] += a_1 * b_0;
2858 CC[2+bs*0] += a_2 * b_0;
2859 CC[3+bs*0] += a_3 * b_0;
2860
2861 b_1 = B[4];
2862 CC[0+bs*1] += a_0 * b_1;
2863 CC[1+bs*1] += a_1 * b_1;
2864 CC[2+bs*1] += a_2 * b_1;
2865 CC[3+bs*1] += a_3 * b_1;
2866
2867 b_2 = B[8];
2868 CC[0+bs*2] += a_0 * b_2;
2869 CC[1+bs*2] += a_1 * b_2;
2870 CC[2+bs*2] += a_2 * b_2;
2871 CC[3+bs*2] += a_3 * b_2;
2872
2873 b_3 = B[12];
2874 CC[0+bs*3] += a_0 * b_3;
2875 CC[1+bs*3] += a_1 * b_3;
2876 CC[2+bs*3] += a_2 * b_3;
2877 CC[3+bs*3] += a_3 * b_3;
2878
2879 A += 4;
2880 B += 4*sdb-3;
2881 k += 1;
2882
2883 }
2884 else // if(offetB==3)
2885 {
2886
2887 // k = 0
2888
2889 a_0 = A[0];
2890 a_1 = A[1];
2891 a_2 = A[2];
2892 a_3 = A[3];
2893
2894 b_0 = B[0];
2895 CC[0+bs*0] += a_0 * b_0;
2896 CC[1+bs*0] += a_1 * b_0;
2897 CC[2+bs*0] += a_2 * b_0;
2898 CC[3+bs*0] += a_3 * b_0;
2899
2900 A += 4;
2901 B += 4*sdb-3;
2902 k += 1;
2903
2904 if(k>=kmax)
2905 goto store;
2906
2907 // k = 1
2908
2909 a_0 = A[0];
2910 a_1 = A[1];
2911 a_2 = A[2];
2912 a_3 = A[3];
2913
2914 b_0 = B[0];
2915 CC[0+bs*0] += a_0 * b_0;
2916 CC[1+bs*0] += a_1 * b_0;
2917 CC[2+bs*0] += a_2 * b_0;
2918 CC[3+bs*0] += a_3 * b_0;
2919
2920 b_1 = B[4];
2921 CC[0+bs*1] += a_0 * b_1;
2922 CC[1+bs*1] += a_1 * b_1;
2923 CC[2+bs*1] += a_2 * b_1;
2924 CC[3+bs*1] += a_3 * b_1;
2925
2926 A += 4;
2927 B += 1;
2928 k += 1;
2929
2930 if(k>=kmax)
2931 goto store;
2932
2933 // k = 2
2934
2935 a_0 = A[0];
2936 a_1 = A[1];
2937 a_2 = A[2];
2938 a_3 = A[3];
2939
2940 b_0 = B[0];
2941 CC[0+bs*0] += a_0 * b_0;
2942 CC[1+bs*0] += a_1 * b_0;
2943 CC[2+bs*0] += a_2 * b_0;
2944 CC[3+bs*0] += a_3 * b_0;
2945
2946 b_1 = B[4];
2947 CC[0+bs*1] += a_0 * b_1;
2948 CC[1+bs*1] += a_1 * b_1;
2949 CC[2+bs*1] += a_2 * b_1;
2950 CC[3+bs*1] += a_3 * b_1;
2951
2952 b_2 = B[8];
2953 CC[0+bs*2] += a_0 * b_2;
2954 CC[1+bs*2] += a_1 * b_2;
2955 CC[2+bs*2] += a_2 * b_2;
2956 CC[3+bs*2] += a_3 * b_2;
2957
2958 A += 4;
2959 B += 1;
2960 k += 1;
2961
2962 if(k>=kmax)
2963 goto store;
2964
2965 // k = 3
2966
2967 a_0 = A[0];
2968 a_1 = A[1];
2969 a_2 = A[2];
2970 a_3 = A[3];
2971
2972 b_0 = B[0];
2973 CC[0+bs*0] += a_0 * b_0;
2974 CC[1+bs*0] += a_1 * b_0;
2975 CC[2+bs*0] += a_2 * b_0;
2976 CC[3+bs*0] += a_3 * b_0;
2977
2978 b_1 = B[4];
2979 CC[0+bs*1] += a_0 * b_1;
2980 CC[1+bs*1] += a_1 * b_1;
2981 CC[2+bs*1] += a_2 * b_1;
2982 CC[3+bs*1] += a_3 * b_1;
2983
2984 b_2 = B[8];
2985 CC[0+bs*2] += a_0 * b_2;
2986 CC[1+bs*2] += a_1 * b_2;
2987 CC[2+bs*2] += a_2 * b_2;
2988 CC[3+bs*2] += a_3 * b_2;
2989
2990 b_3 = B[12];
2991 CC[0+bs*3] += a_0 * b_3;
2992 CC[1+bs*3] += a_1 * b_3;
2993 CC[2+bs*3] += a_2 * b_3;
2994 CC[3+bs*3] += a_3 * b_3;
2995
2996 A += 4;
2997 B += 1;
2998 k += 1;
2999
3000 if(k>=kmax)
3001 goto store;
3002
3003 // k = 4
3004
3005 a_0 = A[0];
3006 a_1 = A[1];
3007 a_2 = A[2];
3008 a_3 = A[3];
3009
3010 b_0 = B[0];
3011 CC[0+bs*0] += a_0 * b_0;
3012 CC[1+bs*0] += a_1 * b_0;
3013 CC[2+bs*0] += a_2 * b_0;
3014 CC[3+bs*0] += a_3 * b_0;
3015
3016 b_1 = B[4];
3017 CC[0+bs*1] += a_0 * b_1;
3018 CC[1+bs*1] += a_1 * b_1;
3019 CC[2+bs*1] += a_2 * b_1;
3020 CC[3+bs*1] += a_3 * b_1;
3021
3022 b_2 = B[8];
3023 CC[0+bs*2] += a_0 * b_2;
3024 CC[1+bs*2] += a_1 * b_2;
3025 CC[2+bs*2] += a_2 * b_2;
3026 CC[3+bs*2] += a_3 * b_2;
3027
3028 b_3 = B[12];
3029 CC[0+bs*3] += a_0 * b_3;
3030 CC[1+bs*3] += a_1 * b_3;
3031 CC[2+bs*3] += a_2 * b_3;
3032 CC[3+bs*3] += a_3 * b_3;
3033
3034 A += 4;
3035 B += 4*sdb-3;
3036 k += 1;
3037
3038 }
3039
3040 store:
3041
3042 CC[0+bs*0] = alpha[0]*CC[0+bs*0];
3043 CC[1+bs*0] = alpha[0]*CC[1+bs*0];
3044 CC[2+bs*0] = alpha[0]*CC[2+bs*0];
3045 CC[3+bs*0] = alpha[0]*CC[3+bs*0];
3046
3047 CC[0+bs*1] = alpha[0]*CC[0+bs*1];
3048 CC[1+bs*1] = alpha[0]*CC[1+bs*1];
3049 CC[2+bs*1] = alpha[0]*CC[2+bs*1];
3050 CC[3+bs*1] = alpha[0]*CC[3+bs*1];
3051
3052 CC[0+bs*2] = alpha[0]*CC[0+bs*2];
3053 CC[1+bs*2] = alpha[0]*CC[1+bs*2];
3054 CC[2+bs*2] = alpha[0]*CC[2+bs*2];
3055 CC[3+bs*2] = alpha[0]*CC[3+bs*2];
3056
3057 CC[0+bs*3] = alpha[0]*CC[0+bs*3];
3058 CC[1+bs*3] = alpha[0]*CC[1+bs*3];
3059 CC[2+bs*3] = alpha[0]*CC[2+bs*3];
3060 CC[3+bs*3] = alpha[0]*CC[3+bs*3];
3061
3062 float beta1 = 1.0;
3063
3064 kernel_sgemm_nn_4x4_lib4(kmax-k, alpha, A, 0, B, sdb, &beta1, CC, D);
3065
3066 return;
3067
3068 }
3069 #endif
3070
3071
3072
3073 #if defined(TARGET_GENERIC) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
kernel_strmm_nn_rl_4x4_vs_lib4(int kmax,float * alpha,float * A,int offsetB,float * B,int sdb,float * D,int m1,int n1)3074 void kernel_strmm_nn_rl_4x4_vs_lib4(int kmax, float *alpha, float *A, int offsetB, float *B, int sdb, float *D, int m1, int n1)
3075 {
3076
3077 const int bs = 4;
3078
3079 float
3080 a_0, a_1, a_2, a_3,
3081 b_0, b_1, b_2, b_3;
3082
3083 #if defined(TARGET_GENERIC)
3084 float CC[16] = {0};
3085 #else
3086 ALIGNED( float CC[16], 64 ) = {0};
3087 #endif
3088
3089 float *D1;
3090
3091 int k;
3092
3093 B += offsetB;
3094
3095 k = 0;
3096
3097 if(offsetB==0)
3098 {
3099
3100 // k = 0
3101
3102 a_0 = A[0];
3103 a_1 = A[1];
3104 a_2 = A[2];
3105 a_3 = A[3];
3106
3107 b_0 = B[0];
3108 CC[0+bs*0] += a_0 * b_0;
3109 CC[1+bs*0] += a_1 * b_0;
3110 CC[2+bs*0] += a_2 * b_0;
3111 CC[3+bs*0] += a_3 * b_0;
3112
3113 A += 4;
3114 B += 1;
3115 k += 1;
3116
3117 if(k>=kmax)
3118 goto store;
3119
3120 // k = 1
3121
3122 a_0 = A[0];
3123 a_1 = A[1];
3124 a_2 = A[2];
3125 a_3 = A[3];
3126
3127 b_0 = B[0];
3128 CC[0+bs*0] += a_0 * b_0;
3129 CC[1+bs*0] += a_1 * b_0;
3130 CC[2+bs*0] += a_2 * b_0;
3131 CC[3+bs*0] += a_3 * b_0;
3132
3133 b_1 = B[4];
3134 CC[0+bs*1] += a_0 * b_1;
3135 CC[1+bs*1] += a_1 * b_1;
3136 CC[2+bs*1] += a_2 * b_1;
3137 CC[3+bs*1] += a_3 * b_1;
3138
3139 A += 4;
3140 B += 1;
3141 k += 1;
3142
3143 if(k>=kmax)
3144 goto store;
3145
3146 // k = 2
3147
3148 a_0 = A[0];
3149 a_1 = A[1];
3150 a_2 = A[2];
3151 a_3 = A[3];
3152
3153 b_0 = B[0];
3154 CC[0+bs*0] += a_0 * b_0;
3155 CC[1+bs*0] += a_1 * b_0;
3156 CC[2+bs*0] += a_2 * b_0;
3157 CC[3+bs*0] += a_3 * b_0;
3158
3159 b_1 = B[4];
3160 CC[0+bs*1] += a_0 * b_1;
3161 CC[1+bs*1] += a_1 * b_1;
3162 CC[2+bs*1] += a_2 * b_1;
3163 CC[3+bs*1] += a_3 * b_1;
3164
3165 b_2 = B[8];
3166 CC[0+bs*2] += a_0 * b_2;
3167 CC[1+bs*2] += a_1 * b_2;
3168 CC[2+bs*2] += a_2 * b_2;
3169 CC[3+bs*2] += a_3 * b_2;
3170
3171 A += 4;
3172 B += 1;
3173 k += 1;
3174
3175 if(k>=kmax)
3176 goto store;
3177
3178 // k = 3
3179
3180 a_0 = A[0];
3181 a_1 = A[1];
3182 a_2 = A[2];
3183 a_3 = A[3];
3184
3185 b_0 = B[0];
3186 CC[0+bs*0] += a_0 * b_0;
3187 CC[1+bs*0] += a_1 * b_0;
3188 CC[2+bs*0] += a_2 * b_0;
3189 CC[3+bs*0] += a_3 * b_0;
3190
3191 b_1 = B[4];
3192 CC[0+bs*1] += a_0 * b_1;
3193 CC[1+bs*1] += a_1 * b_1;
3194 CC[2+bs*1] += a_2 * b_1;
3195 CC[3+bs*1] += a_3 * b_1;
3196
3197 b_2 = B[8];
3198 CC[0+bs*2] += a_0 * b_2;
3199 CC[1+bs*2] += a_1 * b_2;
3200 CC[2+bs*2] += a_2 * b_2;
3201 CC[3+bs*2] += a_3 * b_2;
3202
3203 b_3 = B[12];
3204 CC[0+bs*3] += a_0 * b_3;
3205 CC[1+bs*3] += a_1 * b_3;
3206 CC[2+bs*3] += a_2 * b_3;
3207 CC[3+bs*3] += a_3 * b_3;
3208
3209 A += 4;
3210 B += 4*sdb-3;
3211 k += 1;
3212
3213 }
3214 else if(offsetB==1)
3215 {
3216
3217 // k = 0
3218
3219 a_0 = A[0];
3220 a_1 = A[1];
3221 a_2 = A[2];
3222 a_3 = A[3];
3223
3224 b_0 = B[0];
3225 CC[0+bs*0] += a_0 * b_0;
3226 CC[1+bs*0] += a_1 * b_0;
3227 CC[2+bs*0] += a_2 * b_0;
3228 CC[3+bs*0] += a_3 * b_0;
3229
3230 A += 4;
3231 B += 1;
3232 k += 1;
3233
3234 if(k>=kmax)
3235 goto store;
3236
3237 // k = 1
3238
3239 a_0 = A[0];
3240 a_1 = A[1];
3241 a_2 = A[2];
3242 a_3 = A[3];
3243
3244 b_0 = B[0];
3245 CC[0+bs*0] += a_0 * b_0;
3246 CC[1+bs*0] += a_1 * b_0;
3247 CC[2+bs*0] += a_2 * b_0;
3248 CC[3+bs*0] += a_3 * b_0;
3249
3250 b_1 = B[4];
3251 CC[0+bs*1] += a_0 * b_1;
3252 CC[1+bs*1] += a_1 * b_1;
3253 CC[2+bs*1] += a_2 * b_1;
3254 CC[3+bs*1] += a_3 * b_1;
3255
3256 A += 4;
3257 B += 1;
3258 k += 1;
3259
3260 if(k>=kmax)
3261 goto store;
3262
3263 // k = 2
3264
3265 a_0 = A[0];
3266 a_1 = A[1];
3267 a_2 = A[2];
3268 a_3 = A[3];
3269
3270 b_0 = B[0];
3271 CC[0+bs*0] += a_0 * b_0;
3272 CC[1+bs*0] += a_1 * b_0;
3273 CC[2+bs*0] += a_2 * b_0;
3274 CC[3+bs*0] += a_3 * b_0;
3275
3276 b_1 = B[4];
3277 CC[0+bs*1] += a_0 * b_1;
3278 CC[1+bs*1] += a_1 * b_1;
3279 CC[2+bs*1] += a_2 * b_1;
3280 CC[3+bs*1] += a_3 * b_1;
3281
3282 b_2 = B[8];
3283 CC[0+bs*2] += a_0 * b_2;
3284 CC[1+bs*2] += a_1 * b_2;
3285 CC[2+bs*2] += a_2 * b_2;
3286 CC[3+bs*2] += a_3 * b_2;
3287
3288 A += 4;
3289 B += 4*sdb-3;
3290 k += 1;
3291
3292 }
3293 else if(offsetB==2)
3294 {
3295
3296 // k = 0
3297
3298 a_0 = A[0];
3299 a_1 = A[1];
3300 a_2 = A[2];
3301 a_3 = A[3];
3302
3303 b_0 = B[0];
3304 CC[0+bs*0] += a_0 * b_0;
3305 CC[1+bs*0] += a_1 * b_0;
3306 CC[2+bs*0] += a_2 * b_0;
3307 CC[3+bs*0] += a_3 * b_0;
3308
3309 A += 4;
3310 B += 1;
3311 k += 1;
3312
3313 if(k>=kmax)
3314 goto store;
3315
3316 // k = 1
3317
3318 a_0 = A[0];
3319 a_1 = A[1];
3320 a_2 = A[2];
3321 a_3 = A[3];
3322
3323 b_0 = B[0];
3324 CC[0+bs*0] += a_0 * b_0;
3325 CC[1+bs*0] += a_1 * b_0;
3326 CC[2+bs*0] += a_2 * b_0;
3327 CC[3+bs*0] += a_3 * b_0;
3328
3329 b_1 = B[4];
3330 CC[0+bs*1] += a_0 * b_1;
3331 CC[1+bs*1] += a_1 * b_1;
3332 CC[2+bs*1] += a_2 * b_1;
3333 CC[3+bs*1] += a_3 * b_1;
3334
3335 A += 4;
3336 B += 4*sdb-3;
3337 k += 1;
3338
3339 if(k>=kmax)
3340 goto store;
3341
3342 // k = 2
3343
3344 a_0 = A[0];
3345 a_1 = A[1];
3346 a_2 = A[2];
3347 a_3 = A[3];
3348
3349 b_0 = B[0];
3350 CC[0+bs*0] += a_0 * b_0;
3351 CC[1+bs*0] += a_1 * b_0;
3352 CC[2+bs*0] += a_2 * b_0;
3353 CC[3+bs*0] += a_3 * b_0;
3354
3355 b_1 = B[4];
3356 CC[0+bs*1] += a_0 * b_1;
3357 CC[1+bs*1] += a_1 * b_1;
3358 CC[2+bs*1] += a_2 * b_1;
3359 CC[3+bs*1] += a_3 * b_1;
3360
3361 b_2 = B[8];
3362 CC[0+bs*2] += a_0 * b_2;
3363 CC[1+bs*2] += a_1 * b_2;
3364 CC[2+bs*2] += a_2 * b_2;
3365 CC[3+bs*2] += a_3 * b_2;
3366
3367 A += 4;
3368 B += 1;
3369 k += 1;
3370
3371 if(k>=kmax)
3372 goto store;
3373
3374 // k = 3
3375
3376 a_0 = A[0];
3377 a_1 = A[1];
3378 a_2 = A[2];
3379 a_3 = A[3];
3380
3381 b_0 = B[0];
3382 CC[0+bs*0] += a_0 * b_0;
3383 CC[1+bs*0] += a_1 * b_0;
3384 CC[2+bs*0] += a_2 * b_0;
3385 CC[3+bs*0] += a_3 * b_0;
3386
3387 b_1 = B[4];
3388 CC[0+bs*1] += a_0 * b_1;
3389 CC[1+bs*1] += a_1 * b_1;
3390 CC[2+bs*1] += a_2 * b_1;
3391 CC[3+bs*1] += a_3 * b_1;
3392
3393 b_2 = B[8];
3394 CC[0+bs*2] += a_0 * b_2;
3395 CC[1+bs*2] += a_1 * b_2;
3396 CC[2+bs*2] += a_2 * b_2;
3397 CC[3+bs*2] += a_3 * b_2;
3398
3399 b_3 = B[12];
3400 CC[0+bs*3] += a_0 * b_3;
3401 CC[1+bs*3] += a_1 * b_3;
3402 CC[2+bs*3] += a_2 * b_3;
3403 CC[3+bs*3] += a_3 * b_3;
3404
3405 A += 4;
3406 B += 1;
3407 k += 1;
3408
3409 if(k>=kmax)
3410 goto store;
3411
3412 // k = 4
3413
3414 a_0 = A[0];
3415 a_1 = A[1];
3416 a_2 = A[2];
3417 a_3 = A[3];
3418
3419 b_0 = B[0];
3420 CC[0+bs*0] += a_0 * b_0;
3421 CC[1+bs*0] += a_1 * b_0;
3422 CC[2+bs*0] += a_2 * b_0;
3423 CC[3+bs*0] += a_3 * b_0;
3424
3425 b_1 = B[4];
3426 CC[0+bs*1] += a_0 * b_1;
3427 CC[1+bs*1] += a_1 * b_1;
3428 CC[2+bs*1] += a_2 * b_1;
3429 CC[3+bs*1] += a_3 * b_1;
3430
3431 b_2 = B[8];
3432 CC[0+bs*2] += a_0 * b_2;
3433 CC[1+bs*2] += a_1 * b_2;
3434 CC[2+bs*2] += a_2 * b_2;
3435 CC[3+bs*2] += a_3 * b_2;
3436
3437 b_3 = B[12];
3438 CC[0+bs*3] += a_0 * b_3;
3439 CC[1+bs*3] += a_1 * b_3;
3440 CC[2+bs*3] += a_2 * b_3;
3441 CC[3+bs*3] += a_3 * b_3;
3442
3443 A += 4;
3444 B += 1;
3445 k += 1;
3446
3447 if(k>=kmax)
3448 goto store;
3449
3450 // k = 5
3451
3452 a_0 = A[0];
3453 a_1 = A[1];
3454 a_2 = A[2];
3455 a_3 = A[3];
3456
3457 b_0 = B[0];
3458 CC[0+bs*0] += a_0 * b_0;
3459 CC[1+bs*0] += a_1 * b_0;
3460 CC[2+bs*0] += a_2 * b_0;
3461 CC[3+bs*0] += a_3 * b_0;
3462
3463 b_1 = B[4];
3464 CC[0+bs*1] += a_0 * b_1;
3465 CC[1+bs*1] += a_1 * b_1;
3466 CC[2+bs*1] += a_2 * b_1;
3467 CC[3+bs*1] += a_3 * b_1;
3468
3469 b_2 = B[8];
3470 CC[0+bs*2] += a_0 * b_2;
3471 CC[1+bs*2] += a_1 * b_2;
3472 CC[2+bs*2] += a_2 * b_2;
3473 CC[3+bs*2] += a_3 * b_2;
3474
3475 b_3 = B[12];
3476 CC[0+bs*3] += a_0 * b_3;
3477 CC[1+bs*3] += a_1 * b_3;
3478 CC[2+bs*3] += a_2 * b_3;
3479 CC[3+bs*3] += a_3 * b_3;
3480
3481 A += 4;
3482 B += 4*sdb-3;
3483 k += 1;
3484
3485 }
3486 else // if(offetB==3)
3487 {
3488
3489 // k = 0
3490
3491 a_0 = A[0];
3492 a_1 = A[1];
3493 a_2 = A[2];
3494 a_3 = A[3];
3495
3496 b_0 = B[0];
3497 CC[0+bs*0] += a_0 * b_0;
3498 CC[1+bs*0] += a_1 * b_0;
3499 CC[2+bs*0] += a_2 * b_0;
3500 CC[3+bs*0] += a_3 * b_0;
3501
3502 A += 4;
3503 B += 4*sdb-3;
3504 k += 1;
3505
3506 if(k>=kmax)
3507 goto store;
3508
3509 // k = 1
3510
3511 a_0 = A[0];
3512 a_1 = A[1];
3513 a_2 = A[2];
3514 a_3 = A[3];
3515
3516 b_0 = B[0];
3517 CC[0+bs*0] += a_0 * b_0;
3518 CC[1+bs*0] += a_1 * b_0;
3519 CC[2+bs*0] += a_2 * b_0;
3520 CC[3+bs*0] += a_3 * b_0;
3521
3522 b_1 = B[4];
3523 CC[0+bs*1] += a_0 * b_1;
3524 CC[1+bs*1] += a_1 * b_1;
3525 CC[2+bs*1] += a_2 * b_1;
3526 CC[3+bs*1] += a_3 * b_1;
3527
3528 A += 4;
3529 B += 1;
3530 k += 1;
3531
3532 if(k>=kmax)
3533 goto store;
3534
3535 // k = 2
3536
3537 a_0 = A[0];
3538 a_1 = A[1];
3539 a_2 = A[2];
3540 a_3 = A[3];
3541
3542 b_0 = B[0];
3543 CC[0+bs*0] += a_0 * b_0;
3544 CC[1+bs*0] += a_1 * b_0;
3545 CC[2+bs*0] += a_2 * b_0;
3546 CC[3+bs*0] += a_3 * b_0;
3547
3548 b_1 = B[4];
3549 CC[0+bs*1] += a_0 * b_1;
3550 CC[1+bs*1] += a_1 * b_1;
3551 CC[2+bs*1] += a_2 * b_1;
3552 CC[3+bs*1] += a_3 * b_1;
3553
3554 b_2 = B[8];
3555 CC[0+bs*2] += a_0 * b_2;
3556 CC[1+bs*2] += a_1 * b_2;
3557 CC[2+bs*2] += a_2 * b_2;
3558 CC[3+bs*2] += a_3 * b_2;
3559
3560 A += 4;
3561 B += 1;
3562 k += 1;
3563
3564 if(k>=kmax)
3565 goto store;
3566
3567 // k = 3
3568
3569 a_0 = A[0];
3570 a_1 = A[1];
3571 a_2 = A[2];
3572 a_3 = A[3];
3573
3574 b_0 = B[0];
3575 CC[0+bs*0] += a_0 * b_0;
3576 CC[1+bs*0] += a_1 * b_0;
3577 CC[2+bs*0] += a_2 * b_0;
3578 CC[3+bs*0] += a_3 * b_0;
3579
3580 b_1 = B[4];
3581 CC[0+bs*1] += a_0 * b_1;
3582 CC[1+bs*1] += a_1 * b_1;
3583 CC[2+bs*1] += a_2 * b_1;
3584 CC[3+bs*1] += a_3 * b_1;
3585
3586 b_2 = B[8];
3587 CC[0+bs*2] += a_0 * b_2;
3588 CC[1+bs*2] += a_1 * b_2;
3589 CC[2+bs*2] += a_2 * b_2;
3590 CC[3+bs*2] += a_3 * b_2;
3591
3592 b_3 = B[12];
3593 CC[0+bs*3] += a_0 * b_3;
3594 CC[1+bs*3] += a_1 * b_3;
3595 CC[2+bs*3] += a_2 * b_3;
3596 CC[3+bs*3] += a_3 * b_3;
3597
3598 A += 4;
3599 B += 1;
3600 k += 1;
3601
3602 if(k>=kmax)
3603 goto store;
3604
3605 // k = 4
3606
3607 a_0 = A[0];
3608 a_1 = A[1];
3609 a_2 = A[2];
3610 a_3 = A[3];
3611
3612 b_0 = B[0];
3613 CC[0+bs*0] += a_0 * b_0;
3614 CC[1+bs*0] += a_1 * b_0;
3615 CC[2+bs*0] += a_2 * b_0;
3616 CC[3+bs*0] += a_3 * b_0;
3617
3618 b_1 = B[4];
3619 CC[0+bs*1] += a_0 * b_1;
3620 CC[1+bs*1] += a_1 * b_1;
3621 CC[2+bs*1] += a_2 * b_1;
3622 CC[3+bs*1] += a_3 * b_1;
3623
3624 b_2 = B[8];
3625 CC[0+bs*2] += a_0 * b_2;
3626 CC[1+bs*2] += a_1 * b_2;
3627 CC[2+bs*2] += a_2 * b_2;
3628 CC[3+bs*2] += a_3 * b_2;
3629
3630 b_3 = B[12];
3631 CC[0+bs*3] += a_0 * b_3;
3632 CC[1+bs*3] += a_1 * b_3;
3633 CC[2+bs*3] += a_2 * b_3;
3634 CC[3+bs*3] += a_3 * b_3;
3635
3636 A += 4;
3637 B += 4*sdb-3;
3638 k += 1;
3639
3640 }
3641
3642 store:
3643
3644 CC[0+bs*0] = alpha[0]*CC[0+bs*0];
3645 CC[1+bs*0] = alpha[0]*CC[1+bs*0];
3646 CC[2+bs*0] = alpha[0]*CC[2+bs*0];
3647 CC[3+bs*0] = alpha[0]*CC[3+bs*0];
3648
3649 CC[0+bs*1] = alpha[0]*CC[0+bs*1];
3650 CC[1+bs*1] = alpha[0]*CC[1+bs*1];
3651 CC[2+bs*1] = alpha[0]*CC[2+bs*1];
3652 CC[3+bs*1] = alpha[0]*CC[3+bs*1];
3653
3654 CC[0+bs*2] = alpha[0]*CC[0+bs*2];
3655 CC[1+bs*2] = alpha[0]*CC[1+bs*2];
3656 CC[2+bs*2] = alpha[0]*CC[2+bs*2];
3657 CC[3+bs*2] = alpha[0]*CC[3+bs*2];
3658
3659 CC[0+bs*3] = alpha[0]*CC[0+bs*3];
3660 CC[1+bs*3] = alpha[0]*CC[1+bs*3];
3661 CC[2+bs*3] = alpha[0]*CC[2+bs*3];
3662 CC[3+bs*3] = alpha[0]*CC[3+bs*3];
3663
3664 float beta1 = 1.0;
3665
3666 kernel_sgemm_nn_4x4_lib4(kmax-k, alpha, A, 0, B, sdb, &beta1, CC, CC);
3667
3668 if(m1>=4)
3669 {
3670 D[0+bs*0] = CC[0+bs*0];
3671 D[1+bs*0] = CC[1+bs*0];
3672 D[2+bs*0] = CC[2+bs*0];
3673 D[3+bs*0] = CC[3+bs*0];
3674
3675 if(n1==1)
3676 return;
3677
3678 D[0+bs*1] = CC[0+bs*1];
3679 D[1+bs*1] = CC[1+bs*1];
3680 D[2+bs*1] = CC[2+bs*1];
3681 D[3+bs*1] = CC[3+bs*1];
3682
3683 if(n1==2)
3684 return;
3685
3686 D[0+bs*2] = CC[0+bs*2];
3687 D[1+bs*2] = CC[1+bs*2];
3688 D[2+bs*2] = CC[2+bs*2];
3689 D[3+bs*2] = CC[3+bs*2];
3690
3691 if(n1==3)
3692 return;
3693
3694 D[0+bs*3] = CC[0+bs*3];
3695 D[1+bs*3] = CC[1+bs*3];
3696 D[2+bs*3] = CC[2+bs*3];
3697 D[3+bs*3] = CC[3+bs*3];
3698 }
3699 else if(m1>=3)
3700 {
3701 D[0+bs*0] = CC[0+bs*0];
3702 D[1+bs*0] = CC[1+bs*0];
3703 D[2+bs*0] = CC[2+bs*0];
3704
3705 if(n1==1)
3706 return;
3707
3708 D[0+bs*1] = CC[0+bs*1];
3709 D[1+bs*1] = CC[1+bs*1];
3710 D[2+bs*1] = CC[2+bs*1];
3711
3712 if(n1==2)
3713 return;
3714
3715 D[0+bs*2] = CC[0+bs*2];
3716 D[1+bs*2] = CC[1+bs*2];
3717 D[2+bs*2] = CC[2+bs*2];
3718
3719 if(n1==3)
3720 return;
3721
3722 D[0+bs*3] = CC[0+bs*3];
3723 D[1+bs*3] = CC[1+bs*3];
3724 D[2+bs*3] = CC[2+bs*3];
3725 }
3726 else if(m1>=2)
3727 {
3728 D[0+bs*0] = CC[0+bs*0];
3729 D[1+bs*0] = CC[1+bs*0];
3730
3731 if(n1==1)
3732 return;
3733
3734 D[0+bs*1] = CC[0+bs*1];
3735 D[1+bs*1] = CC[1+bs*1];
3736
3737 if(n1==2)
3738 return;
3739
3740 D[0+bs*2] = CC[0+bs*2];
3741 D[1+bs*2] = CC[1+bs*2];
3742
3743 if(n1==3)
3744 return;
3745
3746 D[0+bs*3] = CC[0+bs*3];
3747 D[1+bs*3] = CC[1+bs*3];
3748 }
3749 else //if(m1>=1)
3750 {
3751 D[0+bs*0] = CC[0+bs*0];
3752
3753 if(n1==1)
3754 return;
3755
3756 D[0+bs*1] = CC[0+bs*1];
3757
3758 if(n1==2)
3759 return;
3760
3761 D[0+bs*2] = CC[0+bs*2];
3762
3763 if(n1==3)
3764 return;
3765
3766 D[0+bs*3] = CC[0+bs*3];
3767 }
3768
3769 return;
3770
3771 }
3772 #endif
3773
3774
3775
3776 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
kernel_strmm_nn_rl_4x4_gen_lib4(int kmax,float * alpha,float * A,int offsetB,float * B,int sdb,int offsetD,float * D0,int sdd,int m0,int m1,int n0,int n1)3777 void kernel_strmm_nn_rl_4x4_gen_lib4(int kmax, float *alpha, float *A, int offsetB, float *B, int sdb, int offsetD, float *D0, int sdd, int m0, int m1, int n0, int n1)
3778 {
3779
3780 const int bs = 4;
3781
3782 float
3783 a_0, a_1, a_2, a_3,
3784 b_0, b_1, b_2, b_3;
3785
3786 #if defined(TARGET_GENERIC)
3787 float CC[16] = {0};
3788 #else
3789 ALIGNED( float CC[16], 64 ) = {0};
3790 #endif
3791
3792 float *D1;
3793
3794 int k;
3795
3796 B += offsetB;
3797
3798 k = 0;
3799
3800 if(offsetB==0)
3801 {
3802
3803 // k = 0
3804
3805 a_0 = A[0];
3806 a_1 = A[1];
3807 a_2 = A[2];
3808 a_3 = A[3];
3809
3810 b_0 = B[0];
3811 CC[0+bs*0] += a_0 * b_0;
3812 CC[1+bs*0] += a_1 * b_0;
3813 CC[2+bs*0] += a_2 * b_0;
3814 CC[3+bs*0] += a_3 * b_0;
3815
3816 A += 4;
3817 B += 1;
3818 k += 1;
3819
3820 if(k>=kmax)
3821 goto store;
3822
3823 // k = 1
3824
3825 a_0 = A[0];
3826 a_1 = A[1];
3827 a_2 = A[2];
3828 a_3 = A[3];
3829
3830 b_0 = B[0];
3831 CC[0+bs*0] += a_0 * b_0;
3832 CC[1+bs*0] += a_1 * b_0;
3833 CC[2+bs*0] += a_2 * b_0;
3834 CC[3+bs*0] += a_3 * b_0;
3835
3836 b_1 = B[4];
3837 CC[0+bs*1] += a_0 * b_1;
3838 CC[1+bs*1] += a_1 * b_1;
3839 CC[2+bs*1] += a_2 * b_1;
3840 CC[3+bs*1] += a_3 * b_1;
3841
3842 A += 4;
3843 B += 1;
3844 k += 1;
3845
3846 if(k>=kmax)
3847 goto store;
3848
3849 // k = 2
3850
3851 a_0 = A[0];
3852 a_1 = A[1];
3853 a_2 = A[2];
3854 a_3 = A[3];
3855
3856 b_0 = B[0];
3857 CC[0+bs*0] += a_0 * b_0;
3858 CC[1+bs*0] += a_1 * b_0;
3859 CC[2+bs*0] += a_2 * b_0;
3860 CC[3+bs*0] += a_3 * b_0;
3861
3862 b_1 = B[4];
3863 CC[0+bs*1] += a_0 * b_1;
3864 CC[1+bs*1] += a_1 * b_1;
3865 CC[2+bs*1] += a_2 * b_1;
3866 CC[3+bs*1] += a_3 * b_1;
3867
3868 b_2 = B[8];
3869 CC[0+bs*2] += a_0 * b_2;
3870 CC[1+bs*2] += a_1 * b_2;
3871 CC[2+bs*2] += a_2 * b_2;
3872 CC[3+bs*2] += a_3 * b_2;
3873
3874 A += 4;
3875 B += 1;
3876 k += 1;
3877
3878 if(k>=kmax)
3879 goto store;
3880
3881 // k = 3
3882
3883 a_0 = A[0];
3884 a_1 = A[1];
3885 a_2 = A[2];
3886 a_3 = A[3];
3887
3888 b_0 = B[0];
3889 CC[0+bs*0] += a_0 * b_0;
3890 CC[1+bs*0] += a_1 * b_0;
3891 CC[2+bs*0] += a_2 * b_0;
3892 CC[3+bs*0] += a_3 * b_0;
3893
3894 b_1 = B[4];
3895 CC[0+bs*1] += a_0 * b_1;
3896 CC[1+bs*1] += a_1 * b_1;
3897 CC[2+bs*1] += a_2 * b_1;
3898 CC[3+bs*1] += a_3 * b_1;
3899
3900 b_2 = B[8];
3901 CC[0+bs*2] += a_0 * b_2;
3902 CC[1+bs*2] += a_1 * b_2;
3903 CC[2+bs*2] += a_2 * b_2;
3904 CC[3+bs*2] += a_3 * b_2;
3905
3906 b_3 = B[12];
3907 CC[0+bs*3] += a_0 * b_3;
3908 CC[1+bs*3] += a_1 * b_3;
3909 CC[2+bs*3] += a_2 * b_3;
3910 CC[3+bs*3] += a_3 * b_3;
3911
3912 A += 4;
3913 B += 4*sdb-3;
3914 k += 1;
3915
3916 }
3917 else if(offsetB==1)
3918 {
3919
3920 // k = 0
3921
3922 a_0 = A[0];
3923 a_1 = A[1];
3924 a_2 = A[2];
3925 a_3 = A[3];
3926
3927 b_0 = B[0];
3928 CC[0+bs*0] += a_0 * b_0;
3929 CC[1+bs*0] += a_1 * b_0;
3930 CC[2+bs*0] += a_2 * b_0;
3931 CC[3+bs*0] += a_3 * b_0;
3932
3933 A += 4;
3934 B += 1;
3935 k += 1;
3936
3937 if(k>=kmax)
3938 goto store;
3939
3940 // k = 1
3941
3942 a_0 = A[0];
3943 a_1 = A[1];
3944 a_2 = A[2];
3945 a_3 = A[3];
3946
3947 b_0 = B[0];
3948 CC[0+bs*0] += a_0 * b_0;
3949 CC[1+bs*0] += a_1 * b_0;
3950 CC[2+bs*0] += a_2 * b_0;
3951 CC[3+bs*0] += a_3 * b_0;
3952
3953 b_1 = B[4];
3954 CC[0+bs*1] += a_0 * b_1;
3955 CC[1+bs*1] += a_1 * b_1;
3956 CC[2+bs*1] += a_2 * b_1;
3957 CC[3+bs*1] += a_3 * b_1;
3958
3959 A += 4;
3960 B += 1;
3961 k += 1;
3962
3963 if(k>=kmax)
3964 goto store;
3965
3966 // k = 2
3967
3968 a_0 = A[0];
3969 a_1 = A[1];
3970 a_2 = A[2];
3971 a_3 = A[3];
3972
3973 b_0 = B[0];
3974 CC[0+bs*0] += a_0 * b_0;
3975 CC[1+bs*0] += a_1 * b_0;
3976 CC[2+bs*0] += a_2 * b_0;
3977 CC[3+bs*0] += a_3 * b_0;
3978
3979 b_1 = B[4];
3980 CC[0+bs*1] += a_0 * b_1;
3981 CC[1+bs*1] += a_1 * b_1;
3982 CC[2+bs*1] += a_2 * b_1;
3983 CC[3+bs*1] += a_3 * b_1;
3984
3985 b_2 = B[8];
3986 CC[0+bs*2] += a_0 * b_2;
3987 CC[1+bs*2] += a_1 * b_2;
3988 CC[2+bs*2] += a_2 * b_2;
3989 CC[3+bs*2] += a_3 * b_2;
3990
3991 A += 4;
3992 B += 4*sdb-3;
3993 k += 1;
3994
3995 }
3996 else if(offsetB==2)
3997 {
3998
3999 // k = 0
4000
4001 a_0 = A[0];
4002 a_1 = A[1];
4003 a_2 = A[2];
4004 a_3 = A[3];
4005
4006 b_0 = B[0];
4007 CC[0+bs*0] += a_0 * b_0;
4008 CC[1+bs*0] += a_1 * b_0;
4009 CC[2+bs*0] += a_2 * b_0;
4010 CC[3+bs*0] += a_3 * b_0;
4011
4012 A += 4;
4013 B += 1;
4014 k += 1;
4015
4016 if(k>=kmax)
4017 goto store;
4018
4019 // k = 1
4020
4021 a_0 = A[0];
4022 a_1 = A[1];
4023 a_2 = A[2];
4024 a_3 = A[3];
4025
4026 b_0 = B[0];
4027 CC[0+bs*0] += a_0 * b_0;
4028 CC[1+bs*0] += a_1 * b_0;
4029 CC[2+bs*0] += a_2 * b_0;
4030 CC[3+bs*0] += a_3 * b_0;
4031
4032 b_1 = B[4];
4033 CC[0+bs*1] += a_0 * b_1;
4034 CC[1+bs*1] += a_1 * b_1;
4035 CC[2+bs*1] += a_2 * b_1;
4036 CC[3+bs*1] += a_3 * b_1;
4037
4038 A += 4;
4039 B += 4*sdb-3;
4040 k += 1;
4041
4042 if(k>=kmax)
4043 goto store;
4044
4045 // k = 2
4046
4047 a_0 = A[0];
4048 a_1 = A[1];
4049 a_2 = A[2];
4050 a_3 = A[3];
4051
4052 b_0 = B[0];
4053 CC[0+bs*0] += a_0 * b_0;
4054 CC[1+bs*0] += a_1 * b_0;
4055 CC[2+bs*0] += a_2 * b_0;
4056 CC[3+bs*0] += a_3 * b_0;
4057
4058 b_1 = B[4];
4059 CC[0+bs*1] += a_0 * b_1;
4060 CC[1+bs*1] += a_1 * b_1;
4061 CC[2+bs*1] += a_2 * b_1;
4062 CC[3+bs*1] += a_3 * b_1;
4063
4064 b_2 = B[8];
4065 CC[0+bs*2] += a_0 * b_2;
4066 CC[1+bs*2] += a_1 * b_2;
4067 CC[2+bs*2] += a_2 * b_2;
4068 CC[3+bs*2] += a_3 * b_2;
4069
4070 A += 4;
4071 B += 1;
4072 k += 1;
4073
4074 if(k>=kmax)
4075 goto store;
4076
4077 // k = 3
4078
4079 a_0 = A[0];
4080 a_1 = A[1];
4081 a_2 = A[2];
4082 a_3 = A[3];
4083
4084 b_0 = B[0];
4085 CC[0+bs*0] += a_0 * b_0;
4086 CC[1+bs*0] += a_1 * b_0;
4087 CC[2+bs*0] += a_2 * b_0;
4088 CC[3+bs*0] += a_3 * b_0;
4089
4090 b_1 = B[4];
4091 CC[0+bs*1] += a_0 * b_1;
4092 CC[1+bs*1] += a_1 * b_1;
4093 CC[2+bs*1] += a_2 * b_1;
4094 CC[3+bs*1] += a_3 * b_1;
4095
4096 b_2 = B[8];
4097 CC[0+bs*2] += a_0 * b_2;
4098 CC[1+bs*2] += a_1 * b_2;
4099 CC[2+bs*2] += a_2 * b_2;
4100 CC[3+bs*2] += a_3 * b_2;
4101
4102 b_3 = B[12];
4103 CC[0+bs*3] += a_0 * b_3;
4104 CC[1+bs*3] += a_1 * b_3;
4105 CC[2+bs*3] += a_2 * b_3;
4106 CC[3+bs*3] += a_3 * b_3;
4107
4108 A += 4;
4109 B += 1;
4110 k += 1;
4111
4112 if(k>=kmax)
4113 goto store;
4114
4115 // k = 4
4116
4117 a_0 = A[0];
4118 a_1 = A[1];
4119 a_2 = A[2];
4120 a_3 = A[3];
4121
4122 b_0 = B[0];
4123 CC[0+bs*0] += a_0 * b_0;
4124 CC[1+bs*0] += a_1 * b_0;
4125 CC[2+bs*0] += a_2 * b_0;
4126 CC[3+bs*0] += a_3 * b_0;
4127
4128 b_1 = B[4];
4129 CC[0+bs*1] += a_0 * b_1;
4130 CC[1+bs*1] += a_1 * b_1;
4131 CC[2+bs*1] += a_2 * b_1;
4132 CC[3+bs*1] += a_3 * b_1;
4133
4134 b_2 = B[8];
4135 CC[0+bs*2] += a_0 * b_2;
4136 CC[1+bs*2] += a_1 * b_2;
4137 CC[2+bs*2] += a_2 * b_2;
4138 CC[3+bs*2] += a_3 * b_2;
4139
4140 b_3 = B[12];
4141 CC[0+bs*3] += a_0 * b_3;
4142 CC[1+bs*3] += a_1 * b_3;
4143 CC[2+bs*3] += a_2 * b_3;
4144 CC[3+bs*3] += a_3 * b_3;
4145
4146 A += 4;
4147 B += 1;
4148 k += 1;
4149
4150 if(k>=kmax)
4151 goto store;
4152
4153 // k = 5
4154
4155 a_0 = A[0];
4156 a_1 = A[1];
4157 a_2 = A[2];
4158 a_3 = A[3];
4159
4160 b_0 = B[0];
4161 CC[0+bs*0] += a_0 * b_0;
4162 CC[1+bs*0] += a_1 * b_0;
4163 CC[2+bs*0] += a_2 * b_0;
4164 CC[3+bs*0] += a_3 * b_0;
4165
4166 b_1 = B[4];
4167 CC[0+bs*1] += a_0 * b_1;
4168 CC[1+bs*1] += a_1 * b_1;
4169 CC[2+bs*1] += a_2 * b_1;
4170 CC[3+bs*1] += a_3 * b_1;
4171
4172 b_2 = B[8];
4173 CC[0+bs*2] += a_0 * b_2;
4174 CC[1+bs*2] += a_1 * b_2;
4175 CC[2+bs*2] += a_2 * b_2;
4176 CC[3+bs*2] += a_3 * b_2;
4177
4178 b_3 = B[12];
4179 CC[0+bs*3] += a_0 * b_3;
4180 CC[1+bs*3] += a_1 * b_3;
4181 CC[2+bs*3] += a_2 * b_3;
4182 CC[3+bs*3] += a_3 * b_3;
4183
4184 A += 4;
4185 B += 4*sdb-3;
4186 k += 1;
4187
4188 }
4189 else // if(offetB==3)
4190 {
4191
4192 // k = 0
4193
4194 a_0 = A[0];
4195 a_1 = A[1];
4196 a_2 = A[2];
4197 a_3 = A[3];
4198
4199 b_0 = B[0];
4200 CC[0+bs*0] += a_0 * b_0;
4201 CC[1+bs*0] += a_1 * b_0;
4202 CC[2+bs*0] += a_2 * b_0;
4203 CC[3+bs*0] += a_3 * b_0;
4204
4205 A += 4;
4206 B += 4*sdb-3;
4207 k += 1;
4208
4209 if(k>=kmax)
4210 goto store;
4211
4212 // k = 1
4213
4214 a_0 = A[0];
4215 a_1 = A[1];
4216 a_2 = A[2];
4217 a_3 = A[3];
4218
4219 b_0 = B[0];
4220 CC[0+bs*0] += a_0 * b_0;
4221 CC[1+bs*0] += a_1 * b_0;
4222 CC[2+bs*0] += a_2 * b_0;
4223 CC[3+bs*0] += a_3 * b_0;
4224
4225 b_1 = B[4];
4226 CC[0+bs*1] += a_0 * b_1;
4227 CC[1+bs*1] += a_1 * b_1;
4228 CC[2+bs*1] += a_2 * b_1;
4229 CC[3+bs*1] += a_3 * b_1;
4230
4231 A += 4;
4232 B += 1;
4233 k += 1;
4234
4235 if(k>=kmax)
4236 goto store;
4237
4238 // k = 2
4239
4240 a_0 = A[0];
4241 a_1 = A[1];
4242 a_2 = A[2];
4243 a_3 = A[3];
4244
4245 b_0 = B[0];
4246 CC[0+bs*0] += a_0 * b_0;
4247 CC[1+bs*0] += a_1 * b_0;
4248 CC[2+bs*0] += a_2 * b_0;
4249 CC[3+bs*0] += a_3 * b_0;
4250
4251 b_1 = B[4];
4252 CC[0+bs*1] += a_0 * b_1;
4253 CC[1+bs*1] += a_1 * b_1;
4254 CC[2+bs*1] += a_2 * b_1;
4255 CC[3+bs*1] += a_3 * b_1;
4256
4257 b_2 = B[8];
4258 CC[0+bs*2] += a_0 * b_2;
4259 CC[1+bs*2] += a_1 * b_2;
4260 CC[2+bs*2] += a_2 * b_2;
4261 CC[3+bs*2] += a_3 * b_2;
4262
4263 A += 4;
4264 B += 1;
4265 k += 1;
4266
4267 if(k>=kmax)
4268 goto store;
4269
4270 // k = 3
4271
4272 a_0 = A[0];
4273 a_1 = A[1];
4274 a_2 = A[2];
4275 a_3 = A[3];
4276
4277 b_0 = B[0];
4278 CC[0+bs*0] += a_0 * b_0;
4279 CC[1+bs*0] += a_1 * b_0;
4280 CC[2+bs*0] += a_2 * b_0;
4281 CC[3+bs*0] += a_3 * b_0;
4282
4283 b_1 = B[4];
4284 CC[0+bs*1] += a_0 * b_1;
4285 CC[1+bs*1] += a_1 * b_1;
4286 CC[2+bs*1] += a_2 * b_1;
4287 CC[3+bs*1] += a_3 * b_1;
4288
4289 b_2 = B[8];
4290 CC[0+bs*2] += a_0 * b_2;
4291 CC[1+bs*2] += a_1 * b_2;
4292 CC[2+bs*2] += a_2 * b_2;
4293 CC[3+bs*2] += a_3 * b_2;
4294
4295 b_3 = B[12];
4296 CC[0+bs*3] += a_0 * b_3;
4297 CC[1+bs*3] += a_1 * b_3;
4298 CC[2+bs*3] += a_2 * b_3;
4299 CC[3+bs*3] += a_3 * b_3;
4300
4301 A += 4;
4302 B += 1;
4303 k += 1;
4304
4305 if(k>=kmax)
4306 goto store;
4307
4308 // k = 4
4309
4310 a_0 = A[0];
4311 a_1 = A[1];
4312 a_2 = A[2];
4313 a_3 = A[3];
4314
4315 b_0 = B[0];
4316 CC[0+bs*0] += a_0 * b_0;
4317 CC[1+bs*0] += a_1 * b_0;
4318 CC[2+bs*0] += a_2 * b_0;
4319 CC[3+bs*0] += a_3 * b_0;
4320
4321 b_1 = B[4];
4322 CC[0+bs*1] += a_0 * b_1;
4323 CC[1+bs*1] += a_1 * b_1;
4324 CC[2+bs*1] += a_2 * b_1;
4325 CC[3+bs*1] += a_3 * b_1;
4326
4327 b_2 = B[8];
4328 CC[0+bs*2] += a_0 * b_2;
4329 CC[1+bs*2] += a_1 * b_2;
4330 CC[2+bs*2] += a_2 * b_2;
4331 CC[3+bs*2] += a_3 * b_2;
4332
4333 b_3 = B[12];
4334 CC[0+bs*3] += a_0 * b_3;
4335 CC[1+bs*3] += a_1 * b_3;
4336 CC[2+bs*3] += a_2 * b_3;
4337 CC[3+bs*3] += a_3 * b_3;
4338
4339 A += 4;
4340 B += 4*sdb-3;
4341 k += 1;
4342
4343 }
4344
4345 store:
4346
4347 CC[0+bs*0] = alpha[0]*CC[0+bs*0];
4348 CC[1+bs*0] = alpha[0]*CC[1+bs*0];
4349 CC[2+bs*0] = alpha[0]*CC[2+bs*0];
4350 CC[3+bs*0] = alpha[0]*CC[3+bs*0];
4351
4352 CC[0+bs*1] = alpha[0]*CC[0+bs*1];
4353 CC[1+bs*1] = alpha[0]*CC[1+bs*1];
4354 CC[2+bs*1] = alpha[0]*CC[2+bs*1];
4355 CC[3+bs*1] = alpha[0]*CC[3+bs*1];
4356
4357 CC[0+bs*2] = alpha[0]*CC[0+bs*2];
4358 CC[1+bs*2] = alpha[0]*CC[1+bs*2];
4359 CC[2+bs*2] = alpha[0]*CC[2+bs*2];
4360 CC[3+bs*2] = alpha[0]*CC[3+bs*2];
4361
4362 CC[0+bs*3] = alpha[0]*CC[0+bs*3];
4363 CC[1+bs*3] = alpha[0]*CC[1+bs*3];
4364 CC[2+bs*3] = alpha[0]*CC[2+bs*3];
4365 CC[3+bs*3] = alpha[0]*CC[3+bs*3];
4366
4367 float beta1 = 1.0;
4368
4369 kernel_sgemm_nn_4x4_lib4(kmax-k, alpha, A, 0, B, sdb, &beta1, CC, CC);
4370
4371 // shift sol for cols
4372 if(n0>0)
4373 {
4374 if(n0==1)
4375 {
4376 CC[0+bs*0] = CC[0+bs*1];
4377 CC[1+bs*0] = CC[1+bs*1];
4378 CC[2+bs*0] = CC[2+bs*1];
4379 CC[3+bs*0] = CC[3+bs*1];
4380
4381 CC[0+bs*1] = CC[0+bs*2];
4382 CC[1+bs*1] = CC[1+bs*2];
4383 CC[2+bs*1] = CC[2+bs*2];
4384 CC[3+bs*1] = CC[3+bs*2];
4385
4386 CC[0+bs*2] = CC[0+bs*3];
4387 CC[1+bs*2] = CC[1+bs*3];
4388 CC[2+bs*2] = CC[2+bs*3];
4389 CC[3+bs*2] = CC[3+bs*3];
4390
4391 D0 += 1*bs;
4392 }
4393 else if(n0==2)
4394 {
4395 CC[0+bs*0] = CC[0+bs*2];
4396 CC[1+bs*0] = CC[1+bs*2];
4397 CC[2+bs*0] = CC[2+bs*2];
4398 CC[3+bs*0] = CC[3+bs*2];
4399
4400 CC[0+bs*1] = CC[0+bs*3];
4401 CC[1+bs*1] = CC[1+bs*3];
4402 CC[2+bs*1] = CC[2+bs*3];
4403 CC[3+bs*1] = CC[3+bs*3];
4404
4405 D0 += 2*bs;
4406 }
4407 else //if(n0==3)
4408 {
4409 CC[0+bs*0] = CC[0+bs*3];
4410 CC[1+bs*0] = CC[1+bs*3];
4411 CC[2+bs*0] = CC[2+bs*3];
4412 CC[3+bs*0] = CC[3+bs*3];
4413
4414 D0 += 3*bs;
4415 }
4416 }
4417
4418 n1 = 4<n1 ? 4 : n1;
4419 int kn = n1 - n0;
4420
4421 if(offsetD==0)
4422 {
4423 if(kn<=0)
4424 return;
4425
4426 if(m0<=0 & m1>0) D0[0+bs*0] = CC[0+bs*0];
4427 if(m0<=1 & m1>1) D0[1+bs*0] = CC[1+bs*0];
4428 if(m0<=2 & m1>2) D0[2+bs*0] = CC[2+bs*0];
4429 if(m0<=3 & m1>3) D0[3+bs*0] = CC[3+bs*0];
4430
4431 if(kn<=1)
4432 return;
4433
4434 if(m0<=0 & m1>0) D0[0+bs*1] = CC[0+bs*1];
4435 if(m0<=1 & m1>1) D0[1+bs*1] = CC[1+bs*1];
4436 if(m0<=2 & m1>2) D0[2+bs*1] = CC[2+bs*1];
4437 if(m0<=3 & m1>3) D0[3+bs*1] = CC[3+bs*1];
4438
4439 if(kn<=2)
4440 return;
4441
4442 if(m0<=0 & m1>0) D0[0+bs*2] = CC[0+bs*2];
4443 if(m0<=1 & m1>1) D0[1+bs*2] = CC[1+bs*2];
4444 if(m0<=2 & m1>2) D0[2+bs*2] = CC[2+bs*2];
4445 if(m0<=3 & m1>3) D0[3+bs*2] = CC[3+bs*2];
4446
4447 if(kn<=3)
4448 return;
4449
4450 if(m0<=0 & m1>0) D0[0+bs*3] = CC[0+bs*3];
4451 if(m0<=1 & m1>1) D0[1+bs*3] = CC[1+bs*3];
4452 if(m0<=2 & m1>2) D0[2+bs*3] = CC[2+bs*3];
4453 if(m0<=3 & m1>3) D0[3+bs*3] = CC[3+bs*3];
4454 }
4455 else if(offsetD==1)
4456 {
4457 D1 = D0 + sdd*bs;
4458
4459 if(kn<=0)
4460 return;
4461
4462 if(m0<=0 & m1>0) D0[1+bs*0] = CC[0+bs*0];
4463 if(m0<=1 & m1>1) D0[2+bs*0] = CC[1+bs*0];
4464 if(m0<=2 & m1>2) D0[3+bs*0] = CC[2+bs*0];
4465 if(m0<=3 & m1>3) D1[0+bs*0] = CC[3+bs*0];
4466
4467 if(kn<=1)
4468 return;
4469
4470 if(m0<=0 & m1>0) D0[1+bs*1] = CC[0+bs*1];
4471 if(m0<=1 & m1>1) D0[2+bs*1] = CC[1+bs*1];
4472 if(m0<=2 & m1>2) D0[3+bs*1] = CC[2+bs*1];
4473 if(m0<=3 & m1>3) D1[0+bs*1] = CC[3+bs*1];
4474
4475 if(kn<=2)
4476 return;
4477
4478 if(m0<=0 & m1>0) D0[1+bs*2] = CC[0+bs*2];
4479 if(m0<=1 & m1>1) D0[2+bs*2] = CC[1+bs*2];
4480 if(m0<=2 & m1>2) D0[3+bs*2] = CC[2+bs*2];
4481 if(m0<=3 & m1>3) D1[0+bs*2] = CC[3+bs*2];
4482
4483 if(kn<=3)
4484 return;
4485
4486 if(m0<=0 & m1>0) D0[1+bs*3] = CC[0+bs*3];
4487 if(m0<=1 & m1>1) D0[2+bs*3] = CC[1+bs*3];
4488 if(m0<=2 & m1>2) D0[3+bs*3] = CC[2+bs*3];
4489 if(m0<=3 & m1>3) D1[0+bs*3] = CC[3+bs*3];
4490 }
4491 else if(offsetD==2)
4492 {
4493 D1 = D0 + sdd*bs;
4494
4495 if(kn<=0)
4496 return;
4497
4498 if(m0<=0 & m1>0) D0[2+bs*0] = CC[0+bs*0];
4499 if(m0<=1 & m1>1) D0[3+bs*0] = CC[1+bs*0];
4500 if(m0<=2 & m1>2) D1[0+bs*0] = CC[2+bs*0];
4501 if(m0<=3 & m1>3) D1[1+bs*0] = CC[3+bs*0];
4502
4503 if(kn<=1)
4504 return;
4505
4506 if(m0<=0 & m1>0) D0[2+bs*1] = CC[0+bs*1];
4507 if(m0<=1 & m1>1) D0[3+bs*1] = CC[1+bs*1];
4508 if(m0<=2 & m1>2) D1[0+bs*1] = CC[2+bs*1];
4509 if(m0<=3 & m1>3) D1[1+bs*1] = CC[3+bs*1];
4510
4511 if(kn<=2)
4512 return;
4513
4514 if(m0<=0 & m1>0) D0[2+bs*2] = CC[0+bs*2];
4515 if(m0<=1 & m1>1) D0[3+bs*2] = CC[1+bs*2];
4516 if(m0<=2 & m1>2) D1[0+bs*2] = CC[2+bs*2];
4517 if(m0<=3 & m1>3) D1[1+bs*2] = CC[3+bs*2];
4518
4519 if(kn<=3)
4520 return;
4521
4522 if(m0<=0 & m1>0) D0[2+bs*3] = CC[0+bs*3];
4523 if(m0<=1 & m1>1) D0[3+bs*3] = CC[1+bs*3];
4524 if(m0<=2 & m1>2) D1[0+bs*3] = CC[2+bs*3];
4525 if(m0<=3 & m1>3) D1[1+bs*3] = CC[3+bs*3];
4526 }
4527 else //if(offsetD==3)
4528 {
4529 D1 = D0 + sdd*bs;
4530
4531 if(kn<=0)
4532 return;
4533
4534 if(m0<=0 & m1>0) D0[3+bs*0] = CC[0+bs*0];
4535 if(m0<=1 & m1>1) D1[0+bs*0] = CC[1+bs*0];
4536 if(m0<=2 & m1>2) D1[1+bs*0] = CC[2+bs*0];
4537 if(m0<=3 & m1>3) D1[2+bs*0] = CC[3+bs*0];
4538
4539 if(kn<=1)
4540 return;
4541
4542 if(m0<=0 & m1>0) D0[3+bs*1] = CC[0+bs*1];
4543 if(m0<=1 & m1>1) D1[0+bs*1] = CC[1+bs*1];
4544 if(m0<=2 & m1>2) D1[1+bs*1] = CC[2+bs*1];
4545 if(m0<=3 & m1>3) D1[2+bs*1] = CC[3+bs*1];
4546
4547 if(kn<=2)
4548 return;
4549
4550 if(m0<=0 & m1>0) D0[3+bs*2] = CC[0+bs*2];
4551 if(m0<=1 & m1>1) D1[0+bs*2] = CC[1+bs*2];
4552 if(m0<=2 & m1>2) D1[1+bs*2] = CC[2+bs*2];
4553 if(m0<=3 & m1>3) D1[2+bs*2] = CC[3+bs*2];
4554
4555 if(kn<=3)
4556 return;
4557
4558 if(m0<=0 & m1>0) D0[3+bs*3] = CC[0+bs*3];
4559 if(m0<=1 & m1>1) D1[0+bs*3] = CC[1+bs*3];
4560 if(m0<=2 & m1>2) D1[1+bs*3] = CC[2+bs*3];
4561 if(m0<=3 & m1>3) D1[2+bs*3] = CC[3+bs*3];
4562 }
4563
4564 return;
4565
4566 }
4567 #endif
4568
4569
4570
4571 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER)
kernel_spotrf_nt_l_4x4_lib4(int kmax,float * A,float * B,float * C,float * D,float * inv_diag_D)4572 void kernel_spotrf_nt_l_4x4_lib4(int kmax, float *A, float *B, float *C, float *D, float *inv_diag_D)
4573 {
4574
4575 const int bs = 4;
4576
4577 float
4578 a_0, a_1, a_2, a_3,
4579 b_0, b_1, b_2, b_3,
4580 tmp;
4581
4582 #if defined(TARGET_GENERIC)
4583 float CC[16] = {0};
4584 #else
4585 ALIGNED( float CC[16], 64 ) = {0};
4586 #endif
4587
4588 int k;
4589
4590 float alpha1 = -1.0;
4591 float beta1 = 1.0;
4592
4593 kernel_sgemm_nt_4x4_lib4(kmax, &alpha1, A, B, &beta1, C, CC);
4594
4595 if(CC[0+bs*0]>0)
4596 {
4597 CC[0+bs*0] = sqrt(CC[0+bs*0]);
4598 tmp = 1.0/CC[0+bs*0];
4599 }
4600 else
4601 {
4602 CC[0+bs*0] = 0.0;
4603 tmp = 0.0;
4604 }
4605 CC[1+bs*0] *= tmp;
4606 CC[2+bs*0] *= tmp;
4607 CC[3+bs*0] *= tmp;
4608 inv_diag_D[0] = tmp;
4609
4610 CC[1+bs*1] -= CC[1+bs*0] * CC[1+bs*0];
4611 CC[2+bs*1] -= CC[2+bs*0] * CC[1+bs*0];
4612 CC[3+bs*1] -= CC[3+bs*0] * CC[1+bs*0];
4613 if(CC[1+bs*1]>0)
4614 {
4615 CC[1+bs*1] = sqrt(CC[1+bs*1]);
4616 tmp = 1.0/CC[1+bs*1];
4617 }
4618 else
4619 {
4620 CC[1+bs*1] = 0.0;
4621 tmp = 0.0;
4622 }
4623 CC[2+bs*1] *= tmp;
4624 CC[3+bs*1] *= tmp;
4625 inv_diag_D[1] = tmp;
4626
4627 CC[2+bs*2] -= CC[2+bs*0] * CC[2+bs*0];
4628 CC[3+bs*2] -= CC[3+bs*0] * CC[2+bs*0];
4629 CC[2+bs*2] -= CC[2+bs*1] * CC[2+bs*1];
4630 CC[3+bs*2] -= CC[3+bs*1] * CC[2+bs*1];
4631 if(CC[2+bs*2]>0)
4632 {
4633 CC[2+bs*2] = sqrt(CC[2+bs*2]);
4634 tmp = 1.0/CC[2+bs*2];
4635 }
4636 else
4637 {
4638 CC[2+bs*2] = 0.0;
4639 tmp = 0.0;
4640 }
4641 CC[3+bs*2] *= tmp;
4642 inv_diag_D[2] = tmp;
4643
4644 CC[3+bs*3] -= CC[3+bs*0] * CC[3+bs*0];
4645 CC[3+bs*3] -= CC[3+bs*1] * CC[3+bs*1];
4646 CC[3+bs*3] -= CC[3+bs*2] * CC[3+bs*2];
4647 if(CC[3+bs*3]>0)
4648 {
4649 CC[3+bs*3] = sqrt(CC[3+bs*3]);
4650 tmp = 1.0/CC[3+bs*3];
4651 }
4652 else
4653 {
4654 CC[3+bs*3] = 0.0;
4655 tmp = 0.0;
4656 }
4657 inv_diag_D[3] = tmp;
4658
4659 D[0+bs*0] = CC[0+bs*0];
4660 D[1+bs*0] = CC[1+bs*0];
4661 D[2+bs*0] = CC[2+bs*0];
4662 D[3+bs*0] = CC[3+bs*0];
4663
4664 D[1+bs*1] = CC[1+bs*1];
4665 D[2+bs*1] = CC[2+bs*1];
4666 D[3+bs*1] = CC[3+bs*1];
4667
4668 D[2+bs*2] = CC[2+bs*2];
4669 D[3+bs*2] = CC[3+bs*2];
4670
4671 D[3+bs*3] = CC[3+bs*3];
4672
4673 return;
4674
4675 }
4676 #endif
4677
4678
4679
4680 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) //|| defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
kernel_spotrf_nt_l_4x4_vs_lib4(int kmax,float * A,float * B,float * C,float * D,float * inv_diag_D,int km,int kn)4681 void kernel_spotrf_nt_l_4x4_vs_lib4(int kmax, float *A, float *B, float *C, float *D, float *inv_diag_D, int km, int kn)
4682 {
4683
4684 const int bs = 4;
4685
4686 float tmp;
4687
4688 #if defined(TARGET_GENERIC)
4689 float CC[16] = {0};
4690 #else
4691 ALIGNED( float CC[16], 64 ) = {0};
4692 #endif
4693
4694 float alpha1 = -1.0;
4695 float beta1 = 1.0;
4696
4697 kernel_sgemm_nt_4x4_lib4(kmax, &alpha1, A, B, &beta1, C, CC);
4698
4699 if(CC[0+bs*0]>0)
4700 {
4701 CC[0+bs*0] = sqrt(CC[0+bs*0]);
4702 tmp = 1.0/CC[0+bs*0];
4703 }
4704 else
4705 {
4706 CC[0+bs*0] = 0.0;
4707 tmp = 0.0;
4708 }
4709 CC[1+bs*0] *= tmp;
4710 CC[2+bs*0] *= tmp;
4711 CC[3+bs*0] *= tmp;
4712 inv_diag_D[0] = tmp;
4713
4714 if(kn==1)
4715 goto store;
4716
4717 CC[1+bs*1] -= CC[1+bs*0] * CC[1+bs*0];
4718 CC[2+bs*1] -= CC[2+bs*0] * CC[1+bs*0];
4719 CC[3+bs*1] -= CC[3+bs*0] * CC[1+bs*0];
4720 if(CC[1+bs*1]>0)
4721 {
4722 CC[1+bs*1] = sqrt(CC[1+bs*1]);
4723 tmp = 1.0/CC[1+bs*1];
4724 }
4725 else
4726 {
4727 CC[1+bs*1] = 0.0;
4728 tmp = 0.0;
4729 }
4730 CC[2+bs*1] *= tmp;
4731 CC[3+bs*1] *= tmp;
4732 inv_diag_D[1] = tmp;
4733
4734 if(kn==2)
4735 goto store;
4736
4737 CC[2+bs*2] -= CC[2+bs*0] * CC[2+bs*0];
4738 CC[3+bs*2] -= CC[3+bs*0] * CC[2+bs*0];
4739 CC[2+bs*2] -= CC[2+bs*1] * CC[2+bs*1];
4740 CC[3+bs*2] -= CC[3+bs*1] * CC[2+bs*1];
4741 if(CC[2+bs*2]>0)
4742 {
4743 CC[2+bs*2] = sqrt(CC[2+bs*2]);
4744 tmp = 1.0/CC[2+bs*2];
4745 }
4746 else
4747 {
4748 CC[2+bs*2] = 0.0;
4749 tmp = 0.0;
4750 }
4751 CC[3+bs*2] *= tmp;
4752 inv_diag_D[2] = tmp;
4753
4754 if(kn==3)
4755 goto store;
4756
4757 CC[3+bs*3] -= CC[3+bs*0] * CC[3+bs*0];
4758 CC[3+bs*3] -= CC[3+bs*1] * CC[3+bs*1];
4759 CC[3+bs*3] -= CC[3+bs*2] * CC[3+bs*2];
4760 if(CC[3+bs*3]>0)
4761 {
4762 CC[3+bs*3] = sqrt(CC[3+bs*3]);
4763 tmp = 1.0/CC[3+bs*3];
4764 }
4765 else
4766 {
4767 CC[3+bs*3] = 0.0;
4768 tmp = 0.0;
4769 }
4770 inv_diag_D[3] = tmp;
4771
4772
4773 store:
4774
4775 if(km>=4)
4776 {
4777 D[0+bs*0] = CC[0+bs*0];
4778 D[1+bs*0] = CC[1+bs*0];
4779 D[2+bs*0] = CC[2+bs*0];
4780 D[3+bs*0] = CC[3+bs*0];
4781
4782 if(kn==1)
4783 return;
4784
4785 D[1+bs*1] = CC[1+bs*1];
4786 D[2+bs*1] = CC[2+bs*1];
4787 D[3+bs*1] = CC[3+bs*1];
4788
4789 if(kn==2)
4790 return;
4791
4792 D[2+bs*2] = CC[2+bs*2];
4793 D[3+bs*2] = CC[3+bs*2];
4794
4795 if(kn==3)
4796 return;
4797
4798 D[3+bs*3] = CC[3+bs*3];
4799 }
4800 else if(km>=3)
4801 {
4802 D[0+bs*0] = CC[0+bs*0];
4803 D[1+bs*0] = CC[1+bs*0];
4804 D[2+bs*0] = CC[2+bs*0];
4805
4806 if(kn==1)
4807 return;
4808
4809 D[1+bs*1] = CC[1+bs*1];
4810 D[2+bs*1] = CC[2+bs*1];
4811
4812 if(kn==2)
4813 return;
4814
4815 D[2+bs*2] = CC[2+bs*2];
4816 }
4817 else if(km>=2)
4818 {
4819 D[0+bs*0] = CC[0+bs*0];
4820 D[1+bs*0] = CC[1+bs*0];
4821
4822 if(kn==1)
4823 return;
4824
4825 D[1+bs*1] = CC[1+bs*1];
4826 }
4827 else //if(km>=1)
4828 {
4829 D[0+bs*0] = CC[0+bs*0];
4830 }
4831
4832 return;
4833
4834 }
4835 #endif
4836
4837
4838
4839 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Fused wrapper: first calls kernel_ssyrk_nt_l_4x4_lib4 on (kp, Ap, Bp) with
// alpha=1 and beta=1, reading C and writing D; then calls
// kernel_spotrf_nt_l_4x4_lib4 on (km_, Am, Bm) using D as both its input and
// its output block, which also fills inv_diag_D.
void kernel_ssyrk_spotrf_nt_l_4x4_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *inv_diag_D)
	{

	float scale_ab = 1.0f;
	float scale_c = 1.0f;

	// step 1: rank-k style update of C into D
	kernel_ssyrk_nt_l_4x4_lib4(kp, &scale_ab, Ap, Bp, &scale_c, C, D);

	// step 2: factorization, in place on D
	kernel_spotrf_nt_l_4x4_lib4(km_, Am, Bm, D, D, inv_diag_D);

	}
4847 #endif
4848
4849
4850
4851 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Variable-size fused wrapper: first calls kernel_ssyrk_nt_l_4x4_vs_lib4 on
// (kp, Ap, Bp) with alpha=1 and beta=1, reading C and writing D; then calls
// kernel_spotrf_nt_l_4x4_vs_lib4 on (km_, Am, Bm) using D as both its input
// and its output block, which also fills inv_diag_D. km and kn are passed
// unchanged to both kernels.
void kernel_ssyrk_spotrf_nt_l_4x4_vs_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *inv_diag_D, int km, int kn)
	{

	float scale_ab = 1.0f;
	float scale_c = 1.0f;

	// step 1: rank-k style update of C into D
	kernel_ssyrk_nt_l_4x4_vs_lib4(kp, &scale_ab, Ap, Bp, &scale_c, C, D, km, kn);

	// step 2: factorization, in place on D
	kernel_spotrf_nt_l_4x4_vs_lib4(km_, Am, Bm, D, D, inv_diag_D, km, kn);

	}
4859 #endif
4860
4861
4862
4863 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER)
kernel_strsm_nt_rl_inv_4x4_lib4(int kmax,float * A,float * B,float * beta,float * C,float * D,float * E,float * inv_diag_E)4864 void kernel_strsm_nt_rl_inv_4x4_lib4(int kmax, float *A, float *B, float *beta, float *C, float *D, float *E, float *inv_diag_E)
4865 {
4866
4867 const int bs = 4;
4868
4869 float tmp;
4870
4871 #if defined(TARGET_GENERIC)
4872 float CC[16] = {0};
4873 #else
4874 ALIGNED( float CC[16], 64 ) = {0};
4875 #endif
4876
4877 float alpha1 = -1.0;
4878
4879 kernel_sgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, C, CC);
4880
4881 tmp = inv_diag_E[0];
4882 CC[0+bs*0] *= tmp;
4883 CC[1+bs*0] *= tmp;
4884 CC[2+bs*0] *= tmp;
4885 CC[3+bs*0] *= tmp;
4886
4887 tmp = E[1+bs*0];
4888 CC[0+bs*1] -= CC[0+bs*0] * tmp;
4889 CC[1+bs*1] -= CC[1+bs*0] * tmp;
4890 CC[2+bs*1] -= CC[2+bs*0] * tmp;
4891 CC[3+bs*1] -= CC[3+bs*0] * tmp;
4892 tmp = inv_diag_E[1];
4893 CC[0+bs*1] *= tmp;
4894 CC[1+bs*1] *= tmp;
4895 CC[2+bs*1] *= tmp;
4896 CC[3+bs*1] *= tmp;
4897
4898 tmp = E[2+bs*0];
4899 CC[0+bs*2] -= CC[0+bs*0] * tmp;
4900 CC[1+bs*2] -= CC[1+bs*0] * tmp;
4901 CC[2+bs*2] -= CC[2+bs*0] * tmp;
4902 CC[3+bs*2] -= CC[3+bs*0] * tmp;
4903 tmp = E[2+bs*1];
4904 CC[0+bs*2] -= CC[0+bs*1] * tmp;
4905 CC[1+bs*2] -= CC[1+bs*1] * tmp;
4906 CC[2+bs*2] -= CC[2+bs*1] * tmp;
4907 CC[3+bs*2] -= CC[3+bs*1] * tmp;
4908 tmp = inv_diag_E[2];
4909 CC[0+bs*2] *= tmp;
4910 CC[1+bs*2] *= tmp;
4911 CC[2+bs*2] *= tmp;
4912 CC[3+bs*2] *= tmp;
4913
4914 tmp = E[3+bs*0];
4915 CC[0+bs*3] -= CC[0+bs*0] * tmp;
4916 CC[1+bs*3] -= CC[1+bs*0] * tmp;
4917 CC[2+bs*3] -= CC[2+bs*0] * tmp;
4918 CC[3+bs*3] -= CC[3+bs*0] * tmp;
4919 tmp = E[3+bs*1];
4920 CC[0+bs*3] -= CC[0+bs*1] * tmp;
4921 CC[1+bs*3] -= CC[1+bs*1] * tmp;
4922 CC[2+bs*3] -= CC[2+bs*1] * tmp;
4923 CC[3+bs*3] -= CC[3+bs*1] * tmp;
4924 tmp = E[3+bs*2];
4925 CC[0+bs*3] -= CC[0+bs*2] * tmp;
4926 CC[1+bs*3] -= CC[1+bs*2] * tmp;
4927 CC[2+bs*3] -= CC[2+bs*2] * tmp;
4928 CC[3+bs*3] -= CC[3+bs*2] * tmp;
4929 tmp = inv_diag_E[3];
4930 CC[0+bs*3] *= tmp;
4931 CC[1+bs*3] *= tmp;
4932 CC[2+bs*3] *= tmp;
4933 CC[3+bs*3] *= tmp;
4934
4935 D[0+bs*0] = CC[0+bs*0];
4936 D[1+bs*0] = CC[1+bs*0];
4937 D[2+bs*0] = CC[2+bs*0];
4938 D[3+bs*0] = CC[3+bs*0];
4939
4940 D[0+bs*1] = CC[0+bs*1];
4941 D[1+bs*1] = CC[1+bs*1];
4942 D[2+bs*1] = CC[2+bs*1];
4943 D[3+bs*1] = CC[3+bs*1];
4944
4945 D[0+bs*2] = CC[0+bs*2];
4946 D[1+bs*2] = CC[1+bs*2];
4947 D[2+bs*2] = CC[2+bs*2];
4948 D[3+bs*2] = CC[3+bs*2];
4949
4950 D[0+bs*3] = CC[0+bs*3];
4951 D[1+bs*3] = CC[1+bs*3];
4952 D[2+bs*3] = CC[2+bs*3];
4953 D[3+bs*3] = CC[3+bs*3];
4954
4955 return;
4956
4957 }
4958 #endif
4959
4960
4961
4962 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
kernel_strsm_nt_rl_inv_4x4_vs_lib4(int kmax,float * A,float * B,float * beta,float * C,float * D,float * E,float * inv_diag_E,int km,int kn)4963 void kernel_strsm_nt_rl_inv_4x4_vs_lib4(int kmax, float *A, float *B, float *beta, float *C, float *D, float *E, float *inv_diag_E, int km, int kn)
4964 {
4965
4966 const int bs = 4;
4967
4968 float tmp;
4969
4970 #if defined(TARGET_GENERIC)
4971 float CC[16] = {0};
4972 #else
4973 ALIGNED( float CC[16], 64 ) = {0};
4974 #endif
4975
4976 float alpha1 = -1.0;
4977
4978 kernel_sgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, C, CC);
4979
4980 tmp = inv_diag_E[0];
4981 CC[0+bs*0] *= tmp;
4982 CC[1+bs*0] *= tmp;
4983 CC[2+bs*0] *= tmp;
4984 CC[3+bs*0] *= tmp;
4985
4986 if(kn==1)
4987 goto store;
4988
4989 tmp = E[1+bs*0];
4990 CC[0+bs*1] -= CC[0+bs*0] * tmp;
4991 CC[1+bs*1] -= CC[1+bs*0] * tmp;
4992 CC[2+bs*1] -= CC[2+bs*0] * tmp;
4993 CC[3+bs*1] -= CC[3+bs*0] * tmp;
4994 tmp = inv_diag_E[1];
4995 CC[0+bs*1] *= tmp;
4996 CC[1+bs*1] *= tmp;
4997 CC[2+bs*1] *= tmp;
4998 CC[3+bs*1] *= tmp;
4999
5000 if(kn==2)
5001 goto store;
5002
5003 tmp = E[2+bs*0];
5004 CC[0+bs*2] -= CC[0+bs*0] * tmp;
5005 CC[1+bs*2] -= CC[1+bs*0] * tmp;
5006 CC[2+bs*2] -= CC[2+bs*0] * tmp;
5007 CC[3+bs*2] -= CC[3+bs*0] * tmp;
5008 tmp = E[2+bs*1];
5009 CC[0+bs*2] -= CC[0+bs*1] * tmp;
5010 CC[1+bs*2] -= CC[1+bs*1] * tmp;
5011 CC[2+bs*2] -= CC[2+bs*1] * tmp;
5012 CC[3+bs*2] -= CC[3+bs*1] * tmp;
5013 tmp = inv_diag_E[2];
5014 CC[0+bs*2] *= tmp;
5015 CC[1+bs*2] *= tmp;
5016 CC[2+bs*2] *= tmp;
5017 CC[3+bs*2] *= tmp;
5018
5019 if(kn==3)
5020 goto store;
5021
5022 tmp = E[3+bs*0];
5023 CC[0+bs*3] -= CC[0+bs*0] * tmp;
5024 CC[1+bs*3] -= CC[1+bs*0] * tmp;
5025 CC[2+bs*3] -= CC[2+bs*0] * tmp;
5026 CC[3+bs*3] -= CC[3+bs*0] * tmp;
5027 tmp = E[3+bs*1];
5028 CC[0+bs*3] -= CC[0+bs*1] * tmp;
5029 CC[1+bs*3] -= CC[1+bs*1] * tmp;
5030 CC[2+bs*3] -= CC[2+bs*1] * tmp;
5031 CC[3+bs*3] -= CC[3+bs*1] * tmp;
5032 tmp = E[3+bs*2];
5033 CC[0+bs*3] -= CC[0+bs*2] * tmp;
5034 CC[1+bs*3] -= CC[1+bs*2] * tmp;
5035 CC[2+bs*3] -= CC[2+bs*2] * tmp;
5036 CC[3+bs*3] -= CC[3+bs*2] * tmp;
5037 tmp = inv_diag_E[3];
5038 CC[0+bs*3] *= tmp;
5039 CC[1+bs*3] *= tmp;
5040 CC[2+bs*3] *= tmp;
5041 CC[3+bs*3] *= tmp;
5042
5043 store:
5044
5045 if(km>=4)
5046 {
5047 D[0+bs*0] = CC[0+bs*0];
5048 D[1+bs*0] = CC[1+bs*0];
5049 D[2+bs*0] = CC[2+bs*0];
5050 D[3+bs*0] = CC[3+bs*0];
5051
5052 if(kn==1)
5053 return;
5054
5055 D[0+bs*1] = CC[0+bs*1];
5056 D[1+bs*1] = CC[1+bs*1];
5057 D[2+bs*1] = CC[2+bs*1];
5058 D[3+bs*1] = CC[3+bs*1];
5059
5060 if(kn==2)
5061 return;
5062
5063 D[0+bs*2] = CC[0+bs*2];
5064 D[1+bs*2] = CC[1+bs*2];
5065 D[2+bs*2] = CC[2+bs*2];
5066 D[3+bs*2] = CC[3+bs*2];
5067
5068 if(kn==3)
5069 return;
5070
5071 D[0+bs*3] = CC[0+bs*3];
5072 D[1+bs*3] = CC[1+bs*3];
5073 D[2+bs*3] = CC[2+bs*3];
5074 D[3+bs*3] = CC[3+bs*3];
5075 }
5076 else if(km>=3)
5077 {
5078 D[0+bs*0] = CC[0+bs*0];
5079 D[1+bs*0] = CC[1+bs*0];
5080 D[2+bs*0] = CC[2+bs*0];
5081
5082 if(kn==1)
5083 return;
5084
5085 D[0+bs*1] = CC[0+bs*1];
5086 D[1+bs*1] = CC[1+bs*1];
5087 D[2+bs*1] = CC[2+bs*1];
5088
5089 if(kn==2)
5090 return;
5091
5092 D[0+bs*2] = CC[0+bs*2];
5093 D[1+bs*2] = CC[1+bs*2];
5094 D[2+bs*2] = CC[2+bs*2];
5095
5096 if(kn==3)
5097 return;
5098
5099 D[0+bs*3] = CC[0+bs*3];
5100 D[1+bs*3] = CC[1+bs*3];
5101 D[2+bs*3] = CC[2+bs*3];
5102 }
5103 else if(km>=2)
5104 {
5105 D[0+bs*0] = CC[0+bs*0];
5106 D[1+bs*0] = CC[1+bs*0];
5107
5108 if(kn==1)
5109 return;
5110
5111 D[0+bs*1] = CC[0+bs*1];
5112 D[1+bs*1] = CC[1+bs*1];
5113
5114 if(kn==2)
5115 return;
5116
5117 D[0+bs*2] = CC[0+bs*2];
5118 D[1+bs*2] = CC[1+bs*2];
5119
5120 if(kn==3)
5121 return;
5122
5123 D[0+bs*3] = CC[0+bs*3];
5124 D[1+bs*3] = CC[1+bs*3];
5125 }
5126 else //if(km>=1)
5127 {
5128 D[0+bs*0] = CC[0+bs*0];
5129
5130 if(kn==1)
5131 return;
5132
5133 D[0+bs*1] = CC[0+bs*1];
5134
5135 if(kn==2)
5136 return;
5137
5138 D[0+bs*2] = CC[0+bs*2];
5139
5140 if(kn==3)
5141 return;
5142
5143 D[0+bs*3] = CC[0+bs*3];
5144 }
5145
5146 return;
5147
5148 }
5149 #endif
5150
5151
5152
5153 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Fused wrapper: first calls kernel_sgemm_nt_4x4_lib4 on (kp, Ap, Bp) with
// alpha=1 and beta=1, reading C and writing D; then calls
// kernel_strsm_nt_rl_inv_4x4_lib4 on (km_, Am, Bm) with beta=1, using D as
// both its input and its output block, together with E and inv_diag_E.
void kernel_sgemm_strsm_nt_rl_inv_4x4_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E)
	{

	float scale_ab = 1.0f;
	float scale_c = 1.0f;

	// step 1: accumulate the (Ap, Bp) product on top of C, result in D
	kernel_sgemm_nt_4x4_lib4(kp, &scale_ab, Ap, Bp, &scale_c, C, D);

	// step 2: substitution kernel, in place on D
	kernel_strsm_nt_rl_inv_4x4_lib4(km_, Am, Bm, &scale_c, D, D, E, inv_diag_E);

	}
5161 #endif
5162
5163
5164
5165 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Variable-size fused wrapper: first calls kernel_sgemm_nt_4x4_vs_lib4 on
// (kp, Ap, Bp) with alpha=1 and beta=1, reading C and writing D; then calls
// kernel_strsm_nt_rl_inv_4x4_vs_lib4 on (km_, Am, Bm) with beta=1, using D as
// both its input and its output block, together with E and inv_diag_E.
// km and kn are passed unchanged to both kernels.
void kernel_sgemm_strsm_nt_rl_inv_4x4_vs_lib4(int kp, float *Ap, float *Bp, int km_, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E, int km, int kn)
	{

	float scale_ab = 1.0f;
	float scale_c = 1.0f;

	// step 1: accumulate the (Ap, Bp) product on top of C, result in D
	kernel_sgemm_nt_4x4_vs_lib4(kp, &scale_ab, Ap, Bp, &scale_c, C, D, km, kn);

	// step 2: substitution kernel, in place on D
	kernel_strsm_nt_rl_inv_4x4_vs_lib4(km_, Am, Bm, &scale_c, D, D, E, inv_diag_E, km, kn);

	}
5173 #endif
5174
5175
5176
5177 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
kernel_strsm_nt_rl_one_4x4_lib4(int kmax,float * A,float * B,float * beta,float * C,float * D,float * E)5178 void kernel_strsm_nt_rl_one_4x4_lib4(int kmax, float *A, float *B, float *beta, float *C, float *D, float *E)
5179 {
5180
5181 const int bs = 4;
5182
5183 float tmp;
5184
5185 #if defined(TARGET_GENERIC)
5186 float CC[16] = {0};
5187 #else
5188 ALIGNED( float CC[16], 64 ) = {0};
5189 #endif
5190
5191 float alpha1 = -1.0;
5192
5193 kernel_sgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, C, CC);
5194
5195 tmp = E[1+bs*0];
5196 CC[0+bs*1] -= CC[0+bs*0] * tmp;
5197 CC[1+bs*1] -= CC[1+bs*0] * tmp;
5198 CC[2+bs*1] -= CC[2+bs*0] * tmp;
5199 CC[3+bs*1] -= CC[3+bs*0] * tmp;
5200
5201 tmp = E[2+bs*0];
5202 CC[0+bs*2] -= CC[0+bs*0] * tmp;
5203 CC[1+bs*2] -= CC[1+bs*0] * tmp;
5204 CC[2+bs*2] -= CC[2+bs*0] * tmp;
5205 CC[3+bs*2] -= CC[3+bs*0] * tmp;
5206 tmp = E[2+bs*1];
5207 CC[0+bs*2] -= CC[0+bs*1] * tmp;
5208 CC[1+bs*2] -= CC[1+bs*1] * tmp;
5209 CC[2+bs*2] -= CC[2+bs*1] * tmp;
5210 CC[3+bs*2] -= CC[3+bs*1] * tmp;
5211
5212 tmp = E[3+bs*0];
5213 CC[0+bs*3] -= CC[0+bs*0] * tmp;
5214 CC[1+bs*3] -= CC[1+bs*0] * tmp;
5215 CC[2+bs*3] -= CC[2+bs*0] * tmp;
5216 CC[3+bs*3] -= CC[3+bs*0] * tmp;
5217 tmp = E[3+bs*1];
5218 CC[0+bs*3] -= CC[0+bs*1] * tmp;
5219 CC[1+bs*3] -= CC[1+bs*1] * tmp;
5220 CC[2+bs*3] -= CC[2+bs*1] * tmp;
5221 CC[3+bs*3] -= CC[3+bs*1] * tmp;
5222 tmp = E[3+bs*2];
5223 CC[0+bs*3] -= CC[0+bs*2] * tmp;
5224 CC[1+bs*3] -= CC[1+bs*2] * tmp;
5225 CC[2+bs*3] -= CC[2+bs*2] * tmp;
5226 CC[3+bs*3] -= CC[3+bs*2] * tmp;
5227
5228 D[0+bs*0] = CC[0+bs*0];
5229 D[1+bs*0] = CC[1+bs*0];
5230 D[2+bs*0] = CC[2+bs*0];
5231 D[3+bs*0] = CC[3+bs*0];
5232
5233 D[0+bs*1] = CC[0+bs*1];
5234 D[1+bs*1] = CC[1+bs*1];
5235 D[2+bs*1] = CC[2+bs*1];
5236 D[3+bs*1] = CC[3+bs*1];
5237
5238 D[0+bs*2] = CC[0+bs*2];
5239 D[1+bs*2] = CC[1+bs*2];
5240 D[2+bs*2] = CC[2+bs*2];
5241 D[3+bs*2] = CC[3+bs*2];
5242
5243 D[0+bs*3] = CC[0+bs*3];
5244 D[1+bs*3] = CC[1+bs*3];
5245 D[2+bs*3] = CC[2+bs*3];
5246 D[3+bs*3] = CC[3+bs*3];
5247
5248 return;
5249
5250 }
5251 #endif
5252
5253
5254
5255 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
kernel_strsm_nt_rl_one_4x4_vs_lib4(int kmax,float * A,float * B,float * beta,float * C,float * D,float * E,int km,int kn)5256 void kernel_strsm_nt_rl_one_4x4_vs_lib4(int kmax, float *A, float *B, float *beta, float *C, float *D, float *E, int km, int kn)
5257 {
5258
5259 const int bs = 4;
5260
5261 float tmp;
5262
5263 #if defined(TARGET_GENERIC)
5264 float CC[16] = {0};
5265 #else
5266 ALIGNED( float CC[16], 64 ) = {0};
5267 #endif
5268
5269 float alpha1 = -1.0;
5270
5271 kernel_sgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, C, CC);
5272
5273 if(kn==1)
5274 goto store;
5275
5276 tmp = E[1+bs*0];
5277 CC[0+bs*1] -= CC[0+bs*0] * tmp;
5278 CC[1+bs*1] -= CC[1+bs*0] * tmp;
5279 CC[2+bs*1] -= CC[2+bs*0] * tmp;
5280 CC[3+bs*1] -= CC[3+bs*0] * tmp;
5281
5282 if(kn==2)
5283 goto store;
5284
5285 tmp = E[2+bs*0];
5286 CC[0+bs*2] -= CC[0+bs*0] * tmp;
5287 CC[1+bs*2] -= CC[1+bs*0] * tmp;
5288 CC[2+bs*2] -= CC[2+bs*0] * tmp;
5289 CC[3+bs*2] -= CC[3+bs*0] * tmp;
5290 tmp = E[2+bs*1];
5291 CC[0+bs*2] -= CC[0+bs*1] * tmp;
5292 CC[1+bs*2] -= CC[1+bs*1] * tmp;
5293 CC[2+bs*2] -= CC[2+bs*1] * tmp;
5294 CC[3+bs*2] -= CC[3+bs*1] * tmp;
5295
5296 if(kn==3)
5297 goto store;
5298
5299 tmp = E[3+bs*0];
5300 CC[0+bs*3] -= CC[0+bs*0] * tmp;
5301 CC[1+bs*3] -= CC[1+bs*0] * tmp;
5302 CC[2+bs*3] -= CC[2+bs*0] * tmp;
5303 CC[3+bs*3] -= CC[3+bs*0] * tmp;
5304 tmp = E[3+bs*1];
5305 CC[0+bs*3] -= CC[0+bs*1] * tmp;
5306 CC[1+bs*3] -= CC[1+bs*1] * tmp;
5307 CC[2+bs*3] -= CC[2+bs*1] * tmp;
5308 CC[3+bs*3] -= CC[3+bs*1] * tmp;
5309 tmp = E[3+bs*2];
5310 CC[0+bs*3] -= CC[0+bs*2] * tmp;
5311 CC[1+bs*3] -= CC[1+bs*2] * tmp;
5312 CC[2+bs*3] -= CC[2+bs*2] * tmp;
5313 CC[3+bs*3] -= CC[3+bs*2] * tmp;
5314
5315 store:
5316
5317 if(km>=4)
5318 {
5319 D[0+bs*0] = CC[0+bs*0];
5320 D[1+bs*0] = CC[1+bs*0];
5321 D[2+bs*0] = CC[2+bs*0];
5322 D[3+bs*0] = CC[3+bs*0];
5323
5324 if(kn==1)
5325 return;
5326
5327 D[0+bs*1] = CC[0+bs*1];
5328 D[1+bs*1] = CC[1+bs*1];
5329 D[2+bs*1] = CC[2+bs*1];
5330 D[3+bs*1] = CC[3+bs*1];
5331
5332 if(kn==2)
5333 return;
5334
5335 D[0+bs*2] = CC[0+bs*2];
5336 D[1+bs*2] = CC[1+bs*2];
5337 D[2+bs*2] = CC[2+bs*2];
5338 D[3+bs*2] = CC[3+bs*2];
5339
5340 if(kn==3)
5341 return;
5342
5343 D[0+bs*3] = CC[0+bs*3];
5344 D[1+bs*3] = CC[1+bs*3];
5345 D[2+bs*3] = CC[2+bs*3];
5346 D[3+bs*3] = CC[3+bs*3];
5347 }
5348 else if(km>=3)
5349 {
5350 D[0+bs*0] = CC[0+bs*0];
5351 D[1+bs*0] = CC[1+bs*0];
5352 D[2+bs*0] = CC[2+bs*0];
5353
5354 if(kn==1)
5355 return;
5356
5357 D[0+bs*1] = CC[0+bs*1];
5358 D[1+bs*1] = CC[1+bs*1];
5359 D[2+bs*1] = CC[2+bs*1];
5360
5361 if(kn==2)
5362 return;
5363
5364 D[0+bs*2] = CC[0+bs*2];
5365 D[1+bs*2] = CC[1+bs*2];
5366 D[2+bs*2] = CC[2+bs*2];
5367
5368 if(kn==3)
5369 return;
5370
5371 D[0+bs*3] = CC[0+bs*3];
5372 D[1+bs*3] = CC[1+bs*3];
5373 D[2+bs*3] = CC[2+bs*3];
5374 }
5375 else if(km>=2)
5376 {
5377 D[0+bs*0] = CC[0+bs*0];
5378 D[1+bs*0] = CC[1+bs*0];
5379
5380 if(kn==1)
5381 return;
5382
5383 D[0+bs*1] = CC[0+bs*1];
5384 D[1+bs*1] = CC[1+bs*1];
5385
5386 if(kn==2)
5387 return;
5388
5389 D[0+bs*2] = CC[0+bs*2];
5390 D[1+bs*2] = CC[1+bs*2];
5391
5392 if(kn==3)
5393 return;
5394
5395 D[0+bs*3] = CC[0+bs*3];
5396 D[1+bs*3] = CC[1+bs*3];
5397 }
5398 else //if(km>=1)
5399 {
5400 D[0+bs*0] = CC[0+bs*0];
5401
5402 if(kn==1)
5403 return;
5404
5405 D[0+bs*1] = CC[0+bs*1];
5406
5407 if(kn==2)
5408 return;
5409
5410 D[0+bs*2] = CC[0+bs*2];
5411
5412 if(kn==3)
5413 return;
5414
5415 D[0+bs*3] = CC[0+bs*3];
5416 }
5417
5418 return;
5419
5420 }
5421 #endif
5422
5423
5424
5425 #if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
kernel_strsm_nt_ru_inv_4x4_lib4(int kmax,float * A,float * B,float * beta,float * C,float * D,float * E,float * inv_diag_E)5426 void kernel_strsm_nt_ru_inv_4x4_lib4(int kmax, float *A, float *B, float *beta, float *C, float *D, float *E, float *inv_diag_E)
5427 {
5428
5429 const int bs = 4;
5430
5431 float tmp;
5432
5433 #if defined(TARGET_GENERIC)
5434 float CC[16] = {0};
5435 #else
5436 ALIGNED( float CC[16], 64 ) = {0};
5437 #endif
5438
5439 float alpha1 = -1.0;
5440
5441 kernel_sgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, C, CC);
5442
5443 tmp = inv_diag_E[3];
5444 CC[0+bs*3] *= tmp;
5445 CC[1+bs*3] *= tmp;
5446 CC[2+bs*3] *= tmp;
5447 CC[3+bs*3] *= tmp;
5448 tmp = E[2+bs*3];
5449 CC[0+bs*2] -= CC[0+bs*3] * tmp;
5450 CC[1+bs*2] -= CC[1+bs*3] * tmp;
5451 CC[2+bs*2] -= CC[2+bs*3] * tmp;
5452 CC[3+bs*2] -= CC[3+bs*3] * tmp;
5453 tmp = E[1+bs*3];
5454 CC[0+bs*1] -= CC[0+bs*3] * tmp;
5455 CC[1+bs*1] -= CC[1+bs*3] * tmp;
5456 CC[2+bs*1] -= CC[2+bs*3] * tmp;
5457 CC[3+bs*1] -= CC[3+bs*3] * tmp;
5458 tmp = E[0+bs*3];
5459 CC[0+bs*0] -= CC[0+bs*3] * tmp;
5460 CC[1+bs*0] -= CC[1+bs*3] * tmp;
5461 CC[2+bs*0] -= CC[2+bs*3] * tmp;
5462 CC[3+bs*0] -= CC[3+bs*3] * tmp;
5463
5464 tmp = inv_diag_E[2];
5465 CC[0+bs*2] *= tmp;
5466 CC[1+bs*2] *= tmp;
5467 CC[2+bs*2] *= tmp;
5468 CC[3+bs*2] *= tmp;
5469 tmp = E[1+bs*2];
5470 CC[0+bs*1] -= CC[0+bs*2] * tmp;
5471 CC[1+bs*1] -= CC[1+bs*2] * tmp;
5472 CC[2+bs*1] -= CC[2+bs*2] * tmp;
5473 CC[3+bs*1] -= CC[3+bs*2] * tmp;
5474 tmp = E[0+bs*2];
5475 CC[0+bs*0] -= CC[0+bs*2] * tmp;
5476 CC[1+bs*0] -= CC[1+bs*2] * tmp;
5477 CC[2+bs*0] -= CC[2+bs*2] * tmp;
5478 CC[3+bs*0] -= CC[3+bs*2] * tmp;
5479
5480 tmp = inv_diag_E[1];
5481 CC[0+bs*1] *= tmp;
5482 CC[1+bs*1] *= tmp;
5483 CC[2+bs*1] *= tmp;
5484 CC[3+bs*1] *= tmp;
5485 tmp = E[0+bs*1];
5486 CC[0+bs*0] -= CC[0+bs*1] * tmp;
5487 CC[1+bs*0] -= CC[1+bs*1] * tmp;
5488 CC[2+bs*0] -= CC[2+bs*1] * tmp;
5489 CC[3+bs*0] -= CC[3+bs*1] * tmp;
5490
5491 tmp = inv_diag_E[0];
5492 CC[0+bs*0] *= tmp;
5493 CC[1+bs*0] *= tmp;
5494 CC[2+bs*0] *= tmp;
5495 CC[3+bs*0] *= tmp;
5496
5497 D[0+bs*0] = CC[0+bs*0];
5498 D[1+bs*0] = CC[1+bs*0];
5499 D[2+bs*0] = CC[2+bs*0];
5500 D[3+bs*0] = CC[3+bs*0];
5501
5502 D[0+bs*1] = CC[0+bs*1];
5503 D[1+bs*1] = CC[1+bs*1];
5504 D[2+bs*1] = CC[2+bs*1];
5505 D[3+bs*1] = CC[3+bs*1];
5506
5507 D[0+bs*2] = CC[0+bs*2];
5508 D[1+bs*2] = CC[1+bs*2];
5509 D[2+bs*2] = CC[2+bs*2];
5510 D[3+bs*2] = CC[3+bs*2];
5511
5512 D[0+bs*3] = CC[0+bs*3];
5513 D[1+bs*3] = CC[1+bs*3];
5514 D[2+bs*3] = CC[2+bs*3];
5515 D[3+bs*3] = CC[3+bs*3];
5516
5517 return;
5518
5519 }
5520 #endif
5521
5522
5523
#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// 4x4 triangular-solve kernel with variable-size store.
//
// Element (i,j) of a 4x4 block is addressed as index i+bs*j, with bs=4.
//
// Steps:
//   1. CC is filled by kernel_sgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, C, CC)
//      with alpha1 = -1.
//      NOTE(review): presumably CC = beta*C - A*B^T; confirm against that kernel.
//   2. Backward substitution over the columns of CC, last column first:
//      column j is scaled by inv_diag_E[j], then subtracted from every column
//      l<j with weight E[l+bs*j]. Column 0 is always processed; a column j>0
//      is processed only when kn>j.
//   3. The top-left corner of CC is copied to D (see the store section for
//      how km and kn are mapped to row/column counts).
void kernel_strsm_nt_ru_inv_4x4_vs_lib4(int kmax, float *A, float *B, float *beta, float *C, float *D, float *E, float *inv_diag_E, int km, int kn)
	{

	const int bs = 4;

	int ii, jj, ll;
	int n_rows, n_cols;

	float tmp;

#if defined(TARGET_GENERIC)
	float CC[16] = {0};
#else
	ALIGNED( float CC[16], 64 ) = {0};
#endif

	float alpha1 = -1.0;

	kernel_sgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, C, CC);

	// backward substitution, last column first
	for(jj=3; jj>=0; jj--)
		{
		// columns at index kn or beyond are skipped, except column 0
		if(jj>0 && kn<=jj)
			{
			continue;
			}
		tmp = inv_diag_E[jj];
		for(ii=0; ii<4; ii++)
			{
			CC[ii+bs*jj] *= tmp;
			}
		// remove the solved column from the columns to its left
		for(ll=jj-1; ll>=0; ll--)
			{
			tmp = E[ll+bs*jj];
			for(ii=0; ii<4; ii++)
				{
				CC[ii+bs*ll] -= CC[ii+bs*jj] * tmp;
				}
			}
		}

	// store: km>=4 writes 4 rows, km==3 three, km==2 two, anything else one;
	// kn in 1..3 writes kn columns, any other value writes all four
	n_rows = km>=4 ? 4 : (km>=3 ? 3 : (km>=2 ? 2 : 1));
	n_cols = (kn>=1 && kn<=3) ? kn : 4;
	for(jj=0; jj<n_cols; jj++)
		{
		for(ii=0; ii<n_rows; ii++)
			{
			D[ii+bs*jj] = CC[ii+bs*jj];
			}
		}

	return;

	}
#endif
5713
5714
5715
#if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// 4x4 factorization kernel (elimination without row exchanges).
//
// Element (i,j) of a 4x4 block is addressed as index i+bs*j, with bs=4.
//
// Steps:
//   1. CC is filled by kernel_sgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb,
//      &beta1, C, CC) with alpha1 = -1 and beta1 = 1.
//      NOTE(review): presumably CC = C - A*B; confirm against that kernel.
//   2. Column by column, the contributions of the previous columns are
//      subtracted from the entries below their pivots, then the entries
//      below the diagonal of the current column are multiplied by the
//      reciprocal of its diagonal entry. That reciprocal is written to
//      inv_diag_D[j]. No check is made for a zero diagonal entry.
//   3. All 16 entries of CC are copied to D.
void kernel_sgetrf_nn_4x4_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *inv_diag_D)
	{

	const int bs = 4;

	int ii, jj, ll;

	float tmp;

#if defined(TARGET_GENERIC)
	float CC[16] = {0};
#else
	ALIGNED( float CC[16], 64 ) = {0};
#endif

	float alpha1 = -1.0;
	float beta1 = 1.0;

	kernel_sgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, &beta1, C, CC);

	// factorization
	for(jj=0; jj<4; jj++)
		{
		// apply the eliminations of columns 0..jj-1 to column jj
		for(ll=0; ll<jj; ll++)
			{
			for(ii=ll+1; ii<4; ii++)
				{
				CC[ii+bs*jj] -= CC[ii+bs*ll] * CC[ll+bs*jj];
				}
			}
		// scale the sub-diagonal part by the reciprocal of the diagonal
		tmp = 1.0 / CC[jj+bs*jj];
		for(ii=jj+1; ii<4; ii++)
			{
			CC[ii+bs*jj] *= tmp;
			}
		inv_diag_D[jj] = tmp;
		}

	// store the full 4x4 block
	for(ii=0; ii<bs*bs; ii++)
		{
		D[ii] = CC[ii];
		}

	return;

	}
#endif
5809
5810
5811
#if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// 4x4 factorization kernel (elimination without row exchanges) with
// variable-size store.
//
// Element (i,j) of a 4x4 block is addressed as index i+bs*j, with bs=4.
//
// Steps:
//   1. CC is filled by kernel_sgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb,
//      &beta1, C, CC) with alpha1 = -1 and beta1 = 1.
//      NOTE(review): presumably CC = C - A*B; confirm against that kernel.
//   2. Column by column, the contributions of the previous columns are
//      subtracted, then the entries below the diagonal are multiplied by the
//      reciprocal of the diagonal entry, which is written to inv_diag_D[j].
//      Processing stops after column kn-1 when kn is 1, 2 or 3, so
//      inv_diag_D[j] is only written for the processed columns. All four
//      rows take part regardless of km. No check is made for a zero
//      diagonal entry.
//   3. The top-left corner of CC is copied to D (see the store section for
//      how km and kn are mapped to row/column counts).
void kernel_sgetrf_nn_4x4_vs_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *inv_diag_D, int km, int kn)
	{

	const int bs = 4;

	int ii, jj, ll;
	int n_rows, n_cols;

	float tmp;

#if defined(TARGET_GENERIC)
	float CC[16] = {0};
#else
	ALIGNED( float CC[16], 64 ) = {0};
#endif

	float alpha1 = -1.0;
	float beta1 = 1.0;

	kernel_sgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, &beta1, C, CC);

	// factorization
	for(jj=0; jj<4; jj++)
		{
		// apply the eliminations of columns 0..jj-1 to column jj
		for(ll=0; ll<jj; ll++)
			{
			for(ii=ll+1; ii<4; ii++)
				{
				CC[ii+bs*jj] -= CC[ii+bs*ll] * CC[ll+bs*jj];
				}
			}
		// scale the sub-diagonal part by the reciprocal of the diagonal
		tmp = 1.0 / CC[jj+bs*jj];
		for(ii=jj+1; ii<4; ii++)
			{
			CC[ii+bs*jj] *= tmp;
			}
		inv_diag_D[jj] = tmp;
		// the remaining columns are not requested
		if(kn==jj+1)
			{
			break;
			}
		}

	// store: km>=4 writes 4 rows, km==3 three, km==2 two, anything else one;
	// kn in 1..3 writes kn columns, any other value writes all four
	n_rows = km>=4 ? 4 : (km>=3 ? 3 : (km>=2 ? 2 : 1));
	n_cols = (kn>=1 && kn<=3) ? kn : 4;
	for(jj=0; jj<n_cols; jj++)
		{
		for(ii=0; ii<n_rows; ii++)
			{
			D[ii+bs*jj] = CC[ii+bs*jj];
			}
		}

	return;

	}
#endif
5997
5998
5999
#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// 4x4 triangular-solve kernel without diagonal scaling.
//
// Element (i,j) of a 4x4 block is addressed as index i+bs*j, with bs=4.
//
// Steps:
//   1. acc is filled by kernel_sgemm_nt_4x4_lib4(kmax, &neg_one, A, B, beta,
//      C, acc) with neg_one = -1.
//      NOTE(review): presumably acc = beta*C - A*B^T; confirm against that kernel.
//   2. Backward substitution over the columns of acc, last column first:
//      column j is subtracted from every column l<j with weight E[l+bs*j].
//      The diagonal of E is never read and no scaling is applied.
//   3. All 16 entries of acc are copied to D.
void kernel_strsm_nt_ru_one_4x4_lib4(int kmax, float *A, float *B, float *beta, float *C, float *D, float *E)
	{

	const int bs = 4;

	int row, src, dst, idx;

	float weight;

#if defined(TARGET_GENERIC)
	float acc[16] = {0};
#else
	ALIGNED( float acc[16], 64 ) = {0};
#endif

	float neg_one = -1.0;

	kernel_sgemm_nt_4x4_lib4(kmax, &neg_one, A, B, beta, C, acc);

	// src walks the already-final columns from right to left,
	// dst walks the columns to the left of src, also right to left
	for(src=3; src>=1; src--)
		{
		for(dst=src-1; dst>=0; dst--)
			{
			weight = E[dst+bs*src];
			for(row=0; row<4; row++)
				{
				acc[row+bs*dst] -= acc[row+bs*src] * weight;
				}
			}
		}

	// store the full 4x4 block
	for(idx=0; idx<bs*bs; idx++)
		{
		D[idx] = acc[idx];
		}

	return;

	}
#endif
6076
6077
6078
#if defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// 4x4 triangular-solve kernel without diagonal scaling, variable-size store.
//
// Element (i,j) of a 4x4 block is addressed as index i+bs*j, with bs=4.
//
// Steps:
//   1. CC is filled by kernel_sgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, C, CC)
//      with alpha1 = -1.
//      NOTE(review): presumably CC = beta*C - A*B^T; confirm against that kernel.
//   2. Backward substitution over the columns of CC, last column first:
//      column j (only when kn>j) is subtracted from every column l<j with
//      weight E[l+bs*j]. The diagonal of E is never read and no scaling is
//      applied.
//   3. The top-left corner of CC is copied to D (see the store section for
//      how km and kn are mapped to row/column counts).
void kernel_strsm_nt_ru_one_4x4_vs_lib4(int kmax, float *A, float *B, float *beta, float *C, float *D, float *E, int km, int kn)
	{

	const int bs = 4;

	int ii, jj, ll;
	int n_rows, n_cols;

	float tmp;

#if defined(TARGET_GENERIC)
	float CC[16] = {0};
#else
	ALIGNED( float CC[16], 64 ) = {0};
#endif

	float alpha1 = -1.0;

	kernel_sgemm_nt_4x4_lib4(kmax, &alpha1, A, B, beta, C, CC);

	// backward substitution, last column first
	for(jj=3; jj>=1; jj--)
		{
		// columns at index kn or beyond are not part of the solve
		if(kn>jj)
			{
			for(ll=jj-1; ll>=0; ll--)
				{
				tmp = E[ll+bs*jj];
				for(ii=0; ii<4; ii++)
					{
					CC[ii+bs*ll] -= CC[ii+bs*jj] * tmp;
					}
				}
			}
		}

	// store: km>=4 writes 4 rows, km==3 three, km==2 two, anything else one;
	// kn in 1..3 writes kn columns, any other value writes all four
	n_rows = km>=4 ? 4 : (km>=3 ? 3 : (km>=2 ? 2 : 1));
	n_cols = (kn>=1 && kn<=3) ? kn : 4;
	for(jj=0; jj<n_cols; jj++)
		{
		for(ii=0; ii<n_rows; ii++)
			{
			D[ii+bs*jj] = CC[ii+bs*jj];
			}
		}

	return;

	}
#endif
6247
6248
6249
#if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// 4x4 triangular-solve kernel acting on the rows of the block, without
// diagonal scaling.
//
// Element (i,j) of a 4x4 block is addressed as index i+bs*j, with bs=4.
//
// Steps:
//   1. CC is filled by kernel_sgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb,
//      &beta1, C, CC) with alpha1 = -1 and beta1 = 1.
//      NOTE(review): presumably CC = C - A*B; confirm against that kernel.
//   2. Forward substitution over the rows of CC, first row first: row l is
//      subtracted from every row i>l with weight E[i+bs*l]. The diagonal of
//      E is never read and no scaling is applied.
//   3. All 16 entries of CC are copied to D.
void kernel_strsm_nn_ll_one_4x4_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E)
	{

	const int bs = 4;

	int ii, jj, ll;

	float e_il;

#if defined(TARGET_GENERIC)
	float CC[16] = {0};
#else
	ALIGNED( float CC[16], 64 ) = {0};
#endif

	float alpha1 = -1.0;
	float beta1 = 1.0;

	kernel_sgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, &beta1, C, CC);

	// solution
	for(ll=0; ll<3; ll++)
		{
		// row ll is final here; remove it from the rows below
		for(ii=ll+1; ii<4; ii++)
			{
			e_il = E[ii+bs*ll];
			for(jj=0; jj<4; jj++)
				{
				CC[ii+bs*jj] -= e_il * CC[ll+bs*jj];
				}
			}
		}

	// store the full 4x4 block
	for(ii=0; ii<bs*bs; ii++)
		{
		D[ii] = CC[ii];
		}

	return;

	}
#endif
6332
6333
6334
#if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// 4x4 triangular-solve kernel acting on the rows of the block, without
// diagonal scaling, variable-size store.
//
// Element (i,j) of a 4x4 block is addressed as index i+bs*j, with bs=4.
//
// Steps:
//   1. CC is filled by kernel_sgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb,
//      &beta1, C, CC) with alpha1 = -1 and beta1 = 1.
//      NOTE(review): presumably CC = C - A*B; confirm against that kernel.
//   2. Forward substitution over the rows of CC, first row first: row l is
//      subtracted from every row i>l with weight E[i+bs*l]. The sweep stops
//      before step l when km==l+1 (km equal to 1, 2 or 3). Within a step
//      that does run, all rows below l are updated, whatever km is. The
//      diagonal of E is never read and no scaling is applied.
//   3. The top-left corner of CC is copied to D (see the store section for
//      how km and kn are mapped to row/column counts).
void kernel_strsm_nn_ll_one_4x4_vs_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, int km, int kn)
	{

	const int bs = 4;

	int ii, jj, ll;
	int n_rows, n_cols;

	float e_il;

#if defined(TARGET_GENERIC)
	float CC[16] = {0};
#else
	ALIGNED( float CC[16], 64 ) = {0};
#endif

	float alpha1 = -1.0;
	float beta1 = 1.0;

	kernel_sgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, &beta1, C, CC);

	// solution
	for(ll=0; ll<3; ll++)
		{
		// rows below ll are not requested: nothing left to eliminate
		if(km==ll+1)
			{
			break;
			}
		for(ii=ll+1; ii<4; ii++)
			{
			e_il = E[ii+bs*ll];
			for(jj=0; jj<4; jj++)
				{
				CC[ii+bs*jj] -= e_il * CC[ll+bs*jj];
				}
			}
		}

	// store: km>=4 writes 4 rows, km==3 three, km==2 two, anything else one;
	// kn in 1..3 writes kn columns, any other value writes all four
	n_rows = km>=4 ? 4 : (km>=3 ? 3 : (km>=2 ? 2 : 1));
	n_cols = (kn>=1 && kn<=3) ? kn : 4;
	for(jj=0; jj<n_cols; jj++)
		{
		for(ii=0; ii<n_rows; ii++)
			{
			D[ii+bs*jj] = CC[ii+bs*jj];
			}
		}

	return;

	}
#endif
6509
6510
6511
#if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// 4x4 triangular-solve kernel acting on the columns of the block, with a
// precomputed reciprocal diagonal.
//
// Element (i,j) of a 4x4 block is addressed as index i+bs*j, with bs=4.
//
// Steps:
//   1. CC is filled by kernel_sgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb,
//      beta, C, CC) with alpha1 = -1.
//      NOTE(review): presumably CC = beta*C - A*B; confirm against that kernel.
//   2. Forward substitution over the columns of CC, first column first:
//      from column j every column l<j is subtracted with weight E[l+bs*j],
//      then column j is multiplied by inv_diag_E[j]. The diagonal of E
//      itself is never read.
//   3. All 16 entries of CC are copied to D.
void kernel_strsm_nn_ru_inv_4x4_lib4(int kmax, float *A, float *B, int sdb, float *beta, float *C, float *D, float *E, float *inv_diag_E)
	{

	const int bs = 4;

	int ii, jj, ll;

	float e_lj, e_jj;

#if defined(TARGET_GENERIC)
	float CC[16] = {0};
#else
	ALIGNED( float CC[16], 64 ) = {0};
#endif

	float alpha1 = -1.0;

	kernel_sgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, beta, C, CC);

	// solve
	for(jj=0; jj<4; jj++)
		{
		// remove the already solved columns 0..jj-1
		for(ll=0; ll<jj; ll++)
			{
			e_lj = E[ll+bs*jj];
			for(ii=0; ii<4; ii++)
				{
				CC[ii+bs*jj] -= CC[ii+bs*ll] * e_lj;
				}
			}
		// scale by the reciprocal of the diagonal entry
		e_jj = inv_diag_E[jj];
		for(ii=0; ii<4; ii++)
			{
			CC[ii+bs*jj] *= e_jj;
			}
		}

	// store the full 4x4 block
	for(ii=0; ii<bs*bs; ii++)
		{
		D[ii] = CC[ii];
		}

	return;

	}
#endif
6617
6618
6619
#if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// 4x4 triangular-solve kernel acting on the columns of the block, with a
// precomputed reciprocal diagonal and variable-size store.
//
// Element (i,j) of a 4x4 block is addressed as index i+bs*j, with bs=4.
//
// Steps:
//   1. CC is filled by kernel_sgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb,
//      beta, C, CC) with alpha1 = -1.
//      NOTE(review): presumably CC = beta*C - A*B; confirm against that kernel.
//   2. Forward substitution over the columns of CC, first column first:
//      from column j every column l<j is subtracted with weight E[l+bs*j],
//      then column j is multiplied by inv_diag_E[j]. Processing stops after
//      column kn-1 when kn is 1, 2 or 3, so E and inv_diag_E are only read
//      for the processed columns.
//   3. The top-left corner of CC is copied to D (see the store section for
//      how km and kn are mapped to row/column counts).
void kernel_strsm_nn_ru_inv_4x4_vs_lib4(int kmax, float *A, float *B, int sdb, float *beta, float *C, float *D, float *E, float *inv_diag_E, int km, int kn)
	{

	const int bs = 4;

	int ii, jj, ll;
	int n_rows, n_cols;

	float e_lj, e_jj;

#if defined(TARGET_GENERIC)
	float CC[16] = {0};
#else
	ALIGNED( float CC[16], 64 ) = {0};
#endif

	float alpha1 = -1.0;

	kernel_sgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, beta, C, CC);

	// solve
	for(jj=0; jj<4; jj++)
		{
		// remove the already solved columns 0..jj-1
		for(ll=0; ll<jj; ll++)
			{
			e_lj = E[ll+bs*jj];
			for(ii=0; ii<4; ii++)
				{
				CC[ii+bs*jj] -= CC[ii+bs*ll] * e_lj;
				}
			}
		// scale by the reciprocal of the diagonal entry
		e_jj = inv_diag_E[jj];
		for(ii=0; ii<4; ii++)
			{
			CC[ii+bs*jj] *= e_jj;
			}
		// the remaining columns are not requested
		if(kn==jj+1)
			{
			break;
			}
		}

	// store: km>=4 writes 4 rows, km==3 three, km==2 two, anything else one;
	// kn in 1..3 writes kn columns, any other value writes all four
	n_rows = km>=4 ? 4 : (km>=3 ? 3 : (km>=2 ? 2 : 1));
	n_cols = (kn>=1 && kn<=3) ? kn : 4;
	for(jj=0; jj<n_cols; jj++)
		{
		for(ii=0; ii<n_rows; ii++)
			{
			D[ii+bs*jj] = CC[ii+bs*jj];
			}
		}

	return;

	}
#endif
6817
6818
6819
#if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// 4x4 triangular-solve kernel acting on the rows of the block, with a
// precomputed reciprocal diagonal.
//
// Element (i,j) of a 4x4 block is addressed as index i+bs*j, with bs=4.
//
// Steps:
//   1. CC is filled by kernel_sgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb,
//      &beta1, C, CC) with alpha1 = -1 and beta1 = 1.
//      NOTE(review): presumably CC = C - A*B; confirm against that kernel.
//   2. Backward substitution over the rows of CC, last row first: row l is
//      multiplied by inv_diag_E[l], then subtracted from every row i<l with
//      weight E[i+bs*l]. The diagonal of E itself is never read.
//   3. All 16 entries of CC are copied to D.
void kernel_strsm_nn_lu_inv_4x4_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, float *inv_diag_E)
	{

	const int bs = 4;

	int ii, jj, ll;

	float e_il, e_ll;

#if defined(TARGET_GENERIC)
	float CC[16] = {0};
#else
	ALIGNED( float CC[16], 64 ) = {0};
#endif

	float alpha1 = -1.0;
	float beta1 = 1.0;

	kernel_sgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, &beta1, C, CC);

	// solve
	for(ll=3; ll>=0; ll--)
		{
		// scale row ll by the reciprocal of the diagonal entry
		e_ll = inv_diag_E[ll];
		for(jj=0; jj<4; jj++)
			{
			CC[ll+bs*jj] *= e_ll;
			}
		// remove the solved row from the rows above it
		for(ii=0; ii<ll; ii++)
			{
			e_il = E[ii+bs*ll];
			for(jj=0; jj<4; jj++)
				{
				CC[ii+bs*jj] -= e_il * CC[ll+bs*jj];
				}
			}
		}

	// store the full 4x4 block
	for(ii=0; ii<bs*bs; ii++)
		{
		D[ii] = CC[ii];
		}

	return;

	}
#endif
6928
6929
6930
6931 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
kernel_strsm_nn_lu_inv_4x4_vs_lib4(int kmax,float * A,float * B,int sdb,float * C,float * D,float * E,float * inv_diag_E,int km,int kn)6932 void kernel_strsm_nn_lu_inv_4x4_vs_lib4(int kmax, float *A, float *B, int sdb, float *C, float *D, float *E, float *inv_diag_E, int km, int kn)
6933 {
6934
6935 const int bs = 4;
6936
6937 int k;
6938
6939 float
6940 tmp,
6941 a_0, a_1, a_2, a_3,
6942 b_0, b_1, b_2, b_3,
6943 e_00, e_01, e_02, e_03,
6944 e_11, e_12, e_13,
6945 e_22, e_23,
6946 e_33;
6947
6948 #if defined(TARGET_GENERIC)
6949 float CC[16] = {0};
6950 #else
6951 ALIGNED( float CC[16], 64 ) = {0};
6952 #endif
6953
6954 float alpha1 = -1.0;
6955 float beta1 = 1.0;
6956
6957 kernel_sgemm_nn_4x4_lib4(kmax, &alpha1, A, 0, B, sdb, &beta1, C, CC);
6958
6959 // solve
6960
6961 if(km>3)
6962 {
6963 e_03 = E[0+bs*3];
6964 e_13 = E[1+bs*3];
6965 e_23 = E[2+bs*3];
6966 e_33 = inv_diag_E[3];
6967 CC[3+bs*0] *= e_33;
6968 CC[3+bs*1] *= e_33;
6969 CC[3+bs*2] *= e_33;
6970 CC[3+bs*3] *= e_33;
6971 CC[0+bs*0] -= e_03 * CC[3+bs*0];
6972 CC[0+bs*1] -= e_03 * CC[3+bs*1];
6973 CC[0+bs*2] -= e_03 * CC[3+bs*2];
6974 CC[0+bs*3] -= e_03 * CC[3+bs*3];
6975 CC[1+bs*0] -= e_13 * CC[3+bs*0];
6976 CC[1+bs*1] -= e_13 * CC[3+bs*1];
6977 CC[1+bs*2] -= e_13 * CC[3+bs*2];
6978 CC[1+bs*3] -= e_13 * CC[3+bs*3];
6979 CC[2+bs*0] -= e_23 * CC[3+bs*0];
6980 CC[2+bs*1] -= e_23 * CC[3+bs*1];
6981 CC[2+bs*2] -= e_23 * CC[3+bs*2];
6982 CC[2+bs*3] -= e_23 * CC[3+bs*3];
6983 }
6984
6985 if(km>2)
6986 {
6987 e_02 = E[0+bs*2];
6988 e_12 = E[1+bs*2];
6989 e_22 = inv_diag_E[2];
6990 CC[2+bs*0] *= e_22;
6991 CC[2+bs*1] *= e_22;
6992 CC[2+bs*2] *= e_22;
6993 CC[2+bs*3] *= e_22;
6994 CC[0+bs*0] -= e_02 * CC[2+bs*0];
6995 CC[0+bs*1] -= e_02 * CC[2+bs*1];
6996 CC[0+bs*2] -= e_02 * CC[2+bs*2];
6997 CC[0+bs*3] -= e_02 * CC[2+bs*3];
6998 CC[1+bs*0] -= e_12 * CC[2+bs*0];
6999 CC[1+bs*1] -= e_12 * CC[2+bs*1];
7000 CC[1+bs*2] -= e_12 * CC[2+bs*2];
7001 CC[1+bs*3] -= e_12 * CC[2+bs*3];
7002 }
7003
7004 if(km>1)
7005 {
7006 e_01 = E[0+bs*1];
7007 e_11 = inv_diag_E[1];
7008 CC[1+bs*0] *= e_11;
7009 CC[1+bs*1] *= e_11;
7010 CC[1+bs*2] *= e_11;
7011 CC[1+bs*3] *= e_11;
7012 CC[0+bs*0] -= e_01 * CC[1+bs*0];
7013 CC[0+bs*1] -= e_01 * CC[1+bs*1];
7014 CC[0+bs*2] -= e_01 * CC[1+bs*2];
7015 CC[0+bs*3] -= e_01 * CC[1+bs*3];
7016 }
7017
7018 e_00 = inv_diag_E[0];
7019 CC[0+bs*0] *= e_00;
7020 CC[0+bs*1] *= e_00;
7021 CC[0+bs*2] *= e_00;
7022 CC[0+bs*3] *= e_00;
7023
7024 store:
7025
7026 if(km>=4)
7027 {
7028 D[0+bs*0] = CC[0+bs*0];
7029 D[1+bs*0] = CC[1+bs*0];
7030 D[2+bs*0] = CC[2+bs*0];
7031 D[3+bs*0] = CC[3+bs*0];
7032
7033 if(kn==1)
7034 return;
7035
7036 D[0+bs*1] = CC[0+bs*1];
7037 D[1+bs*1] = CC[1+bs*1];
7038 D[2+bs*1] = CC[2+bs*1];
7039 D[3+bs*1] = CC[3+bs*1];
7040
7041 if(kn==2)
7042 return;
7043
7044 D[0+bs*2] = CC[0+bs*2];
7045 D[1+bs*2] = CC[1+bs*2];
7046 D[2+bs*2] = CC[2+bs*2];
7047 D[3+bs*2] = CC[3+bs*2];
7048
7049 if(kn==3)
7050 return;
7051
7052 D[0+bs*3] = CC[0+bs*3];
7053 D[1+bs*3] = CC[1+bs*3];
7054 D[2+bs*3] = CC[2+bs*3];
7055 D[3+bs*3] = CC[3+bs*3];
7056 }
7057 else if(km>=3)
7058 {
7059 D[0+bs*0] = CC[0+bs*0];
7060 D[1+bs*0] = CC[1+bs*0];
7061 D[2+bs*0] = CC[2+bs*0];
7062
7063 if(kn==1)
7064 return;
7065
7066 D[0+bs*1] = CC[0+bs*1];
7067 D[1+bs*1] = CC[1+bs*1];
7068 D[2+bs*1] = CC[2+bs*1];
7069
7070 if(kn==2)
7071 return;
7072
7073 D[0+bs*2] = CC[0+bs*2];
7074 D[1+bs*2] = CC[1+bs*2];
7075 D[2+bs*2] = CC[2+bs*2];
7076
7077 if(kn==3)
7078 return;
7079
7080 D[0+bs*3] = CC[0+bs*3];
7081 D[1+bs*3] = CC[1+bs*3];
7082 D[2+bs*3] = CC[2+bs*3];
7083 }
7084 else if(km>=2)
7085 {
7086 D[0+bs*0] = CC[0+bs*0];
7087 D[1+bs*0] = CC[1+bs*0];
7088
7089 if(kn==1)
7090 return;
7091
7092 D[0+bs*1] = CC[0+bs*1];
7093 D[1+bs*1] = CC[1+bs*1];
7094
7095 if(kn==2)
7096 return;
7097
7098 D[0+bs*2] = CC[0+bs*2];
7099 D[1+bs*2] = CC[1+bs*2];
7100
7101 if(kn==3)
7102 return;
7103
7104 D[0+bs*3] = CC[0+bs*3];
7105 D[1+bs*3] = CC[1+bs*3];
7106 }
7107 else //if(km>=1)
7108 {
7109 D[0+bs*0] = CC[0+bs*0];
7110
7111 if(kn==1)
7112 return;
7113
7114 D[0+bs*1] = CC[0+bs*1];
7115
7116 if(kn==2)
7117 return;
7118
7119 D[0+bs*2] = CC[0+bs*2];
7120
7121 if(kn==3)
7122 return;
7123
7124 D[0+bs*3] = CC[0+bs*3];
7125 }
7126
7127 return;
7128
7129 }
7130 #endif
7131
7132
7133
7134
7135
7136 #if defined(BLAS_API)
7137
7138 #include "kernel_sgemm_4x4_lib.c"
7139
7140 #endif
7141
7142