1 /**************************************************************************************************
2 * *
3 * This file is part of BLASFEO. *
4 * *
5 * BLASFEO -- BLAS For Embedded Optimization. *
6 * Copyright (C) 2019 by Gianluca Frison. *
7 * Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. *
8 * All rights reserved. *
9 * *
10 * The 2-Clause BSD License *
11 * *
12 * Redistribution and use in source and binary forms, with or without *
13 * modification, are permitted provided that the following conditions are met: *
14 * *
15 * 1. Redistributions of source code must retain the above copyright notice, this *
16 * list of conditions and the following disclaimer. *
17 * 2. Redistributions in binary form must reproduce the above copyright notice, *
18 * this list of conditions and the following disclaimer in the documentation *
19 * and/or other materials provided with the distribution. *
20 * *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND *
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED *
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE *
24 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR *
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES *
26 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; *
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND *
28 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT *
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
31 * *
32 * Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de *
33 * *
34 **************************************************************************************************/
35
36
37
38 // B is the diagonal of a matrix, case beta=0.0
39 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
/*
 * D = alpha * A * diag(B), with no C term (the "beta=0.0" variant).
 *
 * kmax   number of rows to process; nothing is done when kmax <= 0
 * alpha  pointer to the scalar factor (only alpha[0] is read)
 * A      input, 4 columns wide; rows are grouped four at a time, element
 *        (r, j) of a group is at offset r + 4*j, and successive groups are
 *        4*sda floats apart
 * B      the 4 diagonal entries, one per column
 * D      output, same layout as A with group stride 4*sdd
 */
void kernel_sgemm_diag_right_4_a0_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *D, int sdd)
{
	enum { PS = 4, NCOL = 4 };

	float scal[NCOL];	/* alpha * B[j], folded once up front */
	float a_col[PS];
	float prod[PS];
	int panels, tail;
	int p, r, j;

	if (kmax <= 0)
		return;

	for (j = 0; j < NCOL; j++)
		scal[j] = alpha[0] * B[j];

	panels = kmax / PS;
	tail = kmax % PS;

	/* complete groups of four rows, one column at a time */
	for (p = 0; p < panels; p++) {
		for (j = 0; j < NCOL; j++) {
			/* load the whole column, then compute, then store */
			for (r = 0; r < PS; r++)
				a_col[r] = A[r + PS * j];
			for (r = 0; r < PS; r++)
				prod[r] = a_col[r] * scal[j];
			for (r = 0; r < PS; r++)
				D[r + PS * j] = prod[r];
		}
		A += PS * sda;
		D += PS * sdd;
	}

	/* leftover rows (fewer than four) live in the next group, one float apart */
	while (tail-- > 0) {
		for (j = 0; j < NCOL; j++)
			D[PS * j] = A[PS * j] * scal[j];
		A += 1;
		D += 1;
	}
}
170 #endif
171
172
173
174 // B is the diagonal of a matrix
175 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
/*
 * D = beta * C + alpha * A * diag(B), 4 columns wide.
 *
 * kmax   number of rows to process; nothing is done when kmax <= 0
 * alpha  pointer to the scalar applied to A * diag(B) (alpha[0] is read)
 * A      input; rows are grouped four at a time, element (r, j) of a group
 *        is at offset r + 4*j, and successive groups are 4*sda floats apart
 * B      the 4 diagonal entries, one per column
 * beta   pointer to the scalar applied to C (beta[0] is read)
 * C      input, same layout as A with group stride 4*sdc
 * D      output, same layout as A with group stride 4*sdd
 */
void kernel_sgemm_diag_right_4_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
{
	enum { PS = 4, NCOL = 4 };

	float scal[NCOL];	/* alpha * B[j], folded once up front */
	float a_col[PS];
	float acc[PS];
	float bet;
	int panels, tail;
	int p, r, j;

	if (kmax <= 0)
		return;

	bet = beta[0];
	for (j = 0; j < NCOL; j++)
		scal[j] = alpha[0] * B[j];

	panels = kmax / PS;
	tail = kmax % PS;

	/* complete groups of four rows, one column at a time */
	for (p = 0; p < panels; p++) {
		for (j = 0; j < NCOL; j++) {
			/* load the whole column, then compute, then store */
			for (r = 0; r < PS; r++)
				a_col[r] = A[r + PS * j];
			for (r = 0; r < PS; r++)
				acc[r] = bet * C[r + PS * j] + a_col[r] * scal[j];
			for (r = 0; r < PS; r++)
				D[r + PS * j] = acc[r];
		}
		A += PS * sda;
		C += PS * sdc;
		D += PS * sdd;
	}

	/* leftover rows (fewer than four) live in the next group, one float apart */
	while (tail-- > 0) {
		for (j = 0; j < NCOL; j++)
			D[PS * j] = bet * C[PS * j] + A[PS * j] * scal[j];
		A += 1;
		C += 1;
		D += 1;
	}
}
309 #endif
310
311
312
313 // B is the diagonal of a matrix
314 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
/*
 * D = beta * C + alpha * A * diag(B), restricted to the first 3 columns.
 *
 * kmax   number of rows to process; nothing is done when kmax <= 0
 * alpha  pointer to the scalar applied to A * diag(B) (alpha[0] is read)
 * A      input; rows are grouped four at a time, element (r, j) of a group
 *        is at offset r + 4*j, and successive groups are 4*sda floats apart
 * B      the 3 diagonal entries, one per column
 * beta   pointer to the scalar applied to C (beta[0] is read)
 * C      input, same layout as A with group stride 4*sdc
 * D      output, same layout as A with group stride 4*sdd; the fourth
 *        column of each group is never written
 */
void kernel_sgemm_diag_right_3_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
{
	enum { PS = 4, NCOL = 3 };

	float scal[NCOL];	/* alpha * B[j], folded once up front */
	float a_col[PS];
	float acc[PS];
	float bet;
	int panels, tail;
	int p, r, j;

	if (kmax <= 0)
		return;

	bet = beta[0];
	for (j = 0; j < NCOL; j++)
		scal[j] = alpha[0] * B[j];

	panels = kmax / PS;
	tail = kmax % PS;

	/* complete groups of four rows, one column at a time */
	for (p = 0; p < panels; p++) {
		for (j = 0; j < NCOL; j++) {
			/* load the whole column, then compute, then store */
			for (r = 0; r < PS; r++)
				a_col[r] = A[r + PS * j];
			for (r = 0; r < PS; r++)
				acc[r] = bet * C[r + PS * j] + a_col[r] * scal[j];
			for (r = 0; r < PS; r++)
				D[r + PS * j] = acc[r];
		}
		A += PS * sda;
		C += PS * sdc;
		D += PS * sdd;
	}

	/* leftover rows (fewer than four) live in the next group, one float apart */
	while (tail-- > 0) {
		for (j = 0; j < NCOL; j++)
			D[PS * j] = bet * C[PS * j] + A[PS * j] * scal[j];
		A += 1;
		C += 1;
		D += 1;
	}
}
425 #endif
426
427
428
429 // B is the diagonal of a matrix
430 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
/*
 * D = beta * C + alpha * A * diag(B), restricted to the first 2 columns.
 *
 * kmax   number of rows to process; nothing is done when kmax <= 0
 * alpha  pointer to the scalar applied to A * diag(B) (alpha[0] is read)
 * A      input; rows are grouped four at a time, element (r, j) of a group
 *        is at offset r + 4*j, and successive groups are 4*sda floats apart
 * B      the 2 diagonal entries, one per column
 * beta   pointer to the scalar applied to C (beta[0] is read)
 * C      input, same layout as A with group stride 4*sdc
 * D      output, same layout as A with group stride 4*sdd; only the first
 *        two columns of each group are written
 */
void kernel_sgemm_diag_right_2_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
{
	enum { PS = 4, NCOL = 2 };

	float scal[NCOL];	/* alpha * B[j], folded once up front */
	float a_col[PS];
	float acc[PS];
	float bet;
	int panels, tail;
	int p, r, j;

	if (kmax <= 0)
		return;

	bet = beta[0];
	for (j = 0; j < NCOL; j++)
		scal[j] = alpha[0] * B[j];

	panels = kmax / PS;
	tail = kmax % PS;

	/* complete groups of four rows, one column at a time */
	for (p = 0; p < panels; p++) {
		for (j = 0; j < NCOL; j++) {
			/* load the whole column, then compute, then store */
			for (r = 0; r < PS; r++)
				a_col[r] = A[r + PS * j];
			for (r = 0; r < PS; r++)
				acc[r] = bet * C[r + PS * j] + a_col[r] * scal[j];
			for (r = 0; r < PS; r++)
				D[r + PS * j] = acc[r];
		}
		A += PS * sda;
		C += PS * sdc;
		D += PS * sdd;
	}

	/* leftover rows (fewer than four) live in the next group, one float apart */
	while (tail-- > 0) {
		for (j = 0; j < NCOL; j++)
			D[PS * j] = bet * C[PS * j] + A[PS * j] * scal[j];
		A += 1;
		C += 1;
		D += 1;
	}
}
517 #endif
518
519
520
521 // B is the diagonal of a matrix
522 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
/*
 * D = beta * C + alpha * A * diag(B), restricted to a single column.
 *
 * kmax   number of rows to process; nothing is done when kmax <= 0
 * alpha  pointer to the scalar applied to A * B[0] (alpha[0] is read)
 * A      input column; rows are grouped four at a time, row r of a group is
 *        at offset r, and successive groups are 4*sda floats apart
 * B      the single diagonal entry (B[0] is read)
 * beta   pointer to the scalar applied to C (beta[0] is read)
 * C      input, same layout as A with group stride 4*sdc
 * D      output, same layout as A with group stride 4*sdd
 */
void kernel_sgemm_diag_right_1_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
{
	enum { PS = 4 };

	float scal;		/* alpha * B[0], folded once up front */
	float bet;
	float a_col[PS];
	float acc[PS];
	int panels, tail;
	int p, r;

	if (kmax <= 0)
		return;

	scal = alpha[0] * B[0];
	bet = beta[0];

	panels = kmax / PS;
	tail = kmax % PS;

	/* complete groups of four rows: load, then compute, then store */
	for (p = 0; p < panels; p++) {
		for (r = 0; r < PS; r++)
			a_col[r] = A[r];
		for (r = 0; r < PS; r++)
			acc[r] = bet * C[r] + a_col[r] * scal;
		for (r = 0; r < PS; r++)
			D[r] = acc[r];
		A += PS * sda;
		C += PS * sdc;
		D += PS * sdd;
	}

	/* leftover rows (fewer than four) live in the next group, one float apart */
	while (tail-- > 0) {
		D[0] = bet * C[0] + A[0] * scal;
		A += 1;
		C += 1;
		D += 1;
	}
}
585 #endif
586
587
588
589 // A is the diagonal of a matrix, case beta=0.0
590 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
/*
 * D = alpha * diag(A) * B, with no C term (the "beta=0.0" variant).
 *
 * kmax   number of columns to process; nothing is done when kmax <= 0
 * alpha  pointer to the scalar factor (only alpha[0] is read)
 * A      the 4 diagonal entries, one per row
 * B      input, 4 rows per column, columns stored back to back
 *        (element (i, k) at offset i + 4*k)
 * D      output, same layout as B
 */
void kernel_sgemm_diag_left_4_a0_lib4(int kmax, float *alpha, float *A, float *B, float *D)
{
	enum { PS = 4, NROW = 4 };

	float diag[NROW];	/* alpha * A[i], folded once up front */
	float b_col[NROW];
	float prod[NROW];
	int col, i;

	if (kmax <= 0)
		return;

	for (i = 0; i < NROW; i++)
		diag[i] = alpha[0] * A[i];

	/*
	 * Columns are contiguous, so a single pass over all kmax columns covers
	 * both the groups of four and the remainder with the same access order.
	 */
	for (col = 0; col < kmax; col++) {
		/* load the whole column, then compute, then store */
		for (i = 0; i < NROW; i++)
			b_col[i] = B[i];
		for (i = 0; i < NROW; i++)
			prod[i] = diag[i] * b_col[i];
		for (i = 0; i < NROW; i++)
			D[i] = prod[i];
		B += PS;
		D += PS;
	}
}
708 #endif
709
710
711
712 // A is the diagonal of a matrix
713 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
/*
 * D = beta * C + alpha * diag(A) * B, 4 rows tall.
 *
 * kmax   number of columns to process; nothing is done when kmax <= 0
 * alpha  pointer to the scalar applied to diag(A) * B (alpha[0] is read)
 * A      the 4 diagonal entries, one per row
 * B      input, 4 rows per column, columns stored back to back
 *        (element (i, k) at offset i + 4*k)
 * beta   pointer to the scalar applied to C (beta[0] is read)
 * C      input, same layout as B
 * D      output, same layout as B
 */
void kernel_sgemm_diag_left_4_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
{
	enum { PS = 4, NROW = 4 };

	float diag[NROW];	/* alpha * A[i], folded once up front */
	float b_col[NROW];
	float acc[NROW];
	float bet;
	int col, i;

	if (kmax <= 0)
		return;

	bet = beta[0];
	for (i = 0; i < NROW; i++)
		diag[i] = alpha[0] * A[i];

	/*
	 * Columns are contiguous, so a single pass over all kmax columns covers
	 * both the groups of four and the remainder with the same access order.
	 */
	for (col = 0; col < kmax; col++) {
		/* load the whole column, then compute, then store */
		for (i = 0; i < NROW; i++)
			b_col[i] = B[i];
		for (i = 0; i < NROW; i++)
			acc[i] = bet * C[i] + diag[i] * b_col[i];
		for (i = 0; i < NROW; i++)
			D[i] = acc[i];
		B += PS;
		C += PS;
		D += PS;
	}
}
834 #endif
835
836
837
838 // A is the diagonal of a matrix
839 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
/*
 * D = beta * C + alpha * diag(A) * B, restricted to the first 3 rows.
 *
 * kmax   number of columns to process; nothing is done when kmax <= 0
 * alpha  pointer to the scalar applied to diag(A) * B (alpha[0] is read)
 * A      the 3 diagonal entries, one per row
 * B      input; columns are 4 floats apart (element (i, k) at offset i + 4*k)
 * beta   pointer to the scalar applied to C (beta[0] is read)
 * C      input, same layout as B
 * D      output, same layout as B; the fourth slot of each column is never
 *        written
 */
void kernel_sgemm_diag_left_3_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
{
	enum { PS = 4, NROW = 3 };

	float diag[NROW];	/* alpha * A[i], folded once up front */
	float b_col[NROW];
	float acc[NROW];
	float bet;
	int col, i;

	if (kmax <= 0)
		return;

	bet = beta[0];
	for (i = 0; i < NROW; i++)
		diag[i] = alpha[0] * A[i];

	/*
	 * Columns are contiguous, so a single pass over all kmax columns covers
	 * both the groups of four and the remainder with the same access order.
	 */
	for (col = 0; col < kmax; col++) {
		/* load the column's live rows, then compute, then store */
		for (i = 0; i < NROW; i++)
			b_col[i] = B[i];
		for (i = 0; i < NROW; i++)
			acc[i] = bet * C[i] + diag[i] * b_col[i];
		for (i = 0; i < NROW; i++)
			D[i] = acc[i];
		B += PS;
		C += PS;
		D += PS;
	}
}
944 #endif
945
946
947
948 // A is the diagonal of a matrix
949 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
/*
 * D = beta * C + alpha * diag(A) * B, restricted to the first 2 rows.
 *
 * kmax   number of columns to process; nothing is done when kmax <= 0
 * alpha  pointer to the scalar applied to diag(A) * B (alpha[0] is read)
 * A      the 2 diagonal entries, one per row
 * B      input; columns are 4 floats apart (element (i, k) at offset i + 4*k)
 * beta   pointer to the scalar applied to C (beta[0] is read)
 * C      input, same layout as B
 * D      output, same layout as B; only the first two slots of each column
 *        are written
 */
void kernel_sgemm_diag_left_2_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
{
	enum { PS = 4, NROW = 2 };

	float diag[NROW];	/* alpha * A[i], folded once up front */
	float b_col[NROW];
	float acc[NROW];
	float bet;
	int col, i;

	if (kmax <= 0)
		return;

	bet = beta[0];
	for (i = 0; i < NROW; i++)
		diag[i] = alpha[0] * A[i];

	/*
	 * Columns are contiguous, so a single pass over all kmax columns covers
	 * both the groups of four and the remainder with the same access order.
	 */
	for (col = 0; col < kmax; col++) {
		/* load the column's live rows, then compute, then store */
		for (i = 0; i < NROW; i++)
			b_col[i] = B[i];
		for (i = 0; i < NROW; i++)
			acc[i] = bet * C[i] + diag[i] * b_col[i];
		for (i = 0; i < NROW; i++)
			D[i] = acc[i];
		B += PS;
		C += PS;
		D += PS;
	}
}
1038 #endif
1039
1040
1041
1042 // A is the diagonal of a matrix
1043 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
/*
 * D = beta * C + alpha * diag(A) * B, restricted to a single row.
 *
 * kmax   number of columns to process; nothing is done when kmax <= 0
 * alpha  pointer to the scalar applied to A[0] * B (alpha[0] is read)
 * A      the single diagonal entry (A[0] is read)
 * B      input row; consecutive columns are 4 floats apart
 * beta   pointer to the scalar applied to C (beta[0] is read)
 * C      input, same layout as B
 * D      output, same layout as B; only the first slot of each column is
 *        written
 */
void kernel_sgemm_diag_left_1_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
{
	enum { PS = 4 };

	float diag;		/* alpha * A[0], folded once up front */
	float bet;
	int col;

	if (kmax <= 0)
		return;

	diag = alpha[0] * A[0];
	bet = beta[0];

	/*
	 * Columns are contiguous, so a single pass over all kmax columns covers
	 * both the groups of four and the remainder with the same access order.
	 */
	for (col = 0; col < kmax; col++) {
		D[0] = bet * C[0] + diag * B[0];
		B += PS;
		C += PS;
		D += PS;
	}
}
1116 #endif
1117
1118
1119
1120