1 /**************************************************************************************************
2 *                                                                                                 *
3 * This file is part of BLASFEO.                                                                   *
4 *                                                                                                 *
5 * BLASFEO -- BLAS For Embedded Optimization.                                                      *
6 * Copyright (C) 2019 by Gianluca Frison.                                                          *
7 * Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
8 * All rights reserved.                                                                            *
9 *                                                                                                 *
10 * The 2-Clause BSD License                                                                        *
11 *                                                                                                 *
12 * Redistribution and use in source and binary forms, with or without                              *
13 * modification, are permitted provided that the following conditions are met:                     *
14 *                                                                                                 *
15 * 1. Redistributions of source code must retain the above copyright notice, this                  *
16 *    list of conditions and the following disclaimer.                                             *
17 * 2. Redistributions in binary form must reproduce the above copyright notice,                    *
18 *    this list of conditions and the following disclaimer in the documentation                    *
19 *    and/or other materials provided with the distribution.                                       *
20 *                                                                                                 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND                 *
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED                   *
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE                          *
24 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR                 *
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES                  *
26 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;                    *
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND                     *
28 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT                      *
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS                   *
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                                    *
31 *                                                                                                 *
32 * Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de                             *
33 *                                                                                                 *
34 **************************************************************************************************/
35 
36 
37 
38 // B is the diagonal of a matrix, case beta=0.0
39 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Computes D = A * diag(alpha[0] * B) on a block that is kmax rows by 4 columns
// (the beta = 0 case: no C operand is read).
//
// kmax   number of rows to process; nothing is read or written when kmax <= 0
// alpha  pointer to the scalar factor; only alpha[0] is read
// A      input block; rows are grouped four at a time, element (i,j) of a group
//        sits at A[i+4*j] and consecutive groups are 4*sda floats apart
// sda    group stride of A, in units of 4 floats
// B      the 4 diagonal entries, B[0] .. B[3]
// D      output block, same layout as A with group stride sdd
// sdd    group stride of D, in units of 4 floats
void kernel_sgemm_diag_right_4_a0_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *D, int sdd)
	{

	if(kmax<=0)
		return;

	const int bs = 4;

	int k, i, j;

	float scale[4]; // alpha[0] * B[j]: the factor applied to column j
	float col[4];   // one computed column of the current 4-row group

	const float alpha0 = alpha[0];

	for(j=0; j<4; j++)
		scale[j] = alpha0 * B[j];

	// full groups of 4 rows
	for(k=0; k<kmax-3; k+=4)
		{
		for(j=0; j<4; j++)
			{
			// a column is fully computed before any of it is stored
			for(i=0; i<bs; i++)
				col[i] = A[i+bs*j] * scale[j];
			for(i=0; i<bs; i++)
				D[i+bs*j] = col[i];
			}
		A += bs*sda;
		D += bs*sdd;
		}

	// left-over rows (at most 3), one row of the last group at a time
	for(; k<kmax; k++)
		{
		for(j=0; j<4; j++)
			D[bs*j] = A[bs*j] * scale[j];
		A += 1;
		D += 1;
		}

	}
170 #endif
171 
172 
173 
174 // B is the diagonal of a matrix
175 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Computes D = beta[0] * C + A * diag(alpha[0] * B) on a block that is kmax rows
// by 4 columns.
//
// kmax   number of rows to process; nothing is read or written when kmax <= 0
// alpha  pointer to the scalar applied to the product; only alpha[0] is read
// A      input block; rows are grouped four at a time, element (i,j) of a group
//        sits at A[i+4*j] and consecutive groups are 4*sda floats apart
// sda    group stride of A, in units of 4 floats
// B      the 4 diagonal entries, B[0] .. B[3]
// beta   pointer to the scalar applied to C; only beta[0] is read
// C      input block, same layout as A with group stride sdc
// sdc    group stride of C, in units of 4 floats
// D      output block, same layout as A with group stride sdd
// sdd    group stride of D, in units of 4 floats
void kernel_sgemm_diag_right_4_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
	{

	if(kmax<=0)
		return;

	const int bs = 4;

	int k, i, j;

	float scale[4]; // alpha[0] * B[j]: the factor applied to column j
	float col[4];   // one computed column of the current 4-row group

	const float alpha0 = alpha[0];
	const float beta0  = beta[0];

	for(j=0; j<4; j++)
		scale[j] = alpha0 * B[j];

	// full groups of 4 rows
	for(k=0; k<kmax-3; k+=4)
		{
		for(j=0; j<4; j++)
			{
			// a column is fully computed before any of it is stored
			for(i=0; i<bs; i++)
				col[i] = beta0 * C[i+bs*j] + A[i+bs*j] * scale[j];
			for(i=0; i<bs; i++)
				D[i+bs*j] = col[i];
			}
		A += bs*sda;
		C += bs*sdc;
		D += bs*sdd;
		}

	// left-over rows (at most 3), one row of the last group at a time
	for(; k<kmax; k++)
		{
		for(j=0; j<4; j++)
			D[bs*j] = beta0 * C[bs*j] + A[bs*j] * scale[j];
		A += 1;
		C += 1;
		D += 1;
		}

	}
309 #endif
310 
311 
312 
313 // B is the diagonal of a matrix
314 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Computes D = beta[0] * C + A * diag(alpha[0] * B) on a block that is kmax rows
// by 3 columns; the fourth column of each 4-row group is neither read nor written.
//
// kmax   number of rows to process; nothing is read or written when kmax <= 0
// alpha  pointer to the scalar applied to the product; only alpha[0] is read
// A      input block; rows are grouped four at a time, element (i,j) of a group
//        sits at A[i+4*j] and consecutive groups are 4*sda floats apart
// sda    group stride of A, in units of 4 floats
// B      the 3 diagonal entries, B[0] .. B[2]
// beta   pointer to the scalar applied to C; only beta[0] is read
// C      input block, same layout as A with group stride sdc
// sdc    group stride of C, in units of 4 floats
// D      output block, same layout as A with group stride sdd
// sdd    group stride of D, in units of 4 floats
void kernel_sgemm_diag_right_3_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
	{

	if(kmax<=0)
		return;

	const int bs = 4;

	int k, i, j;

	float scale[3]; // alpha[0] * B[j]: the factor applied to column j
	float col[4];   // one computed column of the current 4-row group

	const float alpha0 = alpha[0];
	const float beta0  = beta[0];

	for(j=0; j<3; j++)
		scale[j] = alpha0 * B[j];

	// full groups of 4 rows
	for(k=0; k<kmax-3; k+=4)
		{
		for(j=0; j<3; j++)
			{
			// a column is fully computed before any of it is stored
			for(i=0; i<bs; i++)
				col[i] = beta0 * C[i+bs*j] + A[i+bs*j] * scale[j];
			for(i=0; i<bs; i++)
				D[i+bs*j] = col[i];
			}
		A += bs*sda;
		C += bs*sdc;
		D += bs*sdd;
		}

	// left-over rows (at most 3), one row of the last group at a time
	for(; k<kmax; k++)
		{
		for(j=0; j<3; j++)
			D[bs*j] = beta0 * C[bs*j] + A[bs*j] * scale[j];
		A += 1;
		C += 1;
		D += 1;
		}

	}
425 #endif
426 
427 
428 
429 // B is the diagonal of a matrix
430 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Computes D = beta[0] * C + A * diag(alpha[0] * B) on a block that is kmax rows
// by 2 columns; columns 2 and 3 of each 4-row group are neither read nor written.
//
// kmax   number of rows to process; nothing is read or written when kmax <= 0
// alpha  pointer to the scalar applied to the product; only alpha[0] is read
// A      input block; rows are grouped four at a time, element (i,j) of a group
//        sits at A[i+4*j] and consecutive groups are 4*sda floats apart
// sda    group stride of A, in units of 4 floats
// B      the 2 diagonal entries, B[0] and B[1]
// beta   pointer to the scalar applied to C; only beta[0] is read
// C      input block, same layout as A with group stride sdc
// sdc    group stride of C, in units of 4 floats
// D      output block, same layout as A with group stride sdd
// sdd    group stride of D, in units of 4 floats
void kernel_sgemm_diag_right_2_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
	{

	if(kmax<=0)
		return;

	const int bs = 4;

	int k, i, j;

	float scale[2]; // alpha[0] * B[j]: the factor applied to column j
	float col[4];   // one computed column of the current 4-row group

	const float alpha0 = alpha[0];
	const float beta0  = beta[0];

	for(j=0; j<2; j++)
		scale[j] = alpha0 * B[j];

	// full groups of 4 rows
	for(k=0; k<kmax-3; k+=4)
		{
		for(j=0; j<2; j++)
			{
			// a column is fully computed before any of it is stored
			for(i=0; i<bs; i++)
				col[i] = beta0 * C[i+bs*j] + A[i+bs*j] * scale[j];
			for(i=0; i<bs; i++)
				D[i+bs*j] = col[i];
			}
		A += bs*sda;
		C += bs*sdc;
		D += bs*sdd;
		}

	// left-over rows (at most 3), one row of the last group at a time
	for(; k<kmax; k++)
		{
		for(j=0; j<2; j++)
			D[bs*j] = beta0 * C[bs*j] + A[bs*j] * scale[j];
		A += 1;
		C += 1;
		D += 1;
		}

	}
517 #endif
518 
519 
520 
521 // B is the diagonal of a matrix
522 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Computes D = beta[0] * C + A * (alpha[0] * B[0]) on a block that is kmax rows
// by 1 column; columns 1 to 3 of each 4-row group are neither read nor written.
//
// kmax   number of rows to process; nothing is read or written when kmax <= 0
// alpha  pointer to the scalar applied to the product; only alpha[0] is read
// A      input column; rows are grouped four at a time at A[0] .. A[3], and
//        consecutive groups are 4*sda floats apart
// sda    group stride of A, in units of 4 floats
// B      the single diagonal entry, B[0]
// beta   pointer to the scalar applied to C; only beta[0] is read
// C      input column, same layout as A with group stride sdc
// sdc    group stride of C, in units of 4 floats
// D      output column, same layout as A with group stride sdd
// sdd    group stride of D, in units of 4 floats
void kernel_sgemm_diag_right_1_lib4(int kmax, float *alpha, float *A, int sda, float *B, float *beta, float *C, int sdc, float *D, int sdd)
	{

	if(kmax<=0)
		return;

	const int bs = 4;

	int k, i;

	float col[4]; // the computed column of the current 4-row group

	const float alpha0 = alpha[0];
	const float beta0  = beta[0];

	const float scale = alpha0 * B[0];

	// full groups of 4 rows
	for(k=0; k<kmax-3; k+=4)
		{
		// the column is fully computed before any of it is stored
		for(i=0; i<bs; i++)
			col[i] = beta0 * C[i] + A[i] * scale;
		for(i=0; i<bs; i++)
			D[i] = col[i];
		A += bs*sda;
		C += bs*sdc;
		D += bs*sdd;
		}

	// left-over rows (at most 3), one row of the last group at a time
	for(; k<kmax; k++)
		{
		D[0] = beta0 * C[0] + A[0] * scale;
		A += 1;
		C += 1;
		D += 1;
		}

	}
585 #endif
586 
587 
588 
589 // A is the diagonal of a matrix, case beta=0.0
590 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Computes D = diag(alpha[0] * A) * B on a block that is 4 rows by kmax columns
// (the beta = 0 case: no C operand is read).
//
// kmax   number of columns to process; nothing is read or written when kmax <= 0
// alpha  pointer to the scalar factor; only alpha[0] is read
// A      the 4 diagonal entries, A[0] .. A[3]; entry i scales row i
// B      input block, column k stored at B[4*k] .. B[4*k+3]
// D      output block, same layout as B
void kernel_sgemm_diag_left_4_a0_lib4(int kmax, float *alpha, float *A, float *B, float *D)
	{

	if(kmax<=0)
		return;

	const int bs = 4;

	int k, i;

	float scale[4]; // alpha[0] * A[i]: the factor applied to row i
	float col[4];   // one computed column

	const float alpha0 = alpha[0];

	for(i=0; i<4; i++)
		scale[i] = alpha0 * A[i];

	// every column gets the same treatment, so no separate remainder pass is needed
	for(k=0; k<kmax; k++)
		{
		// a column is fully computed before any of it is stored
		for(i=0; i<4; i++)
			col[i] = scale[i] * B[i];
		for(i=0; i<4; i++)
			D[i] = col[i];
		B += bs;
		D += bs;
		}

	}
708 #endif
709 
710 
711 
712 // A is the diagonal of a matrix
713 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Computes D = beta[0] * C + diag(alpha[0] * A) * B on a block that is 4 rows by
// kmax columns.
//
// kmax   number of columns to process; nothing is read or written when kmax <= 0
// alpha  pointer to the scalar applied to the product; only alpha[0] is read
// A      the 4 diagonal entries, A[0] .. A[3]; entry i scales row i
// B      input block, column k stored at B[4*k] .. B[4*k+3]
// beta   pointer to the scalar applied to C; only beta[0] is read
// C      input block, same layout as B
// D      output block, same layout as B
void kernel_sgemm_diag_left_4_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
	{

	if(kmax<=0)
		return;

	const int bs = 4;

	int k, i;

	float scale[4]; // alpha[0] * A[i]: the factor applied to row i
	float col[4];   // one computed column

	const float alpha0 = alpha[0];
	const float beta0  = beta[0];

	for(i=0; i<4; i++)
		scale[i] = alpha0 * A[i];

	// every column gets the same treatment, so no separate remainder pass is needed
	for(k=0; k<kmax; k++)
		{
		// a column is fully computed before any of it is stored
		for(i=0; i<4; i++)
			col[i] = beta0 * C[i] + scale[i] * B[i];
		for(i=0; i<4; i++)
			D[i] = col[i];
		B += bs;
		C += bs;
		D += bs;
		}

	}
834 #endif
835 
836 
837 
838 // A is the diagonal of a matrix
839 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Computes D = beta[0] * C + diag(alpha[0] * A) * B on a block that is 3 rows by
// kmax columns; the fourth slot of every 4-float column is neither read nor written.
//
// kmax   number of columns to process; nothing is read or written when kmax <= 0
// alpha  pointer to the scalar applied to the product; only alpha[0] is read
// A      the 3 diagonal entries, A[0] .. A[2]; entry i scales row i
// B      input block, column k stored at B[4*k] .. B[4*k+2]
// beta   pointer to the scalar applied to C; only beta[0] is read
// C      input block, same layout as B
// D      output block, same layout as B
void kernel_sgemm_diag_left_3_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
	{

	if(kmax<=0)
		return;

	const int bs = 4;

	int k, i;

	float scale[3]; // alpha[0] * A[i]: the factor applied to row i
	float col[3];   // one computed column

	const float alpha0 = alpha[0];
	const float beta0  = beta[0];

	for(i=0; i<3; i++)
		scale[i] = alpha0 * A[i];

	// every column gets the same treatment, so no separate remainder pass is needed
	for(k=0; k<kmax; k++)
		{
		// a column is fully computed before any of it is stored
		for(i=0; i<3; i++)
			col[i] = beta0 * C[i] + scale[i] * B[i];
		for(i=0; i<3; i++)
			D[i] = col[i];
		B += bs;
		C += bs;
		D += bs;
		}

	}
944 #endif
945 
946 
947 
948 // A is the diagonal of a matrix
949 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Computes D = beta[0] * C + diag(alpha[0] * A) * B on a block that is 2 rows by
// kmax columns; slots 2 and 3 of every 4-float column are neither read nor written.
//
// kmax   number of columns to process; nothing is read or written when kmax <= 0
// alpha  pointer to the scalar applied to the product; only alpha[0] is read
// A      the 2 diagonal entries, A[0] and A[1]; entry i scales row i
// B      input block, column k stored at B[4*k] and B[4*k+1]
// beta   pointer to the scalar applied to C; only beta[0] is read
// C      input block, same layout as B
// D      output block, same layout as B
void kernel_sgemm_diag_left_2_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
	{

	if(kmax<=0)
		return;

	const int bs = 4;

	int k, i;

	float scale[2]; // alpha[0] * A[i]: the factor applied to row i
	float col[2];   // one computed column

	const float alpha0 = alpha[0];
	const float beta0  = beta[0];

	for(i=0; i<2; i++)
		scale[i] = alpha0 * A[i];

	// every column gets the same treatment, so no separate remainder pass is needed
	for(k=0; k<kmax; k++)
		{
		// a column is fully computed before any of it is stored
		for(i=0; i<2; i++)
			col[i] = beta0 * C[i] + scale[i] * B[i];
		for(i=0; i<2; i++)
			D[i] = col[i];
		B += bs;
		C += bs;
		D += bs;
		}

	}
1038 #endif
1039 
1040 
1041 
1042 // A is the diagonal of a matrix
1043 #if defined(TARGET_GENERIC) || defined(TARGET_X86_AMD_BARCELONA) || defined(TARGET_X86_AMD_JAGUAR) || defined(TARGET_X64_INTEL_HASWELL) || defined(TARGET_X64_INTEL_SANDY_BRIDGE) || defined(TARGET_X64_INTEL_CORE) || defined(TARGET_X64_AMD_BULLDOZER) || defined(TARGET_ARMV7A_ARM_CORTEX_A15) || defined(TARGET_ARMV7A_ARM_CORTEX_A7) || defined(TARGET_ARMV7A_ARM_CORTEX_A9) || defined(TARGET_ARMV8A_ARM_CORTEX_A57) || defined(TARGET_ARMV8A_ARM_CORTEX_A53)
// Computes D = beta[0] * C + (alpha[0] * A[0]) * B on a single row of kmax
// elements; only the first slot of every 4-float column is read or written.
//
// kmax   number of columns to process; nothing is read or written when kmax <= 0
// alpha  pointer to the scalar applied to the product; only alpha[0] is read
// A      the single diagonal entry, A[0]
// B      input row, element k stored at B[4*k]
// beta   pointer to the scalar applied to C; only beta[0] is read
// C      input row, same layout as B
// D      output row, same layout as B
void kernel_sgemm_diag_left_1_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)
	{

	if(kmax<=0)
		return;

	const int bs = 4;

	int k;

	const float alpha0 = alpha[0];
	const float beta0  = beta[0];

	const float scale = alpha0 * A[0];

	// every column gets the same treatment, so no separate remainder pass is needed
	for(k=0; k<kmax; k++)
		{
		D[0] = beta0 * C[0] + scale * B[0];
		B += bs;
		C += bs;
		D += bs;
		}

	}
1116 #endif
1117 
1118 
1119 
1120