1/**************************************************************************************************
2*                                                                                                 *
3* This file is part of BLASFEO.                                                                   *
4*                                                                                                 *
5* BLASFEO -- BLAS For Embedded Optimization.                                                      *
6* Copyright (C) 2019 by Gianluca Frison.                                                          *
7* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
8* All rights reserved.                                                                            *
9*                                                                                                 *
10* The 2-Clause BSD License                                                                        *
11*                                                                                                 *
12* Redistribution and use in source and binary forms, with or without                              *
13* modification, are permitted provided that the following conditions are met:                     *
14*                                                                                                 *
15* 1. Redistributions of source code must retain the above copyright notice, this                  *
16*    list of conditions and the following disclaimer.                                             *
17* 2. Redistributions in binary form must reproduce the above copyright notice,                    *
18*    this list of conditions and the following disclaimer in the documentation                    *
19*    and/or other materials provided with the distribution.                                       *
20*                                                                                                 *
21* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND                 *
22* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED                   *
23* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE                          *
24* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR                 *
25* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES                  *
26* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;                    *
27* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND                     *
28* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT                      *
29* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS                   *
30* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                                    *
31*                                                                                                 *
32* Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de                             *
33*                                                                                                 *
34**************************************************************************************************/
35
36
37
38
39
// subroutine
//
// Multiplies the 8x4 single-precision accumulator held in q4-q11 by alpha
// and, unless beta compares equal to 0.0, adds beta times an 8x4 block
// read from C. C is not read at all when beta == 0.0.
//
// input arguments:
// r4   <- alpha (address of a float)
// r5   <- beta (address of a float)
// r6   <- C
// r7   <- ldc*sizeof(float)
//
// output arguments:
// q4-q7   <- alpha*acc (+ beta * the 4 floats at C + j*r7,      j=0..3)
// q8-q11  <- alpha*acc (+ beta * the 4 floats at C + j*r7 + 16, j=0..3)
//
// clobbers: s8, s9, s10 and the condition flags; when beta != 0.0 also
// q0, q1, r8, and r6 (left advanced by 4*r7)

#if MACRO_LEVEL>=1
	.macro INNER_SCALE_AB_8X4_LIB lc_zero
#else
	.align 3
99: // 8 zero bytes, read below as the constant 0.0f
	.word 0
	.word 0
//	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_scale_ab_8x4_lib, %function
inner_scale_ab_8x4_lib:
#elif defined(OS_MAC)
_inner_scale_ab_8x4_lib:
#endif
#endif

	// d4 = { alpha, beta }: d4[0] (s8) is alpha, d4[1] (s9) is beta
	vldr		s8, [r4]
	vldr		s9, [r5]

	// s10 = 0.0, taken from the literal named by the caller (macro form)
	// or from the local literal 99 above (subroutine form)
#if MACRO_LEVEL>=1
	vldr		s10, \lc_zero
#else
	vldr		s10, 99b
#endif

	// acc *= alpha; the beta-vs-zero compare is issued part-way through and
	// its flags are only transferred to the core after the last multiply
	vmul.f32	q4, q4, d4[0]
	vmul.f32	q5, q5, d4[0]
	vmul.f32	q6, q6, d4[0]
	vmul.f32	q7, q7, d4[0]
	vcmpe.f32	s9, s10
	vmul.f32	q8, q8, d4[0]
	vmul.f32	q9, q9, d4[0]
	vmul.f32	q10, q10, d4[0]
	vmul.f32	q11, q11, d4[0]
	vmrs		APSR_nzcv, fpscr

	// beta == 0.0: nothing to add, skip every access to C
	beq		0f

	// r6 walks the first 4 floats of each C stride step, r8 the next 4
	add		r8, r6, #16

	// step 0
	vld1.64		{d0-d1}, [r6], r7
	vld1.64		{d2-d3}, [r8], r7
	vmla.f32	q4, q0, d4[1]
	vmla.f32	q8, q1, d4[1]

	// step 1
	vld1.64		{d0-d1}, [r6], r7
	vld1.64		{d2-d3}, [r8], r7
	vmla.f32	q5, q0, d4[1]
	vmla.f32	q9, q1, d4[1]

	// step 2
	vld1.64		{d0-d1}, [r6], r7
	vld1.64		{d2-d3}, [r8], r7
	vmla.f32	q6, q0, d4[1]
	vmla.f32	q10, q1, d4[1]

	// step 3
	vld1.64		{d0-d1}, [r6], r7
	vld1.64		{d2-d3}, [r8], r7
	vmla.f32	q7, q0, d4[1]
	vmla.f32	q11, q1, d4[1]

0: // end

#if MACRO_LEVEL>=1
	.endm
#else
	mov		pc, lr // return

#if defined(OS_LINUX)
	.size	inner_scale_ab_8x4_lib, .-inner_scale_ab_8x4_lib
#endif
#endif
118
119
120
121
122
// subroutine
//
// Stores the 8x4 single-precision accumulator held in q4-q11 to D.
// For j = 0..3, q(4+j) is written to the 16 bytes at D + j*r5 and
// q(8+j) to the 16 bytes that follow them (D + j*r5 + 16).
//
// input arguments:
// r4   <- D
// r5   <- ldd*sizeof(float)
//
// output arguments:
// none in registers; the result is the memory written through D
//
// clobbers: r4 (left advanced by 4*r5) and r6

#if MACRO_LEVEL>=1
	.macro INNER_STORE_8X4_LIB
#else
//	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_store_8x4_lib, %function
inner_store_8x4_lib:
#elif defined(OS_MAC)
// the label must match the `bl _inner_store_8x4_lib` issued by the kernel
_inner_store_8x4_lib:
#endif
#endif

	// r4 walks the first 4 floats of each stride step, r6 the next 4
	add		r6, r4, #16

	vst1.64		{d8, d9}, [r4], r5 // q4
	vst1.64		{d16, d17}, [r6], r5 // q8
	vst1.64		{d10, d11}, [r4], r5 // q5
	vst1.64		{d18, d19}, [r6], r5 // q9
	vst1.64		{d12, d13}, [r4], r5 // q6
	vst1.64		{d20, d21}, [r6], r5 // q10
	vst1.64		{d14, d15}, [r4], r5 // q7
	vst1.64		{d22, d23}, [r6], r5 // q11

#if MACRO_LEVEL>=1
	.endm
#else
	mov		pc, lr // return

#if defined(OS_LINUX)
	.size	inner_store_8x4_lib, .-inner_store_8x4_lib
#endif
#endif
163
164
165
166
167
168	.align 3
16999: // 0
170	.word 0
171	.word 0
172
173
174
175
176//                               r0        r1             r2         r3       sp+0       sp+4          sp+8       sp+12    sp+16   sp+20
177// void kernel_sgemm_nt_8x4_lib44cc(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int ldc, double *D, int ldd)
178
179//	.p2align 4,,15
180#if defined(OS_LINUX)
181	.global	kernel_sgemm_nt_8x4_lib44cc
182	.type	kernel_sgemm_nt_8x4_lib44cc, %function
183kernel_sgemm_nt_8x4_lib44cc:
184#elif defined(OS_MAC)
185	.global	kernel_sgemm_nt_8x4_lib44cc
186_kernel_sgemm_nt_8x4_lib44cc:
187#endif
188
189	PROLOGUE
190
191
192
193	// zero accumulation registers
194	vldr	d8, 99b
195	vldr	d9, 99b
196	vmov	q5, q4
197	vmov	q6, q4
198	vmov	q7, q4
199	vmov	q8, q4
200	vmov	q9, q4
201	vmov	q10, q4
202	vmov	q11, q4
203
204
205
206	// call inner kernel dgemm nt
207	mov		r4, r0 // kmax
208	mov		r5, r2 // A
209	mov		r6, r3 // sda
210	lsl		r6, r6, #4 // 4*sizeof(float)*sda
211	ldr		r7, [fp, #0] // B
212
213#if MACRO_LEVEL>=2
214	INNER_KERNEL_GEMM_ADD_NT_8X4_LIB4
215#else
216#if defined(OS_LINUX)
217	bl	inner_kernel_gemm_add_nt_8x4_lib4
218#elif defined(OS_MAC)
219	bl	_inner_kernel_gemm_add_nt_8x4_lib4
220#endif
221#endif
222
223
224
225	// call inner blend for generic alpha and beta
226	mov		r4, r1 // alpha
227	ldr		r5, [fp, #4] // beta
228	ldr		r6, [fp, #8] // C
229	ldr		r7, [fp, #12] // ldc
230	lsl		r7, r7, #2 // sizeof(float)*ldc
231
232#if MACRO_LEVEL>=1
233	INNER_SCALE_AB_8X4_LIB 99f
234#else
235#if defined(OS_LINUX)
236	bl inner_scale_ab_8x4_lib
237#elif defined(OS_MAC)
238	bl _inner_scale_ab_8x4_lib
239#endif
240#endif
241
242
243
244	// store n
245	ldr		r4, [fp, #16] // D
246	ldr		r5, [fp, #20] // ldd
247	lsl		r5, r5, #2 // sizeof(float)*ldd
248
249#if MACRO_LEVEL>=1
250	INNER_STORE_8X4_LIB
251#else
252#if defined(OS_LINUX)
253	bl inner_store_8x4_lib
254#elif defined(OS_MAC)
255	bl _inner_store_8x4_lib
256#endif
257#endif
258
259
260
261	EPILOGUE
262
263#if defined(OS_LINUX)
264	.size	kernel_sgemm_nt_8x4_lib44cc, .-kernel_sgemm_nt_8x4_lib44cc
265#endif
266
267
268
269
270
271	.align 3
27299: // 0
273	.word 0
274	.word 0
275
276
277
278
279
280
281
282