1/***************************************************************************
2Copyright (c) 2013, The OpenBLAS Project
3All rights reserved.
4Redistribution and use in source and binary forms, with or without
5modification, are permitted provided that the following conditions are
6met:
71. Redistributions of source code must retain the above copyright
8notice, this list of conditions and the following disclaimer.
92. Redistributions in binary form must reproduce the above copyright
10notice, this list of conditions and the following disclaimer in
11the documentation and/or other materials provided with the
12distribution.
133. Neither the name of the OpenBLAS project nor the names of
14its contributors may be used to endorse or promote products
15derived from this software without specific prior written permission.
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*****************************************************************************/
27
28/**************************************************************************************
29* 2013/11/05 Saar
30* 	 BLASTEST 		: OK
31* 	 CTEST			: OK
32* 	 TEST			: OK
33*
34**************************************************************************************/
35
36#define ASSEMBLER
37#include "common.h"
38
/* Bytes of scratch stack reserved below the pushed core registers. */
#define STACKSIZE 256

/* Names for the four incoming argument registers.  They are not referenced
 * by the code visible in this file; the M/N/A aliases below are used instead. */
#define	OLD_M	r0
#define	OLD_N	r1
#define	OLD_A	r2
#define	OLD_LDA	r3


/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
* (the kernel saves s8 - s15, i.e. 32 bytes, starting
* at [fp, #-128])
*******************************************************/

/* Stack slot that holds lda after it has been scaled by 4. */
#define LDA	[fp, #-260 ]

/* Word just above the seven pushed registers (fp = sp + 24 after the push),
 * i.e. the word at the stack pointer on entry.  It is loaded into BO and
 * used as the output pointer. */
#define B	[fp, #4 ]

/* Working aliases: M and N are only read; A is advanced past each
 * group of processed vectors. */
#define M	r0
#define N	r1
#define A	r2

/* Output pointer, post-incremented by every vstmia in the COPY macros. */
#define	BO	r5

/* Read pointers, one per source vector of the current group
 * (consecutive pointers are LDA bytes apart). */
#define	AO1	r6
#define	AO2	r7
#define	AO3	r8
#define	AO4	r9

/* Loop counters: I (inner) reuses r3 once lda has been spilled to the
 * LDA slot; J (outer) counts groups of four vectors. */
#define I	r3
#define	J	r12

/* Byte offset ahead of the read pointers used by the pld prefetches. */
#define A_PRE	192
72
73/**************************************************************************************
74* Macro definitions
75**************************************************************************************/
76
/* COPY4x4: read 4 consecutive floats from each of AO1..AO4 and write them
 * to BO as four groups { AO1[k], AO2[k], AO3[k], AO4[k] }, k = 0..3.
 * Advances AO1..AO4 by 16 bytes and BO by 64 bytes.
 * Overwrites s0 - s15; condition flags are not touched. */
.macro COPY4x4

	/* k = 0 */
	vldr	s0 , [ AO1 ]
	vldr	s1 , [ AO2 ]
	vldr	s2 , [ AO3 ]
	vldr	s3 , [ AO4 ]

	/* k = 1 */
	vldr	s4 , [ AO1, #4 ]
	vldr	s5 , [ AO2, #4 ]
	vldr	s6 , [ AO3, #4 ]
	vldr	s7 , [ AO4, #4 ]

	/* k = 2 */
	vldr	s8 , [ AO1, #8 ]
	vldr	s9 , [ AO2, #8 ]
	vldr	s10, [ AO3, #8 ]
	vldr	s11, [ AO4, #8 ]

	/* k = 3 */
	vldr	s12, [ AO1, #12 ]
	vldr	s13, [ AO2, #12 ]
	vldr	s14, [ AO3, #12 ]
	vldr	s15, [ AO4, #12 ]

	/* step every read pointer past the four elements just consumed */
	add	AO1, AO1, #16
	add	AO2, AO2, #16
	add	AO3, AO3, #16
	add	AO4, AO4, #16

	/* all loads are done; emit the 16 floats in one burst */
	vstmia.f32	BO!, { s0 - s15 }

.endm
109
/* COPY1x4: read one float from each of AO1..AO4 and write the four values
 * to BO in that order.  Advances AO1..AO4 by 4 bytes and BO by 16 bytes.
 * Overwrites s0 - s3; condition flags are not touched. */
.macro COPY1x4

	vldr	s0 , [ AO1 ]
	vldr	s1 , [ AO2 ]
	vldr	s2 , [ AO3 ]
	vldr	s3 , [ AO4 ]

	vstmia.f32	BO!, { s0 - s3 }

	add	AO1, AO1, #4
	add	AO2, AO2, #4
	add	AO3, AO3, #4
	add	AO4, AO4, #4

.endm
124
/* COPY4x2: read 4 consecutive floats from AO1 and from AO2 and write them
 * to BO interleaved as { AO1[k], AO2[k] }, k = 0..3.
 * Advances AO1 and AO2 by 16 bytes and BO by 32 bytes.
 * Overwrites s0 - s7; condition flags are not touched. */
.macro COPY4x2

	/* even registers take AO1, odd registers take AO2 */
	vldr	s0 , [ AO1 ]
	vldr	s1 , [ AO2 ]
	vldr	s2 , [ AO1, #4 ]
	vldr	s3 , [ AO2, #4 ]
	vldr	s4 , [ AO1, #8 ]
	vldr	s5 , [ AO2, #8 ]
	vldr	s6 , [ AO1, #12 ]
	vldr	s7 , [ AO2, #12 ]

	add	AO1, AO1, #16
	add	AO2, AO2, #16

	vstmia.f32	BO!, { s0 - s7 }

.endm
142
143
/* COPY1x2: read one float from AO1 and one from AO2 and write the pair
 * to BO.  Advances AO1 and AO2 by 4 bytes and BO by 8 bytes.
 * Overwrites s0 - s1; condition flags are not touched. */
.macro COPY1x2

	vldr	s0 , [ AO1 ]
	vldr	s1 , [ AO2 ]

	add	AO1, AO1, #4
	add	AO2, AO2, #4

	vstmia.f32	BO!, { s0 - s1 }

.endm
154
/* COPY4x1: move 4 consecutive floats from AO1 to BO.
 * Advances AO1 and BO by 16 bytes each.
 * Overwrites s0 - s3; condition flags are not touched. */
.macro COPY4x1

	vldmia.f32	AO1!, { s0 - s3 }		/* load 4 floats, AO1 += 16 */
	vstmia.f32	BO!, { s0 - s3 }		/* store 4 floats, BO += 16 */

.endm
166
167
/* COPY1x1: move a single float from AO1 to BO.
 * Advances AO1 and BO by 4 bytes each.
 * Overwrites s0; condition flags are not touched. */
.macro COPY1x1

	vldmia.f32	AO1!, { s0 }			/* load 1 float, AO1 += 4 */
	vstmia.f32	BO!, { s0 }			/* store 1 float, BO += 4 */

.endm
176
177
178
179
180
181/**************************************************************************************
182* End of macro definitions
183**************************************************************************************/
184
/**************************************************************************************
* Packing kernel entry point (symbol name is supplied by PROLOGUE).
*
* In:   r0        = M, number of consecutive floats taken from each source vector
*       r1        = N, number of source vectors
*       r2        = A, address of the first source vector
*       r3        = lda, distance between source vectors in floats (scaled to bytes here)
*       [sp, #0]  = B, address of the output buffer (read through [fp, #4])
* Out:  r0 = 0
*
* The source vectors are consumed in groups of four, then at most one group of two,
* then at most one single vector.  Inside a group the output is interleaved: element i
* of every vector in the group is written before element i+1 of any of them.
*
* Saved and restored: r4 - r9, fp, s8 - s15.
* Overwritten: r0, r2, r3, r12, s0 - s7, condition flags.
*
* Note on branches: asrs/ands/tst update N, Z and C but leave V as it was, and the
* flags are not defined on function entry.  A signed "le" test placed directly after
* one of them would therefore read a stale V.  Counts produced by a shift are checked
* with an explicit cmp (which defines V); masked values (0..3) are checked with beq.
**************************************************************************************/

	PROLOGUE

	.align 5

	push	{r4 - r9, fp}
	add	fp, sp, #24					// fp = slot of saved fp; [fp, #4] is the first stack argument
	sub	sp, sp, #STACKSIZE				// reserve stack


	lsl	r3, r3, #2					// lda = lda * 4
	str	r3, LDA						// spill it: r3 is reused as the counter I

	sub	r4, fp, #128
	vstm	r4, { s8 - s15} 				// store floating point registers

	ldr	BO, B

sgemm_ncopy_L4_BEGIN:

	asr	J, N, #2					// J = N / 4
	cmp	J, #0						// defines V before the signed test
	ble	sgemm_ncopy_L2_BEGIN

sgemm_ncopy_L4_M4_BEGIN:

	mov	AO1, A						// AO1 = A
	ldr	r4 , LDA
	add	AO2, AO1, r4
	add	AO3, AO2, r4
	add	AO4, AO3, r4
	add	A  , AO4, r4					// A = A + 4 * LDA

	asr	I, M, #2					// I = M / 4
	cmp	I, #0
	ble	sgemm_ncopy_L4_M4_40

sgemm_ncopy_L4_M4_20:

	// Two 4x4 tiles per trip, one prefetch per trip; the exit in the
	// middle handles an odd number of tiles.
	pld	[ AO1, #A_PRE ]
	pld	[ AO2, #A_PRE ]
	pld	[ AO3, #A_PRE ]
	pld	[ AO4, #A_PRE ]
	COPY4x4

	subs	I , I , #1
	ble	sgemm_ncopy_L4_M4_40

	COPY4x4

	subs	I , I , #1
	bne	sgemm_ncopy_L4_M4_20


sgemm_ncopy_L4_M4_40:

	ands	I, M , #3					// I = M % 4 (leftover elements)
	beq	sgemm_ncopy_L4_M4_END

sgemm_ncopy_L4_M4_60:

	COPY1x4

	subs	I , I , #1
	bne	sgemm_ncopy_L4_M4_60


sgemm_ncopy_L4_M4_END:

	subs	J , J, #1						// j--
	bne	sgemm_ncopy_L4_M4_BEGIN



/*********************************************************************************************/

sgemm_ncopy_L2_BEGIN:

	tst	N, #3						// no vectors left over after the groups of four?
	beq	sgemm_ncopy_L999

	tst	N, #2						// is there a group of two?
	beq	sgemm_ncopy_L1_BEGIN

sgemm_ncopy_L2_M4_BEGIN:

	mov	AO1, A						// AO1 = A
	ldr	r4 , LDA
	add	AO2, AO1, r4
	add	A  , AO2, r4 					// A = A + 2 * LDA

	asr	I, M, #2					// I = M / 4
	cmp	I, #0
	ble	sgemm_ncopy_L2_M4_40

sgemm_ncopy_L2_M4_20:

	COPY4x2

	subs	I , I , #1
	bne	sgemm_ncopy_L2_M4_20


sgemm_ncopy_L2_M4_40:

	ands	I, M , #3					// I = M % 4
	beq	sgemm_ncopy_L2_M4_END

sgemm_ncopy_L2_M4_60:

	COPY1x2

	subs	I , I , #1
	bne	sgemm_ncopy_L2_M4_60


sgemm_ncopy_L2_M4_END:


/*********************************************************************************************/

sgemm_ncopy_L1_BEGIN:

	tst	N, #1						// is there a final single vector?
	beq	sgemm_ncopy_L999


sgemm_ncopy_L1_M4_BEGIN:

	mov	AO1, A						// AO1 = A
	ldr	r4 , LDA
	add	A  , AO1, r4 					// A = A + 1 * LDA

	asr	I, M, #2					// I = M / 4
	cmp	I, #0
	ble	sgemm_ncopy_L1_M4_40

sgemm_ncopy_L1_M4_20:

	COPY4x1

	subs	I , I , #1
	bne	sgemm_ncopy_L1_M4_20


sgemm_ncopy_L1_M4_40:

	ands	I, M , #3					// I = M % 4
	beq	sgemm_ncopy_L1_M4_END

sgemm_ncopy_L1_M4_60:

	COPY1x1

	subs	I , I , #1
	bne	sgemm_ncopy_L1_M4_60


sgemm_ncopy_L1_M4_END:



sgemm_ncopy_L999:

	sub	r3, fp, #128
	vldm	r3, { s8 - s15}					// restore floating point registers

	movs	r0, #0						// set return value
	sub	sp, fp, #24					// drop the scratch area; sp is back at the pushed registers
	pop	{r4 - r9, fp}
	bx	lr

	EPILOGUE
353
354