1/***************************************************************************
2Copyright (c) 2013, The OpenBLAS Project
3All rights reserved.
4Redistribution and use in source and binary forms, with or without
5modification, are permitted provided that the following conditions are
6met:
71. Redistributions of source code must retain the above copyright
8notice, this list of conditions and the following disclaimer.
92. Redistributions in binary form must reproduce the above copyright
10notice, this list of conditions and the following disclaimer in
11the documentation and/or other materials provided with the
12distribution.
133. Neither the name of the OpenBLAS project nor the names of
14its contributors may be used to endorse or promote products
15derived from this software without specific prior written permission.
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*****************************************************************************/
27
28/**************************************************************************************
29* 2013/11/07 Saar
30* 	 BLASTEST 		: OK
31* 	 CTEST			: OK
32* 	 TEST			: OK
33*
34**************************************************************************************/
35
36#define ASSEMBLER
37#include "common.h"
38
/* Bytes of scratch stack reserved below the pushed core registers. */
#define STACKSIZE	256

/* Register arguments as they arrive at the entry point. */
#define OLD_M		r0
#define OLD_N		r1
#define OLD_A		r2
#define OLD_LDA		r3

/******************************************************
* Frame layout, relative to fp:
*   [fp, #-128] .. [fp, #-64]  save area for d8 - d15
*   [fp, #-248]                spill slot for the running
*                              A pointer
*   [fp, #4]                   B pointer (first word above
*                              the pushed core registers)
*******************************************************/
#define A	[fp, #-248 ]
#define B	[fp, #4 ]

/* Values that stay in core registers for the whole routine. */
#define M	r0
#define N	r1
#define M4	r2	/* byte stride added to BO1 by the 2-wide copy macros */
#define LDA	r5	/* byte stride between two consecutive source lines   */

/* Working pointers. */
#define AO1	r6	/* read pointer into A                          */
#define BO1	r7	/* write pointer advanced by M4                 */
#define BO2	r8	/* write pointer advanced contiguously          */

/* Loop counters. */
#define I	r4
#define J	r12

/* Prefetch distance in bytes used by the pld hints. */
#define A_PRE	256
70
71/**************************************************************************************
72* Macro definitions
73**************************************************************************************/
/*
 * COPY2x2
 *   Reads 32 bytes at AO1 and 32 bytes at AO1 + LDA, then writes the
 *   64 bytes back to back at BO1.
 *   On exit: AO1 += 32, BO1 += M4, r3 = (old AO1) + LDA.
 *   Clobbers r3 and d0 - d7; condition flags are not modified.
 */
.macro COPY2x2

	add	r3, AO1, LDA			/* address of the second source line */
	pld	[ AO1, #A_PRE ]
	pld	[ r3, #A_PRE ]

	vldmia.f64	AO1!, { d0 - d3 }	/* first line, AO1 advances by 32 */
	vldmia.f64	r3, { d4 - d7 }		/* second line */

	vstmia.f64	BO1, { d0 - d7 }
	add	BO1, BO1, M4

.endm
88
/*
 * COPY1x2
 *   Reads 16 bytes at AO1 and 16 bytes at AO1 + LDA, then writes the
 *   32 bytes back to back at BO2.
 *   On exit: AO1 += 16, BO2 += 32, r3 = (old AO1) + LDA.
 *   Clobbers r3 and d0 - d3; condition flags are not modified.
 */
.macro COPY1x2

	add	r3, AO1, LDA			/* address of the second source line */

	vldmia.f64	AO1!, { d0 - d1 }	/* first line, AO1 advances by 16 */
	vldmia.f64	r3, { d2 - d3 }		/* second line */

	vstmia.f64	BO2!, { d0 - d3 }	/* BO2 advances by 32 */

.endm
101
102/*************************************************************************************************************************/
/*
 * COPY2x1
 *   Copies 32 bytes from AO1 to BO1.
 *   On exit: AO1 += 32, BO1 += M4.
 *   Clobbers d0 - d3; condition flags are not modified.
 */
.macro COPY2x1

	vldmia.f64	AO1!, { d0 - d3 }	/* AO1 advances by 32 */

	vstmia.f64	BO1, { d0 - d3 }
	add	BO1, BO1, M4

.endm
112
/*
 * COPY1x1
 *   Copies 16 bytes from AO1 to BO2.
 *   On exit: AO1 += 16, BO2 += 16.
 *   Clobbers d0 - d1; condition flags are not modified.
 */
.macro COPY1x1

	vldmia.f64	AO1!, { d0 - d1 }	/* AO1 advances by 16 */
	vstmia.f64	BO2!, { d0 - d1 }	/* BO2 advances by 16 */

.endm
122
123
124
125/**************************************************************************************
126* End of macro definitions
127**************************************************************************************/
128
129	PROLOGUE
130
131	.align 5
132
133	push	{r4 - r9, fp}
134	add	fp, sp, #24
135	sub	sp, sp, #STACKSIZE				// reserve stack
136
137	str	OLD_A, A					// store A
138
139	lsl	LDA, OLD_LDA, #4				// lda = lda * SIZE * 2
140
141	sub	r4, fp, #128
142	vstm	r4, { d8 - d15} 				// store floating point registers
143
144	lsl	r4 , M, #4					// M * SIZE * 2
145
146	ldr	r3, B
147
148	and	BO2 , N , #-2
149
150	mul	BO2, BO2, r4
151
152	add	BO2 , BO2, r3
153
154	lsl	M4, M, #5					// M4 = M * 2 * SIZE * 2
155
156zgemm_tcopy_L2_BEGIN:
157
158	asrs	J, M, #1					// J = N / 2
159	ble	zgemm_tcopy_L1_BEGIN
160
161zgemm_tcopy_L2_M2_BEGIN:
162
163	ldr	AO1, A						// AO1 = A
164	lsl	r3, LDA, #1					// r3 = 2 * LDA
165	add	r3, r3 , AO1					// A = A + 2 * LDA
166	str	r3, A						// store A
167
168	ldr	BO1, B
169	add	r3, BO1, #64					// B = B + 4 * SIZE *2
170	str	r3, B
171
172	asrs	I, N, #1					// I = M / 2
173	ble	zgemm_tcopy_L2_M2_60
174
175zgemm_tcopy_L2_M2_40:
176
177	COPY2x2
178	subs I, I, #1
179	bne	zgemm_tcopy_L2_M2_40
180
181zgemm_tcopy_L2_M2_60:
182
183	tst	N , #1
184	ble	zgemm_tcopy_L2_M2_END
185
186	COPY1x2
187
188
189zgemm_tcopy_L2_M2_END:
190
191	subs	J , J, #1						// j--
192	bne	zgemm_tcopy_L2_M2_BEGIN
193
194/*********************************************************************************************/
195
196zgemm_tcopy_L1_BEGIN:
197
198	tst	M, #1
199	ble	zgemm_tcopy_L999
200
201
202zgemm_tcopy_L1_M2_BEGIN:
203
204	ldr	AO1, A						// AO1 = A
205	add	r3, LDA , AO1					// A = A + 1 * LDA
206	str	r3, A						// store A
207
208	ldr	BO1, B
209	add	r3, BO1, #32					// B = B + 2 * SIZE *2
210	str	r3, B
211
212	asrs	I, N, #1					// I = M / 2
213	ble	zgemm_tcopy_L1_M2_60
214
215
216zgemm_tcopy_L1_M2_40:
217
218	COPY2x1
219	subs I, I, #1
220	bne	zgemm_tcopy_L1_M2_40
221
222zgemm_tcopy_L1_M2_60:
223
224	tst	N , #1
225	ble	zgemm_tcopy_L1_M2_END
226
227	COPY1x1
228
229
230zgemm_tcopy_L1_M2_END:
231
232
233
234zgemm_tcopy_L999:
235
236	sub	r3, fp, #128
237	vldm	r3, { d8 - d15}					// restore floating point registers
238
239	mov	r0, #0						// set return value
240	sub	sp, fp, #24
241	pop	{r4 - r9, fp}
242	bx	lr
243
244	EPILOGUE
245
246