1/***************************************************************************
2Copyright (c) 2013, The OpenBLAS Project
3All rights reserved.
4Redistribution and use in source and binary forms, with or without
5modification, are permitted provided that the following conditions are
6met:
71. Redistributions of source code must retain the above copyright
8notice, this list of conditions and the following disclaimer.
92. Redistributions in binary form must reproduce the above copyright
10notice, this list of conditions and the following disclaimer in
11the documentation and/or other materials provided with the
12distribution.
133. Neither the name of the OpenBLAS project nor the names of
14its contributors may be used to endorse or promote products
15derived from this software without specific prior written permission.
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*****************************************************************************/
27
28/**************************************************************************************
29* 2013/11/05 Saar
30* 	 BLASTEST 		: OK
31* 	 CTEST			: OK
32* 	 TEST			: OK
33*
34**************************************************************************************/
35
#define ASSEMBLER
#include "common.h"

/* Bytes taken from the stack below the saved core registers. */
#define STACKSIZE	256

/* Arguments as they arrive in core registers. */
#define OLD_M		r0
#define OLD_N		r1
#define OLD_A		r2
#define OLD_LDA		r3


/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
*******************************************************/

/* Frame slots addressed through fp. */
#define LDA		[fp, #-260]		// spill slot for the byte-scaled lda
#define B		[fp, #4]		// first stack-passed argument

/* Live argument registers. */
#define M		r0
#define N		r1
#define A		r2

/* Destination write pointer. */
#define BO		r5

/* Read pointers for the two source columns being copied. */
#define AO1		r6
#define AO2		r7

/* Loop counters; I reuses r3 once lda has been spilled to LDA. */
#define I		r3
#define J		r12

/* Offset in bytes used by the pld prefetches. */
#define A_PRE		256
70
71/**************************************************************************************
72* Macro definitions
73**************************************************************************************/
74
.macro COPY2x2

	// Copies 2 elements (16 bytes each) from each of two columns.
	// Output order in BO: AO1[0], AO2[0], AO1[1], AO2[1].
	// Advances AO1 and AO2 by 32 bytes and BO by 64 bytes. Uses d0-d7.

	pld	[ AO1, #A_PRE ]
	pld	[ AO2, #A_PRE ]

	vldr	d0, [ AO1, #0 ]			// first column, element 0
	vldr	d1, [ AO1, #8 ]
	vldr	d4, [ AO1, #16 ]		// first column, element 1
	vldr	d5, [ AO1, #24 ]

	vldr	d2, [ AO2, #0 ]			// second column, element 0
	vldr	d3, [ AO2, #8 ]
	add	AO1, AO1, #32
	vldr	d6, [ AO2, #16 ]		// second column, element 1
	vldr	d7, [ AO2, #24 ]

	vstmia.f64	BO!, { d0 - d7 }
	add	AO2, AO2, #32

.endm
94
95
.macro COPY1x2

	// Copies 1 element (16 bytes) from each of two columns.
	// Output order in BO: AO1[0], AO2[0].
	// Advances AO1 and AO2 by 16 bytes and BO by 32 bytes. Uses d0-d3.

	vldr	d0, [ AO1, #0 ]			// first column
	vldr	d1, [ AO1, #8 ]
	vldr	d2, [ AO2, #0 ]			// second column
	vldr	d3, [ AO2, #8 ]

	add	AO1, AO1, #16
	vstmia.f64	BO!, { d0 - d3 }
	add	AO2, AO2, #16

.endm
108
.macro COPY2x1

	// Copies 2 consecutive elements (32 bytes) from a single column.
	// Advances AO1 and BO by 32 bytes. Uses d0-d3.

	vldr	d0, [ AO1, #0 ]			// element 0
	vldr	d1, [ AO1, #8 ]
	vldr	d2, [ AO1, #16 ]		// element 1
	vldr	d3, [ AO1, #24 ]

	vstmia.f64	BO!, { d0 - d3 }
	add	AO1, AO1, #32

.endm
120
121
.macro COPY1x1

	// Copies 1 element (16 bytes) from a single column.
	// Advances AO1 and BO by 16 bytes. Uses d0-d1.

	vldr	d0, [ AO1, #0 ]
	vldr	d1, [ AO1, #8 ]

	vstmia.f64	BO!, { d0 - d1 }
	add	AO1, AO1, #16

.endm
131
132
133
134
135
/**************************************************************************************
* End of macro definitions
**************************************************************************************/

/**************************************************************************************
* Entry point (the symbol name is supplied by the PROLOGUE macro from common.h).
*
* In:   r0   = M    number of consecutive elements per column
*       r1   = N    number of columns
*       r2   = A    source pointer
*       r3   = LDA  column stride in elements; scaled to bytes (x16) below
*       [sp] = B    destination pointer, read back through fp as [fp, #4]
* Out:  r0   = 0
*
* Columns are consumed two at a time. For every row the 16-byte element of the
* first column is written to B, followed by the 16-byte element of the second
* column. A trailing odd column is copied straight through.
*
* Frame: push of 7 words, fp = address of the saved fp slot, so [fp, #4] is the
* stack pointer value on entry. d8-d15 are saved at [fp, #-128] and restored on exit.
*
* Flag handling: asrs/ands/tst update N, Z and C but leave V unchanged, and V is
* not defined on entry. "ble" tests Z or (N != V), so loop counts are checked
* with an explicit cmp against zero, and 0/1 mask results are checked with beq.
**************************************************************************************/

	PROLOGUE

	.align 5

	push	{r4 - r9, fp}
	add	fp, sp, #24
	sub	sp, sp, #STACKSIZE				// reserve stack


	lsl	r3, r3, #4					// lda = lda * 8 * 2
	str	r3, LDA						// r3 is reused as I from here on

	sub	r4, fp, #128
	vstm	r4, { d8 - d15} 				// store floating point registers

	ldr	BO, B

/*********************************************************************************************/

zgemm_ncopy_L2_BEGIN:

	asr	J, N, #1					// J = N / 2
	cmp	J, #0						// sets N, Z and V from J itself
	ble	zgemm_ncopy_L1_BEGIN				// no column pair to copy

zgemm_ncopy_L2_M2_BEGIN:

	mov	AO1, A						// AO1 = A
	ldr	r4 , LDA
	add	AO2, AO1, r4					// AO2 = A + LDA
	add	A  , AO2, r4 					// A = A + 2 * LDA

	asr	I, M, #1					// I = M / 2
	cmp	I, #0
	ble	zgemm_ncopy_L2_M2_40				// fewer than two rows

zgemm_ncopy_L2_M2_20:

	COPY2x2

	subs	I , I , #1
	bne	zgemm_ncopy_L2_M2_20


zgemm_ncopy_L2_M2_40:

	ands	I, M , #1					// I = 0 or 1
	beq	zgemm_ncopy_L2_M2_END				// M is even, no leftover row

zgemm_ncopy_L2_M2_60:

	COPY1x2

	subs	I , I , #1
	bne	zgemm_ncopy_L2_M2_60


zgemm_ncopy_L2_M2_END:

	subs	J , J, #1					// j--
	bne	zgemm_ncopy_L2_M2_BEGIN


/*********************************************************************************************/

zgemm_ncopy_L1_BEGIN:

	tst	N, #1
	beq	zgemm_ncopy_L999				// N is even, no leftover column


zgemm_ncopy_L1_M2_BEGIN:

	mov	AO1, A						// AO1 = A
	ldr	r4 , LDA
	add	A  , AO1, r4 					// A = A + 1 * LDA

	asr	I, M, #1					// I = M / 2
	cmp	I, #0
	ble	zgemm_ncopy_L1_M2_40				// fewer than two rows

zgemm_ncopy_L1_M2_20:

	COPY2x1

	subs	I , I , #1
	bne	zgemm_ncopy_L1_M2_20


zgemm_ncopy_L1_M2_40:

	ands	I, M , #1					// I = 0 or 1
	beq	zgemm_ncopy_L1_M2_END				// M is even, no leftover row

zgemm_ncopy_L1_M2_60:

	COPY1x1

	subs	I , I , #1
	bne	zgemm_ncopy_L1_M2_60


zgemm_ncopy_L1_M2_END:



zgemm_ncopy_L999:

	sub	r3, fp, #128
	vldm	r3, { d8 - d15}					// restore floating point registers

	movs	r0, #0						// set return value
	sub	sp, fp, #24					// back to the address of the saved r4
	pop	{r4 - r9, fp}
	bx	lr

	EPILOGUE
254
255