1/***************************************************************************
2Copyright (c) 2013, The OpenBLAS Project
3All rights reserved.
4Redistribution and use in source and binary forms, with or without
5modification, are permitted provided that the following conditions are
6met:
71. Redistributions of source code must retain the above copyright
8notice, this list of conditions and the following disclaimer.
92. Redistributions in binary form must reproduce the above copyright
10notice, this list of conditions and the following disclaimer in
11the documentation and/or other materials provided with the
12distribution.
133. Neither the name of the OpenBLAS project nor the names of
14its contributors may be used to endorse or promote products
15derived from this software without specific prior written permission.
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*****************************************************************************/
27
28/**************************************************************************************
29* 2013/11/14 Saar
30* 	 BLASTEST 		: OK
31* 	 CTEST			: OK
32* 	 TEST			: OK
33*
34**************************************************************************************/
35
36#define ASSEMBLER
37#include "common.h"
38
/* STACKSIZE is not referenced anywhere in the code shown in this file. */
#define STACKSIZE 256

/**************************************************************************************
* Stack-passed arguments, addressed relative to fp.
*
* The prologue sets fp to the value sp had on entry, so [fp, #0] is the first
* word the caller placed on the stack.  The offsets depend on the float ABI and
* on the element type selected by COMPLEX / DOUBLE.
*
* NOTE(review): the non-zero starting offsets in the soft-float variants
* presumably skip a scalar argument that precedes x on the stack -- confirm
* against the C prototype of the kernel.
**************************************************************************************/

#if defined(__ARM_PCS_VFP)

/* Hard-float ABI: the prologue uses r3 as X directly, so no OLD_X exists. */
#define	OLD_INC_X	[fp, #0]
#define	OLD_Y		[fp, #4]
#define	OLD_INC_Y	[fp, #8]

#elif !defined(COMPLEX) && !defined(DOUBLE)

/* Soft-float ABI, real single precision */
#define	OLD_X		[fp, #0]
#define	OLD_INC_X	[fp, #4]
#define	OLD_Y		[fp, #8]
#define	OLD_INC_Y	[fp, #12]

#elif !defined(COMPLEX)

/* Soft-float ABI, real double precision */
#define	OLD_X		[fp, #8]
#define	OLD_INC_X	[fp, #12]
#define	OLD_Y		[fp, #16]
#define	OLD_INC_Y	[fp, #20]

#elif !defined(DOUBLE)

/* Soft-float ABI, complex single precision */
#define	OLD_X		[fp, #4]
#define	OLD_INC_X	[fp, #8]
#define	OLD_Y		[fp, #12]
#define	OLD_INC_Y	[fp, #16]

#else

/* Soft-float ABI, complex double precision */
#define	OLD_X		[fp, #16]
#define	OLD_INC_X	[fp, #20]
#define	OLD_Y		[fp, #24]
#define	OLD_INC_Y	[fp, #28]

#endif


/* Register roles used throughout the kernel */
#define	N	r0	/* element count on entry; reused for the return value */
#define	Y	r1	/* pointer into the second vector (loaded from OLD_Y) */
#define	INC_X	r2	/* increment of X; scaled to bytes on the strided path */
#define	X	r3	/* pointer into the first vector */
#define	INC_Y	r4	/* increment of Y; r4 is saved/restored by the prologue/epilogue */

#define	I	r12	/* loop counter */

#define	X_PRE	512	/* byte offset used by the pld prefetch instructions */
89
90/**************************************************************************************
91* Macro definitions
92**************************************************************************************/
93
94/*****************************************************************************************/
95
96
97
98#if	!defined(COMPLEX)
99
100#if	defined(DOUBLE)
101
/* Real double, contiguous: swap 4 doubles (32 bytes) between X and Y.
 * Both pointers advance by 32 through the store write-back. Uses d0-d7. */
.macro KERNEL_F4

	pld	[X, #X_PRE]				// prefetch ahead of X
	pld	[Y, #X_PRE]				// prefetch ahead of Y
	vldmia	X,  {d0-d3}				// d0-d3 = 4 doubles at X
	vldmia	Y,  {d4-d7}				// d4-d7 = 4 doubles at Y
	vstmia	Y!, {d0-d3}				// old X values -> Y, Y += 32
	vstmia	X!, {d4-d7}				// old Y values -> X, X += 32

.endm
112
113
/* Real double, contiguous: swap one double between X and Y.
 * Both pointers advance by 8 through the store write-back. Uses d0 and d4. */
.macro KERNEL_F1

	vldmia	X,  {d0}				// d0 = element at X
	vldmia	Y,  {d4}				// d4 = element at Y
	vstmia	Y!, {d0}				// old X value -> Y, Y += 8
	vstmia	X!, {d4}				// old Y value -> X, X += 8

.endm
122
/* Real double, strided: swap one double between X and Y, then step each
 * pointer by its increment. INC_X / INC_Y are added unchanged, so they must
 * already be byte strides (swap_kernel_S_BEGIN scales them). Uses d0 and d4. */
.macro KERNEL_S1

	vldmia	X, {d0}					// d0 = element at X
	vldmia	Y, {d4}					// d4 = element at Y
	vstmia	Y, {d0}					// old X value -> Y (no write-back)
	vstmia	X, {d4}					// old Y value -> X (no write-back)
	add	X, X, INC_X				// next X element
	add	Y, Y, INC_Y				// next Y element

.endm
133
134#else
135
/* Real single, contiguous: swap 4 floats (16 bytes) between X and Y.
 * Both pointers advance by 16 through the store write-back. Uses s0-s7.
 * No pld here: the swap_kernel_F4 loop issues the prefetch for this variant. */
.macro KERNEL_F4

	vldmia	X,  {s0-s3}				// s0-s3 = 4 floats at X
	vldmia	Y,  {s4-s7}				// s4-s7 = 4 floats at Y
	vstmia	Y!, {s0-s3}				// old X values -> Y, Y += 16
	vstmia	X!, {s4-s7}				// old Y values -> X, X += 16

.endm
144
145
/* Real single, contiguous: swap one float between X and Y.
 * Both pointers advance by 4 through the store write-back. Uses s0 and s4. */
.macro KERNEL_F1

	vldmia	X,  {s0}				// s0 = element at X
	vldmia	Y,  {s4}				// s4 = element at Y
	vstmia	Y!, {s0}				// old X value -> Y, Y += 4
	vstmia	X!, {s4}				// old Y value -> X, X += 4

.endm
154
/* Real single, strided: swap one float between X and Y, then step each
 * pointer by its increment. INC_X / INC_Y are added unchanged, so they must
 * already be byte strides (swap_kernel_S_BEGIN scales them). Uses s0 and s4. */
.macro KERNEL_S1

	vldmia	X, {s0}					// s0 = element at X
	vldmia	Y, {s4}					// s4 = element at Y
	vstmia	Y, {s0}					// old X value -> Y (no write-back)
	vstmia	X, {s4}					// old Y value -> X (no write-back)
	add	X, X, INC_X				// next X element
	add	Y, Y, INC_Y				// next Y element

.endm
165
166
167#endif
168
169#else
170
171#if	defined(DOUBLE)
172
/* Complex double, contiguous: swap 8 doubles (64 bytes) between X and Y in
 * two 32-byte halves, i.e. the four elements counted per iteration by the
 * N/4 loop. Each half prefetches, loads both sides, then stores crosswise;
 * the store write-back advances X and Y by 32 per half. Uses d0-d7. */
.macro KERNEL_F4

	/* first 32 bytes */
	pld	[X, #X_PRE]				// prefetch ahead of X
	pld	[Y, #X_PRE]				// prefetch ahead of Y
	vldmia	X,  {d0-d3}				// d0-d3 = 4 doubles at X
	vldmia	Y,  {d4-d7}				// d4-d7 = 4 doubles at Y
	vstmia	Y!, {d0-d3}				// old X values -> Y, Y += 32
	vstmia	X!, {d4-d7}				// old Y values -> X, X += 32

	/* second 32 bytes */
	pld	[X, #X_PRE]
	pld	[Y, #X_PRE]
	vldmia	X,  {d0-d3}
	vldmia	Y,  {d4-d7}
	vstmia	Y!, {d0-d3}
	vstmia	X!, {d4-d7}

.endm
190
/* Complex double, contiguous: swap one element (2 doubles, 16 bytes) between
 * X and Y. Both pointers advance by 16 through the store write-back.
 * Uses d0-d1 and d4-d5. */
.macro KERNEL_F1

	vldmia	X,  {d0-d1}				// d0-d1 = element at X
	vldmia	Y,  {d4-d5}				// d4-d5 = element at Y
	vstmia	Y!, {d0-d1}				// old X value -> Y, Y += 16
	vstmia	X!, {d4-d5}				// old Y value -> X, X += 16

.endm
199
/* Complex double, strided: swap one element (2 doubles) between X and Y, then
 * step each pointer by its increment. INC_X / INC_Y are added unchanged, so
 * they must already be byte strides (swap_kernel_S_BEGIN scales them).
 * Uses d0-d1 and d4-d5. */
.macro KERNEL_S1

	vldmia	X,  {d0-d1}				// d0-d1 = element at X
	vldmia	Y,  {d4-d5}				// d4-d5 = element at Y
	vstmia	Y,  {d0-d1}				// old X value -> Y (no write-back)
	vstmia	X,  {d4-d5}				// old Y value -> X (no write-back)
	add	X, X, INC_X				// next X element
	add	Y, Y, INC_Y				// next Y element

.endm
210
211
212#else
213
/* Complex single, contiguous: swap 8 floats (32 bytes) between X and Y in two
 * 16-byte halves, i.e. the four elements counted per iteration by the N/4
 * loop. One prefetch pair is issued for the whole 32 bytes; the store
 * write-back advances X and Y by 16 per half. Uses s0-s7. */
.macro KERNEL_F4

	pld	[X, #X_PRE]				// prefetch ahead of X
	pld	[Y, #X_PRE]				// prefetch ahead of Y

	/* first 16 bytes */
	vldmia	X,  {s0-s3}				// s0-s3 = 4 floats at X
	vldmia	Y,  {s4-s7}				// s4-s7 = 4 floats at Y
	vstmia	Y!, {s0-s3}				// old X values -> Y, Y += 16
	vstmia	X!, {s4-s7}				// old Y values -> X, X += 16

	/* second 16 bytes */
	vldmia	X,  {s0-s3}
	vldmia	Y,  {s4-s7}
	vstmia	Y!, {s0-s3}
	vstmia	X!, {s4-s7}

.endm
229
/* Complex single, contiguous: swap one element (2 floats, 8 bytes) between
 * X and Y. Both pointers advance by 8 through the store write-back.
 * Uses s0-s1 and s4-s5. */
.macro KERNEL_F1

	vldmia	X,  {s0-s1}				// s0-s1 = element at X
	vldmia	Y,  {s4-s5}				// s4-s5 = element at Y
	vstmia	Y!, {s0-s1}				// old X value -> Y, Y += 8
	vstmia	X!, {s4-s5}				// old Y value -> X, X += 8

.endm
238
/* Complex single, strided: swap one element (2 floats) between X and Y, then
 * step each pointer by its increment. INC_X / INC_Y are added unchanged, so
 * they must already be byte strides (swap_kernel_S_BEGIN scales them).
 * Uses s0-s1 and s4-s5. */
.macro KERNEL_S1

	vldmia	X,  {s0-s1}				// s0-s1 = element at X
	vldmia	Y,  {s4-s5}				// s4-s5 = element at Y
	vstmia	Y,  {s0-s1}				// old X value -> Y (no write-back)
	vstmia	X,  {s4-s5}				// old Y value -> X (no write-back)
	add	X, X, INC_X				// next X element
	add	Y, Y, INC_Y				// next Y element

.endm
249
250
251
252#endif
253
254#endif
255
256/**************************************************************************************
257* End of macro definitions
258**************************************************************************************/
259
/**************************************************************************************
* SWAP kernel body.  The symbol name and function framing come from the
* PROLOGUE / EPILOGUE macros, which are defined outside this file.
*
* Effect: exchanges N elements between the vectors addressed by X and Y.
*   - returns without touching memory when N <= 0, INC_X == 0 or INC_Y == 0
*   - INC_X == 1 and INC_Y == 1: contiguous path, KERNEL_F4 handles 4 elements
*     per call, KERNEL_F1 handles the N & 3 remainder
*   - otherwise: strided path; the increments are first scaled to byte strides
*     and KERNEL_S1 handles one element per call
*
* Arguments as read by this code:
*   N                 r0 on entry
*   X                 r3 on entry under __ARM_PCS_VFP, otherwise loaded from OLD_X
*   INC_X, Y, INC_Y   loaded from the stack via OLD_INC_X / OLD_Y / OLD_INC_Y
*
* Result:   r0 = 0
* Clobbers: r1-r3, r12, the condition flags and the VFP registers used by the
*           KERNEL_* macros; r4 and fp are saved on entry and restored on exit.
*
* Branches that follow asrs/ands use beq rather than a signed condition:
* those instructions do not write the V flag, so ble would test a stale V left
* by an earlier cmp/subs.  N > 0 is already established at those points, so
* "result is zero" is the only case that has to be detected.
**************************************************************************************/

	PROLOGUE

	.align 5
	push	{r4, fp}
	add	fp, sp, #8					// fp = sp on entry: base for the OLD_* stack arguments

#if !defined(__ARM_PCS_VFP)
	ldr	X, OLD_X
#endif
	ldr	INC_X, OLD_INC_X
	ldr	Y, OLD_Y
	ldr	INC_Y, OLD_INC_Y


	cmp	N, #0
	ble	swap_kernel_L999				// nothing to do for N <= 0

	cmp	INC_X, #0
	beq	swap_kernel_L999				// zero increment on X: return untouched

	cmp	INC_Y, #0
	beq	swap_kernel_L999				// zero increment on Y: return untouched

	cmp	INC_X, #1
	bne	swap_kernel_S_BEGIN

	cmp	INC_Y, #1
	bne	swap_kernel_S_BEGIN


swap_kernel_F_BEGIN:						// contiguous path (both increments are 1)


	asrs	I, N, #2					// I = N / 4
	beq	swap_kernel_F1					// fewer than 4 elements: remainder only

	.align 5

swap_kernel_F4:							// two KERNEL_F4 calls per pass, early exit between them

#if !defined(COMPLEX) && !defined(DOUBLE)
	pld	[ X, #X_PRE ]					// real-single KERNEL_F4 has no pld of its own
	pld	[ Y, #X_PRE ]
#endif

	KERNEL_F4

	subs	I, I, #1
	ble	swap_kernel_F1

	KERNEL_F4

	subs	I, I, #1
	bne	swap_kernel_F4

swap_kernel_F1:

	ands	I, N, #3					// I = N % 4
	beq	swap_kernel_L999

swap_kernel_F10:

	KERNEL_F1

	subs	I, I, #1
	bne	swap_kernel_F10

	b	swap_kernel_L999

swap_kernel_S_BEGIN:						// strided path

#if defined(COMPLEX)

#if defined(DOUBLE)
	lsl	INC_X, INC_X, #4				// INC_X * SIZE * 2
	lsl	INC_Y, INC_Y, #4				// INC_Y * SIZE * 2
#else
	lsl	INC_X, INC_X, #3				// INC_X * SIZE * 2
	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE * 2
#endif

#else

#if defined(DOUBLE)
	lsl	INC_X, INC_X, #3				// INC_X * SIZE
	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE
#else
	lsl	INC_X, INC_X, #2				// INC_X * SIZE
	lsl	INC_Y, INC_Y, #2				// INC_Y * SIZE
#endif

#endif


	asrs	I, N, #2					// I = N / 4
	beq	swap_kernel_S1					// fewer than 4 elements: remainder only

	.align 5

swap_kernel_S4:							// 4 elements per pass

	KERNEL_S1
	KERNEL_S1
	KERNEL_S1
	KERNEL_S1

	subs	I, I, #1
	bne	swap_kernel_S4

swap_kernel_S1:

	ands	I, N, #3					// I = N % 4
	beq	swap_kernel_L999

swap_kernel_S10:

	KERNEL_S1

	subs	I, I, #1
	bne	swap_kernel_S10


swap_kernel_L999:

	mov	r0, #0						// set return value

	sub	sp, fp, #8					// back to the slot holding the saved r4/fp
	pop	{r4, fp}
	bx	lr

	EPILOGUE
391
392