1/***************************************************************************
2Copyright (c) 2013, The OpenBLAS Project
3All rights reserved.
4Redistribution and use in source and binary forms, with or without
5modification, are permitted provided that the following conditions are
6met:
71. Redistributions of source code must retain the above copyright
8notice, this list of conditions and the following disclaimer.
92. Redistributions in binary form must reproduce the above copyright
10notice, this list of conditions and the following disclaimer in
11the documentation and/or other materials provided with the
12distribution.
133. Neither the name of the OpenBLAS project nor the names of
14its contributors may be used to endorse or promote products
15derived from this software without specific prior written permission.
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*****************************************************************************/
27
28/**************************************************************************************
29* 2013/11/29 Saar
30* 	 BLASTEST 		: OK
31* 	 CTEST			: OK
32* 	 TEST			: OK
33*
34**************************************************************************************/
35
36#define ASSEMBLER
37#include "common.h"
38
39#define STACKSIZE 256
40
41#if !defined(__ARM_PCS_VFP)
42#define OLD_ALPHAR	[fp, #0 ]
43#define OLD_ALPHAI	[fp, #8 ]
44#define OLD_A_SOFTFP	[fp, #16]
45#define OLD_LDA		[fp, #20]
46#define X		[fp, #24]
47#define OLD_INC_X	[fp, #28]
48#define Y		[fp, #32]
49#define OLD_INC_Y	[fp, #36]
50#else
51#define OLD_LDA		[fp, #0 ]
52#define X		[fp, #4 ]
53#define OLD_INC_X	[fp, #8 ]
54#define Y		[fp, #12 ]
55#define OLD_INC_Y	[fp, #16 ]
56#endif
57
58#define OLD_A		r3
59#define	OLD_N		r1
60
61#define M	r0
62#define AO1	r1
63#define J	r2
64
65#define AO2	r4
66#define XO	r5
67#define YO	r6
68#define LDA	r7
69#define INC_X	r8
70#define INC_Y	r9
71
72#define I	r12
73
74#define FP_ZERO [fp, #-228]
75#define FP_ZERO_0 [fp, #-228]
76#define FP_ZERO_1 [fp, #-224]
77
78#define N	[fp, #-252 ]
79#define A	[fp, #-256 ]
80
81
82#define X_PRE	512
83#define A_PRE	512
84#define Y_PRE	32
85
86/**************************************************************************************
87* Macro definitions
88**************************************************************************************/
89
90#if !defined(CONJ) && !defined(XCONJ)
91
92        #define KMAC_R  vmls.f64
93        #define KMAC_I  fmacd
94
95        #define FMAC_R1 fmacd
96        #define FMAC_R2 vmls.f64
97        #define FMAC_I1 fmacd
98        #define FMAC_I2 fmacd
99
100#elif defined(CONJ) && !defined(XCONJ)
101
102        #define KMAC_R  fmacd
103        #define KMAC_I  vmls.f64
104
105        #define FMAC_R1 fmacd
106        #define FMAC_R2 vmls.f64
107        #define FMAC_I1 fmacd
108        #define FMAC_I2 fmacd
109
110#elif !defined(CONJ) && defined(XCONJ)
111
112        #define KMAC_R  fmacd
113        #define KMAC_I  vmls.f64
114
115        #define FMAC_R1 fmacd
116        #define FMAC_R2 fmacd
117        #define FMAC_I1 vmls.f64
118        #define FMAC_I2 fmacd
119
120#else
121
122        #define KMAC_R  vmls.f64
123        #define KMAC_I  fmacd
124
125        #define FMAC_R1 fmacd
126        #define FMAC_R2 fmacd
127        #define FMAC_I1 vmls.f64
128        #define FMAC_I2 fmacd
129
130#endif
131
132
133
134.macro INIT_F2
135
136	fldd		d12, FP_ZERO
137	vmov.f64	d13, d12
138	vmov.f64	d14, d12
139	vmov.f64	d15, d12
140
141.endm
142
143.macro KERNEL_F2X4
144
145	KERNEL_F2X1
146	KERNEL_F2X1
147	KERNEL_F2X1
148	KERNEL_F2X1
149
150.endm
151
152.macro KERNEL_F2X1
153
154	vldmia.f64	XO! ,  { d2 - d3 }
155	vldmia.f64	AO1!,  { d4 - d5 }
156
157	fmacd	d12 , d4 , d2
158	fmacd	d13 , d4 , d3
159	vldmia.f64	AO2!,  { d8 - d9   }
160	KMAC_R  d12 , d5 , d3
161        KMAC_I  d13 , d5 , d2
162
163	fmacd	d14 , d8 , d2
164	fmacd	d15 , d8 , d3
165        KMAC_R  d14 , d9 , d3
166        KMAC_I  d15 , d9 , d2
167
168.endm
169
170.macro	SAVE_F2
171
172	vldmia.f64	YO,  { d4 - d7 }
173
174	FMAC_R1 d4 , d0 , d12
175        FMAC_I1 d5 , d0 , d13
176        FMAC_R2 d4 , d1 , d13
177        FMAC_I2 d5 , d1 , d12
178
179        FMAC_R1 d6 , d0 , d14
180        FMAC_I1 d7 , d0 , d15
181        FMAC_R2 d6 , d1 , d15
182        FMAC_I2 d7 , d1 , d14
183
184	vstmia.f64	YO!, { d4 - d7 }
185
186.endm
187
188/************************************************************************************************/
189
190.macro INIT_F1
191
192	fldd		d12, FP_ZERO
193	vmov.f64	d13, d12
194
195.endm
196
197.macro KERNEL_F1X4
198
199	KERNEL_F1X1
200	KERNEL_F1X1
201	KERNEL_F1X1
202	KERNEL_F1X1
203
204.endm
205
206.macro KERNEL_F1X1
207
208	vldmia.f64	XO! ,  { d2 - d3 }
209	vldmia.f64	AO1!,  { d4 - d5 }
210
211	fmacd	d12 , d4 , d2
212	fmacd	d13 , d4 , d3
213	KMAC_R  d12 , d5 , d3
214        KMAC_I  d13 , d5 , d2
215
216.endm
217
218.macro	SAVE_F1
219
220	vldmia.f64	YO,  { d4 - d5 }
221
222	FMAC_R1 d4 , d0 , d12
223        FMAC_I1 d5 , d0 , d13
224        FMAC_R2 d4 , d1 , d13
225        FMAC_I2 d5 , d1 , d12
226
227	vstmia.f64	YO!, { d4 - d5 }
228
229.endm
230
231/************************************************************************************************/
232
233.macro INIT_S2
234
235	fldd		d12, FP_ZERO
236	vmov.f64	d13, d12
237	vmov.f64	d14, d12
238	vmov.f64	d15, d12
239
240.endm
241
242.macro KERNEL_S2X4
243
244	KERNEL_S2X1
245	KERNEL_S2X1
246	KERNEL_S2X1
247	KERNEL_S2X1
248
249.endm
250
251.macro KERNEL_S2X1
252
253	vldmia.f64	XO  ,  { d2 - d3 }
254	vldmia.f64	AO1!,  { d4 - d5 }
255	vldmia.f64	AO2!,  { d8 - d9   }
256
257	fmacd	d12 , d4 , d2
258	fmacd	d13 , d4 , d3
259	KMAC_R  d12 , d5 , d3
260        KMAC_I  d13 , d5 , d2
261
262	fmacd	d14 , d8 , d2
263	fmacd	d15 , d8 , d3
264        KMAC_R  d14 , d9 , d3
265        KMAC_I  d15 , d9 , d2
266
267	add	XO, XO, INC_X
268
269.endm
270
271.macro	SAVE_S2
272
273	vldmia.f64	YO,  { d4 - d5 }
274
275	FMAC_R1 d4 , d0 , d12
276        FMAC_I1 d5 , d0 , d13
277        FMAC_R2 d4 , d1 , d13
278        FMAC_I2 d5 , d1 , d12
279
280	vstmia.f64	YO,  { d4 - d5 }
281
282	add	YO, YO, INC_Y
283
284	vldmia.f64	YO,  { d6 - d7 }
285
286        FMAC_R1 d6 , d0 , d14
287        FMAC_I1 d7 , d0 , d15
288        FMAC_R2 d6 , d1 , d15
289        FMAC_I2 d7 , d1 , d14
290
291	vstmia.f64	YO,  { d6 - d7 }
292
293	add	YO, YO, INC_Y
294
295.endm
296
297/************************************************************************************************/
298
299.macro INIT_S1
300
301	fldd		d12, FP_ZERO
302	vmov.f64	d13, d12
303
304.endm
305
306.macro KERNEL_S1X4
307
308	KERNEL_S1X1
309	KERNEL_S1X1
310	KERNEL_S1X1
311	KERNEL_S1X1
312
313.endm
314
315.macro KERNEL_S1X1
316
317	vldmia.f64	XO  ,  { d2 - d3 }
318	vldmia.f64	AO1!,  { d4 - d5 }
319
320	fmacd	d12 , d4 , d2
321	fmacd	d13 , d4 , d3
322	KMAC_R  d12 , d5 , d3
323        KMAC_I  d13 , d5 , d2
324
325	add	XO, XO, INC_X
326
327.endm
328
329.macro	SAVE_S1
330
331	vldmia.f64	YO,  { d4 - d5 }
332
333	FMAC_R1 d4 , d0 , d12
334        FMAC_I1 d5 , d0 , d13
335        FMAC_R2 d4 , d1 , d13
336        FMAC_I2 d5 , d1 , d12
337
338	vstmia.f64	YO,  { d4 - d5 }
339
340	add	YO, YO, INC_Y
341
342.endm
343
344
345
346/**************************************************************************************
347* End of macro definitions
348**************************************************************************************/
349
350	PROLOGUE
351
352	.align 5
353	push    {r4 - r9 , fp}
354        add     fp, sp, #28
355	sub     sp, sp, #STACKSIZE                              // reserve stack
356
357        sub     r12, fp, #192
358
359#if	defined(DOUBLE)
360        vstm    r12, { d8 - d15 }                                 // store floating point registers
361#else
362        vstm    r12, { s8 - s15 }                                 // store floating point registers
363#endif
364
365        movs    r12, #0
366        str     r12, FP_ZERO
367        str     r12, FP_ZERO_1
368
369	cmp	M, #0
370	ble	zgemvt_kernel_L999
371
372	cmp	OLD_N, #0
373	ble	zgemvt_kernel_L999
374
375#if !defined(__ARM_PCS_VFP)
376	vldr	d0, OLD_ALPHAR
377	vldr	d1, OLD_ALPHAI
378	ldr	OLD_A, OLD_A_SOFTFP
379#endif
380
381	str	OLD_A, A
382	str	OLD_N, N
383
384	ldr    INC_X , OLD_INC_X
385	ldr    INC_Y , OLD_INC_Y
386
387	cmp	INC_X, #0
388	beq	zgemvt_kernel_L999
389
390	cmp	INC_Y, #0
391	beq	zgemvt_kernel_L999
392
393	ldr	LDA, OLD_LDA
394
395
396#if defined(DOUBLE)
397	lsl	LDA, LDA, #4				// LDA * SIZE
398#else
399	lsl	LDA, LDA, #3				// LDA * SIZE
400#endif
401
402	cmp	INC_X, #1
403	bne	zgemvt_kernel_S2_BEGIN
404
405	cmp	INC_Y, #1
406	bne	zgemvt_kernel_S2_BEGIN
407
408
409zgemvt_kernel_F2_BEGIN:
410
411	ldr	YO , Y
412
413	ldr	J, N
414	asrs	J, J, #1					// J = N / 2
415	ble	zgemvt_kernel_F1_BEGIN
416
417zgemvt_kernel_F2X4:
418
419	ldr	AO1, A
420	add	AO2, AO1, LDA
421	add	r3 , AO2, LDA
422	str	r3 , A
423
424	ldr	XO , X
425
426	INIT_F2
427
428	asrs	I, M, #2					// I = M / 4
429	ble	zgemvt_kernel_F2X1
430
431
432zgemvt_kernel_F2X4_10:
433
434	KERNEL_F2X4
435
436	subs	I, I, #1
437	bne	zgemvt_kernel_F2X4_10
438
439
440zgemvt_kernel_F2X1:
441
442	ands	I, M , #3
443	ble	zgemvt_kernel_F2_END
444
445zgemvt_kernel_F2X1_10:
446
447	KERNEL_F2X1
448
449	subs	I, I, #1
450	bne	zgemvt_kernel_F2X1_10
451
452
453zgemvt_kernel_F2_END:
454
455	SAVE_F2
456
457	subs	J , J , #1
458	bne	zgemvt_kernel_F2X4
459
460
461zgemvt_kernel_F1_BEGIN:
462
463	ldr	J, N
464	ands	J, J, #1
465	ble	zgemvt_kernel_L999
466
467zgemvt_kernel_F1X4:
468
469	ldr	AO1, A
470
471	ldr	XO , X
472
473	INIT_F1
474
475	asrs	I, M, #2					// I = M / 4
476	ble	zgemvt_kernel_F1X1
477
478
479zgemvt_kernel_F1X4_10:
480
481	KERNEL_F1X4
482
483	subs	I, I, #1
484	bne	zgemvt_kernel_F1X4_10
485
486
487zgemvt_kernel_F1X1:
488
489	ands	I, M , #3
490	ble	zgemvt_kernel_F1_END
491
492zgemvt_kernel_F1X1_10:
493
494	KERNEL_F1X1
495
496	subs	I, I, #1
497	bne	zgemvt_kernel_F1X1_10
498
499
500zgemvt_kernel_F1_END:
501
502	SAVE_F1
503
504	b	zgemvt_kernel_L999
505
506
507
508/*************************************************************************************************************/
509
510zgemvt_kernel_S2_BEGIN:
511
512#if defined(DOUBLE)
513	lsl	INC_X, INC_X, #4				// INC_X * SIZE
514	lsl	INC_Y, INC_Y, #4				// INC_Y * SIZE
515#else
516	lsl	INC_X, INC_X, #3				// INC_X * SIZE
517	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE
518#endif
519
520	ldr	YO , Y
521
522	ldr	J, N
523	asrs	J, J, #1					// J = N / 2
524	ble	zgemvt_kernel_S1_BEGIN
525
526zgemvt_kernel_S2X4:
527
528	ldr	AO1, A
529	add	AO2, AO1, LDA
530	add	r3 , AO2, LDA
531	str	r3 , A
532
533	ldr	XO , X
534
535	INIT_S2
536
537	asrs	I, M, #2					// I = M / 4
538	ble	zgemvt_kernel_S2X1
539
540
541zgemvt_kernel_S2X4_10:
542
543	KERNEL_S2X4
544
545	subs	I, I, #1
546	bne	zgemvt_kernel_S2X4_10
547
548
549zgemvt_kernel_S2X1:
550
551	ands	I, M , #3
552	ble	zgemvt_kernel_S2_END
553
554zgemvt_kernel_S2X1_10:
555
556	KERNEL_S2X1
557
558	subs	I, I, #1
559	bne	zgemvt_kernel_S2X1_10
560
561
562zgemvt_kernel_S2_END:
563
564	SAVE_S2
565
566	subs	J , J , #1
567	bne	zgemvt_kernel_S2X4
568
569
570zgemvt_kernel_S1_BEGIN:
571
572	ldr	J, N
573	ands	J, J, #1
574	ble	zgemvt_kernel_L999
575
576zgemvt_kernel_S1X4:
577
578	ldr	AO1, A
579
580	ldr	XO , X
581
582	INIT_S1
583
584	asrs	I, M, #2					// I = M / 4
585	ble	zgemvt_kernel_S1X1
586
587
588zgemvt_kernel_S1X4_10:
589
590	KERNEL_S1X4
591
592	subs	I, I, #1
593	bne	zgemvt_kernel_S1X4_10
594
595
596zgemvt_kernel_S1X1:
597
598	ands	I, M , #3
599	ble	zgemvt_kernel_S1_END
600
601zgemvt_kernel_S1X1_10:
602
603	KERNEL_S1X1
604
605	subs	I, I, #1
606	bne	zgemvt_kernel_S1X1_10
607
608
609zgemvt_kernel_S1_END:
610
611	SAVE_S1
612
613
614
615/*************************************************************************************************************/
616
617zgemvt_kernel_L999:
618
619        sub     r3, fp, #192
620
621#if	defined(DOUBLE)
622        vldm    r3, { d8 - d15 }                                 // restore floating point registers
623#else
624        vldm    r3, { s8 - s15 }                                 // restore floating point registers
625#endif
626
627	mov	r0, #0		// set return value
628
629	sub     sp, fp, #28
630	pop     {r4 -r9 ,fp}
631	bx	lr
632
633	EPILOGUE
634
635