1/***************************************************************************
2Copyright (c) 2013, The OpenBLAS Project
3All rights reserved.
4Redistribution and use in source and binary forms, with or without
5modification, are permitted provided that the following conditions are
6met:
71. Redistributions of source code must retain the above copyright
8notice, this list of conditions and the following disclaimer.
92. Redistributions in binary form must reproduce the above copyright
10notice, this list of conditions and the following disclaimer in
11the documentation and/or other materials provided with the
12distribution.
133. Neither the name of the OpenBLAS project nor the names of
14its contributors may be used to endorse or promote products
15derived from this software without specific prior written permission.
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*****************************************************************************/
27
28/**************************************************************************************
29* 2013/10/16 Saar
30* 	 BLASTEST 		: OK
31* 	 CTEST			: OK
32* 	 TEST			: OK
33*
34**************************************************************************************/
35
36#define ASSEMBLER
37#include "common.h"
38
/* Bytes reserved below the pushed registers for locals and spill slots. */
#define STACKSIZE	256

/******************************************************
* Arguments that arrive in registers.
*******************************************************/
#define OLD_M		r0
#define OLD_N		r1
#define OLD_K		r2
#define OLD_A		r3
#define OLD_ALPHA_R	s0
#define OLD_ALPHA_I	s1

/******************************************************
* Frame slots, addressed relative to fp.
*
* The area from [fp, #-128] up to [fp, #-64] is reserved
* for saving and restoring floating point registers;
* the entry code stores s8 - s15 starting at fp - 128.
* The slots below are listed from the highest address
* to the lowest.
*******************************************************/
#define FP_ZERO_1	[fp, #-228]
#define FP_ZERO		[fp, #-232]
#define FP_ZERO_0	[fp, #-232]
#define KKK		[fp, #-240]
#define KK		[fp, #-244]
#define A		[fp, #-248]
#define LDC		[fp, #-252]
#define M		[fp, #-256]
#define N		[fp, #-260]
#define K		[fp, #-264]
#define ALPHA_I		[fp, #-272]
#define ALPHA_R		[fp, #-280]

/******************************************************
* Arguments that arrive on the caller stack. fp points
* at the last register pushed on entry, so the first
* stack argument sits at [fp, #4]. Without the VFP
* calling convention alpha and A are passed in r3 and
* on the stack, which shifts B, C, ldc and offset.
*******************************************************/
#if defined(__ARM_PCS_VFP)
#define B		[fp, #4]
#define C		[fp, #8]
#define OLD_LDC		[fp, #12]
#define OFFSET		[fp, #16]
#else
#define OLD_ALPHAR_SOFTFP	r3
#define OLD_ALPHAI_SOFTFP	[fp, #4]
#define OLD_A_SOFTFP	[fp, #8]
#define B		[fp, #12]
#define C		[fp, #16]
#define OLD_LDC		[fp, #20]
#define OFFSET		[fp, #24]
#endif

/* Loop counters: I walks M, J walks N, L walks K. */
#define I	r0
#define J	r1
#define L	r2

/* Running pointers into the packed A and B panels. */
#define AO	r5
#define BO	r6

/* Output pointers for the first and second column of the tile. */
#define CO1	r8
#define CO2	r9

/* K1 = number of k-steps for the current tile, BC = start of the current B panel. */
#define K1	r7
#define BC	r12

/* Prefetch distances in bytes. */
#define A_PRE	96
#define B_PRE	96
#define C_PRE	64
102
103
104/**************************************************************************************
105* Macro definitions
106**************************************************************************************/
107
108
/*
 * Sign selection for the complex multiply-accumulate steps.
 *
 *   KMAC_R  adds or subtracts  a_imag * b_imag  in the real accumulator
 *   KMAC_I  adds or subtracts  a_imag * b_real  in the imaginary accumulator
 *   FMAC_*  do the same for the alpha scaling in the SAVE macros
 *
 * vmla.f32 accumulates the product, vmls.f32 subtracts it.
 * FMAC_R1 and FMAC_I2 accumulate in every variant, so they are defined once.
 */
#define	FMAC_R1	vmla.f32
#define	FMAC_I2	vmla.f32

#if defined(NN) || defined(NT) || defined(TN) || defined(TT)

	#define	KMAC_R	vmls.f32
	#define	KMAC_I	vmla.f32

	#define	FMAC_R2	vmls.f32
	#define	FMAC_I1	vmla.f32

#elif defined(CN) || defined(CT)

	#define	KMAC_R	vmla.f32
	#define	KMAC_I	vmls.f32

	#define	FMAC_R2	vmls.f32
	#define	FMAC_I1	vmla.f32

#elif defined(NC) || defined(TC)

	#define	KMAC_R	vmla.f32
	#define	KMAC_I	vmls.f32

	#define	FMAC_R2	vmla.f32
	#define	FMAC_I1	vmls.f32

#else

	#define	KMAC_R	vmls.f32
	#define	KMAC_I	vmla.f32

	#define	FMAC_R2	vmla.f32
	#define	FMAC_I1	vmls.f32

#endif
150
151
.macro INIT2x2

	// Clear the eight accumulators s8 - s15 used by the KERNEL2x2 macros.
	// The value comes from the FP_ZERO frame slot and is copied from s8.
	vldr		s8 , FP_ZERO
	vmov.f32	s9 , s8
	vmov.f32	s10, s8
	vmov.f32	s11, s8
	vmov.f32	s12, s8
	vmov.f32	s13, s8
	vmov.f32	s14, s8
	vmov.f32	s15, s8

.endm
164
.macro KERNEL2x2_I

	// First k-step of a 2x2 tile: the accumulators s8 - s15 are written
	// with plain products, so no INIT2x2 is needed before this macro.
	// Loads four floats from AO into s0 - s3 and four from BO into
	// s4 - s7, advancing both pointers by 16 bytes.
	pld		[ AO, #A_PRE ]
	vldmia		AO!, { s0 - s3 }
	pld		[ BO, #B_PRE ]
	vldmia		BO!, { s4 - s7 }


	// s0/s2 times the first B pair
	vmul.f32	s8  , s0,  s4
	vmul.f32	s9  , s0,  s5
	vmul.f32	s10 , s2,  s4
	vmul.f32	s11 , s2,  s5

	// s1/s3 times the first B pair, sign chosen by KMAC_R / KMAC_I
	KMAC_R		s8  , s1,  s5
	KMAC_I		s9  , s1,  s4
	KMAC_R		s10 , s3,  s5
	KMAC_I		s11 , s3,  s4

	// s0/s2 times the second B pair
	vmul.f32	s12 , s0,  s6
	vmul.f32	s13 , s0,  s7
	vmul.f32	s14 , s2,  s6
	vmul.f32	s15 , s2,  s7

	// s1/s3 times the second B pair
	KMAC_R		s12 , s1,  s7
	KMAC_I		s13 , s1,  s6
	KMAC_R		s14 , s3,  s7
	KMAC_I		s15 , s3,  s6

.endm
194
195
196
.macro KERNEL2x2_M1

	// Accumulating k-step of a 2x2 tile, with prefetch of both panels.
	// Loads s0 - s3 from AO and s4 - s7 from BO (post-incremented by
	// 16 bytes each) and updates the accumulators s8 - s15.
	pld		[ AO, #A_PRE ]
	vldmia		AO!, { s0 - s3 }
	pld		[ BO, #B_PRE ]
	vldmia		BO!, { s4 - s7 }

	vmla.f32	s8  , s0,  s4
	vmla.f32	s9  , s0,  s5
	vmla.f32	s10 , s2,  s4
	vmla.f32	s11 , s2,  s5

	KMAC_R		s8  , s1,  s5
	KMAC_I		s9  , s1,  s4
	KMAC_R		s10 , s3,  s5
	KMAC_I		s11 , s3,  s4

	vmla.f32	s12 , s0,  s6
	vmla.f32	s13 , s0,  s7
	vmla.f32	s14 , s2,  s6
	vmla.f32	s15 , s2,  s7

	KMAC_R		s12 , s1,  s7
	KMAC_I		s13 , s1,  s6
	KMAC_R		s14 , s3,  s7
	KMAC_I		s15 , s3,  s6

.endm
225
.macro KERNEL2x2_M2

	// Accumulating k-step of a 2x2 tile without prefetch; it alternates
	// with KERNEL2x2_M1 in the unrolled loops.
	vldmia		AO!, { s0 - s3 }
	vldmia		BO!, { s4 - s7 }

	vmla.f32	s8  , s0,  s4
	vmla.f32	s9  , s0,  s5
	vmla.f32	s10 , s2,  s4
	vmla.f32	s11 , s2,  s5

	KMAC_R		s8  , s1,  s5
	KMAC_I		s9  , s1,  s4
	KMAC_R		s10 , s3,  s5
	KMAC_I		s11 , s3,  s4

	vmla.f32	s12 , s0,  s6
	vmla.f32	s13 , s0,  s7
	vmla.f32	s14 , s2,  s6
	vmla.f32	s15 , s2,  s7

	KMAC_R		s12 , s1,  s7
	KMAC_I		s13 , s1,  s6
	KMAC_R		s14 , s3,  s7
	KMAC_I		s15 , s3,  s6


.endm
253
254
.macro KERNEL2x2_E

	// Closing k-step of an unrolled 2x2 sequence. The instructions are
	// the same as in KERNEL2x2_M2: load s0 - s3 / s4 - s7 with
	// post-increment and update s8 - s15.
	vldmia		AO!, { s0 - s3 }
	vldmia		BO!, { s4 - s7 }

	vmla.f32	s8  , s0,  s4
	vmla.f32	s9  , s0,  s5
	vmla.f32	s10 , s2,  s4
	vmla.f32	s11 , s2,  s5

	KMAC_R		s8  , s1,  s5
	KMAC_I		s9  , s1,  s4
	KMAC_R		s10 , s3,  s5
	KMAC_I		s11 , s3,  s4

	vmla.f32	s12 , s0,  s6
	vmla.f32	s13 , s0,  s7
	vmla.f32	s14 , s2,  s6
	vmla.f32	s15 , s2,  s7

	KMAC_R		s12 , s1,  s7
	KMAC_I		s13 , s1,  s6
	KMAC_R		s14 , s3,  s7
	KMAC_I		s15 , s3,  s6


.endm
282
.macro KERNEL2x2_SUB

	// Single accumulating k-step of a 2x2 tile, used for the K % 8
	// remainder. The accumulators s8 - s15 must already hold valid
	// values (from INIT2x2 or from earlier k-steps).
	vldmia		AO!, { s0 - s3 }
	vldmia		BO!, { s4 - s7 }

	vmla.f32	s8  , s0,  s4
	vmla.f32	s9  , s0,  s5
	vmla.f32	s10 , s2,  s4
	vmla.f32	s11 , s2,  s5

	KMAC_R		s8  , s1,  s5
	KMAC_I		s9  , s1,  s4
	KMAC_R		s10 , s3,  s5
	KMAC_I		s11 , s3,  s4

	vmla.f32	s12 , s0,  s6
	vmla.f32	s13 , s0,  s7
	vmla.f32	s14 , s2,  s6
	vmla.f32	s15 , s2,  s7

	KMAC_R		s12 , s1,  s7
	KMAC_I		s13 , s1,  s6
	KMAC_R		s14 , s3,  s7
	KMAC_I		s15 , s3,  s6


.endm
310
.macro SAVE2x2

	// Scale the 2x2 accumulators by alpha and store them. The previous
	// contents of the output are not read; each result starts from zero.
	// Clobbers r3, s0, s1 and s4 - s7. CO1 advances by 16 bytes.
	ldr		r3  , LDC
	add		CO2 , CO1, r3			// CO2 = second column of the tile

	vldr		s0, ALPHA_R
	vldr		s1, ALPHA_I

	// first column: accumulators s8 - s11
	vldr		s4, FP_ZERO
	vmov.f32	s5, s4
	vmov.f32	s6, s4
	vmov.f32	s7, s4

	FMAC_R1		s4 , s0 , s8
	FMAC_I1		s5 , s0 , s9
	FMAC_R2		s4 , s1 , s9
	FMAC_I2		s5 , s1 , s8

	FMAC_R1		s6 , s0 , s10
	FMAC_I1		s7 , s0 , s11
	FMAC_R2		s6 , s1 , s11
	FMAC_I2		s7 , s1 , s10

	vstmia		CO1, { s4 - s7 }

	// second column: accumulators s12 - s15
	vldr		s4, FP_ZERO
	vmov.f32	s5, s4
	vmov.f32	s6, s4
	vmov.f32	s7, s4

	FMAC_R1		s4 , s0 , s12
	FMAC_I1		s5 , s0 , s13
	FMAC_R2		s4 , s1 , s13
	FMAC_I2		s5 , s1 , s12

	FMAC_R1		s6 , s0 , s14
	FMAC_I1		s7 , s0 , s15
	FMAC_R2		s6 , s1 , s15
	FMAC_I2		s7 , s1 , s14

	vstmia		CO2, { s4 - s7 }

	add		CO1, CO1, #16

.endm
356
357/******************************************************************************/
358
.macro INIT1x2

	// Clear the four accumulators used by the KERNEL1x2 macros:
	// s8/s9 for the first column and s12/s13 for the second.
	vldr		s8 , FP_ZERO
	vmov.f32	s9 , s8
	vmov.f32	s12, s8
	vmov.f32	s13, s8

.endm
367
.macro KERNEL1x2_I

	// First k-step of a 1x2 tile: s8, s9, s12 and s13 are written with
	// plain products. Reads two floats from AO and four from BO, then
	// advances AO by 8 and BO by 16 bytes.
	vldr		s0 , [ AO ]
	vldr		s1 , [ AO, #4 ]

	vldr		s4 , [ BO ]
	vldr		s5 , [ BO, #4 ]

	vldr		s6 , [ BO, #8 ]
	vldr		s7 , [ BO, #12 ]

	// first column
	vmul.f32	s8  , s0,  s4
	KMAC_R		s8  , s1,  s5
	vmul.f32	s9  , s0,  s5
	KMAC_I		s9  , s1,  s4

	// second column
	vmul.f32	s12 , s0,  s6
	KMAC_R		s12 , s1,  s7
	vmul.f32	s13 , s0,  s7
	KMAC_I		s13 , s1,  s6

	add		BO , BO, #16
	add		AO , AO, #8

.endm
393
394
395
.macro KERNEL1x2_M1

	// Accumulating k-step of a 1x2 tile. Reads s0/s1 from AO and
	// s4 - s7 from BO, updates s8, s9, s12, s13, then advances AO by 8
	// and BO by 16 bytes.
	vldr		s0 , [ AO ]
	vldr		s1 , [ AO, #4 ]

	vldr		s4 , [ BO ]
	vldr		s5 , [ BO, #4 ]
	vldr		s6 , [ BO, #8 ]
	vldr		s7 , [ BO, #12 ]

	vmla.f32	s8  , s0,  s4
	KMAC_R		s8  , s1,  s5
	vmla.f32	s9  , s0,  s5
	KMAC_I		s9  , s1,  s4

	vmla.f32	s12 , s0,  s6
	KMAC_R		s12 , s1,  s7
	vmla.f32	s13 , s0,  s7
	KMAC_I		s13 , s1,  s6

	add		BO , BO, #16
	add		AO , AO, #8

.endm
420
.macro KERNEL1x2_M2

	// Accumulating k-step of a 1x2 tile; the instruction sequence is
	// the same as in KERNEL1x2_M1.
	vldr		s0 , [ AO ]
	vldr		s1 , [ AO, #4 ]

	vldr		s4 , [ BO ]
	vldr		s5 , [ BO, #4 ]
	vldr		s6 , [ BO, #8 ]
	vldr		s7 , [ BO, #12 ]

	vmla.f32	s8  , s0,  s4
	KMAC_R		s8  , s1,  s5
	vmla.f32	s9  , s0,  s5
	KMAC_I		s9  , s1,  s4

	vmla.f32	s12 , s0,  s6
	KMAC_R		s12 , s1,  s7
	vmla.f32	s13 , s0,  s7
	KMAC_I		s13 , s1,  s6

	add		BO , BO, #16
	add		AO , AO, #8


.endm
446
447
.macro KERNEL1x2_E

	// Closing k-step of an unrolled 1x2 sequence: load, accumulate into
	// s8, s9, s12, s13 and advance AO by 8 and BO by 16 bytes.
	vldr		s0 , [ AO ]
	vldr		s1 , [ AO, #4 ]

	vldr		s4 , [ BO ]
	vldr		s5 , [ BO, #4 ]
	vldr		s6 , [ BO, #8 ]
	vldr		s7 , [ BO, #12 ]

	vmla.f32	s8  , s0,  s4
	KMAC_R		s8  , s1,  s5
	vmla.f32	s9  , s0,  s5
	KMAC_I		s9  , s1,  s4

	vmla.f32	s12 , s0,  s6
	KMAC_R		s12 , s1,  s7
	vmla.f32	s13 , s0,  s7
	KMAC_I		s13 , s1,  s6

	add		BO , BO, #16
	add		AO , AO, #8

.endm
472
.macro KERNEL1x2_SUB

	// Single accumulating k-step of a 1x2 tile. The accumulators
	// s8, s9, s12 and s13 must already be initialised (see INIT1x2).
	vldr		s0 , [ AO ]
	vldr		s1 , [ AO, #4 ]

	vldr		s4 , [ BO ]
	vldr		s5 , [ BO, #4 ]
	vldr		s6 , [ BO, #8 ]
	vldr		s7 , [ BO, #12 ]

	vmla.f32	s8  , s0,  s4
	KMAC_R		s8  , s1,  s5
	vmla.f32	s9  , s0,  s5
	KMAC_I		s9  , s1,  s4

	vmla.f32	s12 , s0,  s6
	KMAC_R		s12 , s1,  s7
	vmla.f32	s13 , s0,  s7
	KMAC_I		s13 , s1,  s6

	add		BO , BO, #16			// two B pairs consumed
	add		AO , AO, #8			// one A pair consumed


.endm
498
499
.macro SAVE1x2

	// Scale the 1x2 accumulators by alpha and store one pair to each of
	// the two output columns. The output is written without being read.
	// Clobbers r3, s0, s1, s4 and s5. CO1 advances by 8 bytes.
	ldr		r3  , LDC
	add		CO2 , CO1, r3			// CO2 = second column of the tile

	vldr		s0, ALPHA_R
	vldr		s1, ALPHA_I

	// first column: accumulators s8/s9
	vldr		s4, FP_ZERO
	vmov.f32	s5, s4

	FMAC_R1		s4 , s0 , s8
	FMAC_I1		s5 , s0 , s9
	FMAC_R2		s4 , s1 , s9
	FMAC_I2		s5 , s1 , s8

	vstmia		CO1, { s4 - s5 }

	// second column: accumulators s12/s13
	vldr		s4, FP_ZERO
	vmov.f32	s5, s4

	FMAC_R1		s4 , s0 , s12
	FMAC_I1		s5 , s0 , s13
	FMAC_R2		s4 , s1 , s13
	FMAC_I2		s5 , s1 , s12

	vstmia		CO2, { s4 - s5 }

	add		CO1, CO1, #8

.endm
531
532
533/******************************************************************************/
534
.macro INIT2x1

	// Clear the four accumulators s8 - s11 used by the KERNEL2x1 macros.
	vldr		s8 , FP_ZERO
	vmov.f32	s9 , s8
	vmov.f32	s10, s8
	vmov.f32	s11, s8

.endm
543
.macro KERNEL2x1_I

	// First k-step of a 2x1 tile: s8 - s11 are written with plain
	// products. Reads four floats from AO and two from BO, then
	// advances AO by 16 and BO by 8 bytes.
	vldr		s0 , [ AO ]
	vldr		s1 , [ AO, #4 ]

	vldr		s2 , [ AO, #8 ]
	vldr		s3 , [ AO, #12 ]

	vldr		s4 , [ BO ]
	vldr		s5 , [ BO, #4 ]

	// first row
	vmul.f32	s8  , s0,  s4
	KMAC_R		s8  , s1,  s5
	vmul.f32	s9  , s0,  s5
	KMAC_I		s9  , s1,  s4

	// second row
	vmul.f32	s10 , s2,  s4
	KMAC_R		s10 , s3,  s5
	vmul.f32	s11 , s2,  s5
	KMAC_I		s11 , s3,  s4

	add		BO , BO, #8
	add		AO , AO, #16

.endm
569
570
571
.macro KERNEL2x1_M1

	// Accumulating k-step of a 2x1 tile. Reads s0 - s3 from AO and
	// s4/s5 from BO, updates s8 - s11, then advances AO by 16 and BO
	// by 8 bytes.
	vldr		s0 , [ AO ]
	vldr		s1 , [ AO, #4 ]
	vldr		s2 , [ AO, #8 ]
	vldr		s3 , [ AO, #12 ]

	vldr		s4 , [ BO ]
	vldr		s5 , [ BO, #4 ]

	vmla.f32	s8  , s0,  s4
	KMAC_R		s8  , s1,  s5
	vmla.f32	s9  , s0,  s5
	KMAC_I		s9  , s1,  s4

	vmla.f32	s10 , s2,  s4
	KMAC_R		s10 , s3,  s5
	vmla.f32	s11 , s2,  s5
	KMAC_I		s11 , s3,  s4

	add		BO , BO, #8
	add		AO , AO, #16

.endm
596
.macro KERNEL2x1_M2

	// Accumulating k-step of a 2x1 tile; the instruction sequence is
	// the same as in KERNEL2x1_M1. The two alternate in the unrolled loops.
	vldr		s0 , [ AO ]
	vldr		s1 , [ AO, #4 ]
	vldr		s2 , [ AO, #8 ]
	vldr		s3 , [ AO, #12 ]

	vldr		s4 , [ BO ]
	vldr		s5 , [ BO, #4 ]

	vmla.f32	s8  , s0,  s4
	KMAC_R		s8  , s1,  s5
	vmla.f32	s9  , s0,  s5
	KMAC_I		s9  , s1,  s4

	vmla.f32	s10 , s2,  s4
	KMAC_R		s10 , s3,  s5
	vmla.f32	s11 , s2,  s5
	KMAC_I		s11 , s3,  s4

	add		BO , BO, #8
	add		AO , AO, #16


.endm
622
623
.macro KERNEL2x1_E

	// Closing k-step of an unrolled 2x1 sequence: load, accumulate into
	// s8 - s11 and advance AO by 16 and BO by 8 bytes.
	vldr		s0 , [ AO ]
	vldr		s1 , [ AO, #4 ]
	vldr		s2 , [ AO, #8 ]
	vldr		s3 , [ AO, #12 ]

	vldr		s4 , [ BO ]
	vldr		s5 , [ BO, #4 ]

	vmla.f32	s8  , s0,  s4
	KMAC_R		s8  , s1,  s5
	vmla.f32	s9  , s0,  s5
	KMAC_I		s9  , s1,  s4

	vmla.f32	s10 , s2,  s4
	KMAC_R		s10 , s3,  s5
	vmla.f32	s11 , s2,  s5
	KMAC_I		s11 , s3,  s4

	add		BO , BO, #8
	add		AO , AO, #16

.endm
648
.macro KERNEL2x1_SUB

	// Single accumulating k-step of a 2x1 tile, used for the K % 8
	// remainder. The accumulators s8 - s11 must already hold valid values.
	vldr		s0 , [ AO ]
	vldr		s1 , [ AO, #4 ]
	vldr		s2 , [ AO, #8 ]
	vldr		s3 , [ AO, #12 ]

	vldr		s4 , [ BO ]
	vldr		s5 , [ BO, #4 ]

	vmla.f32	s8  , s0,  s4
	KMAC_R		s8  , s1,  s5
	vmla.f32	s9  , s0,  s5
	KMAC_I		s9  , s1,  s4

	vmla.f32	s10 , s2,  s4
	KMAC_R		s10 , s3,  s5
	vmla.f32	s11 , s2,  s5
	KMAC_I		s11 , s3,  s4

	add		BO , BO, #8			// one B pair consumed
	add		AO , AO, #16			// two A pairs consumed


.endm
674
675
.macro SAVE2x1

	// Scale the 2x1 accumulators s8 - s11 by alpha and store four floats
	// at CO1. The output is written without being read.
	// Clobbers s0, s1 and s4 - s7. CO1 advances by 16 bytes.
	vldr		s0, ALPHA_R
	vldr		s1, ALPHA_I

	vldr		s4, FP_ZERO
	vmov.f32	s5, s4
	vmov.f32	s6, s4
	vmov.f32	s7, s4

	// first row
	FMAC_R1		s4 , s0 , s8
	FMAC_I1		s5 , s0 , s9
	FMAC_R2		s4 , s1 , s9
	FMAC_I2		s5 , s1 , s8

	// second row
	FMAC_R1		s6 , s0 , s10
	FMAC_I1		s7 , s0 , s11
	FMAC_R2		s6 , s1 , s11
	FMAC_I2		s7 , s1 , s10

	vstmia		CO1, { s4 - s7 }

	add		CO1, CO1, #16

.endm
701
702
703/******************************************************************************/
704
.macro INIT1x1

	// Clear the two accumulators s8/s9 used by the KERNEL1x1 macros.
	vldr		s8 , FP_ZERO
	vmov.f32	s9 , s8

.endm
711
.macro KERNEL1x1_I

	// First k-step of a 1x1 tile: s8/s9 are written with plain products.
	// Reads two floats from AO and two from BO, then advances both
	// pointers by 8 bytes.
	vldr		s0 , [ AO ]
	vldr		s1 , [ AO, #4 ]

	vldr		s4 , [ BO ]
	vldr		s5 , [ BO, #4 ]

	vmul.f32	s8  , s0,  s4
	KMAC_R		s8  , s1,  s5
	vmul.f32	s9  , s0,  s5
	KMAC_I		s9  , s1,  s4

	add		BO , BO, #8
	add		AO , AO, #8

.endm
729
730
731
.macro KERNEL1x1_M1

	// Accumulating k-step of a 1x1 tile. Reads s0/s1 from AO and s4/s5
	// from BO, updates s8/s9, then advances both pointers by 8 bytes.
	vldr		s0 , [ AO ]
	vldr		s1 , [ AO, #4 ]

	vldr		s4 , [ BO ]
	vldr		s5 , [ BO, #4 ]

	vmla.f32	s8  , s0,  s4
	KMAC_R		s8  , s1,  s5
	vmla.f32	s9  , s0,  s5
	KMAC_I		s9  , s1,  s4

	add		BO , BO, #8
	add		AO , AO, #8

.endm
749
.macro KERNEL1x1_M2

	// Accumulating k-step of a 1x1 tile; the instruction sequence is
	// the same as in KERNEL1x1_M1.
	vldr		s0 , [ AO ]
	vldr		s1 , [ AO, #4 ]

	vldr		s4 , [ BO ]
	vldr		s5 , [ BO, #4 ]

	vmla.f32	s8  , s0,  s4
	KMAC_R		s8  , s1,  s5
	vmla.f32	s9  , s0,  s5
	KMAC_I		s9  , s1,  s4

	add		BO , BO, #8
	add		AO , AO, #8


.endm
768
769
.macro KERNEL1x1_E

	// Closing k-step of an unrolled 1x1 sequence: load, accumulate into
	// s8/s9 and advance AO and BO by 8 bytes each.
	vldr		s0 , [ AO ]
	vldr		s1 , [ AO, #4 ]

	vldr		s4 , [ BO ]
	vldr		s5 , [ BO, #4 ]

	vmla.f32	s8  , s0,  s4
	KMAC_R		s8  , s1,  s5
	vmla.f32	s9  , s0,  s5
	KMAC_I		s9  , s1,  s4

	add		BO , BO, #8
	add		AO , AO, #8

.endm
787
.macro KERNEL1x1_SUB

	// Single accumulating k-step of a 1x1 tile. The accumulators s8/s9
	// must already be initialised (see INIT1x1).
	vldr		s0 , [ AO ]
	vldr		s1 , [ AO, #4 ]

	vldr		s4 , [ BO ]
	vldr		s5 , [ BO, #4 ]

	vmla.f32	s8  , s0,  s4
	KMAC_R		s8  , s1,  s5
	vmla.f32	s9  , s0,  s5
	KMAC_I		s9  , s1,  s4

	add		BO , BO, #8			// one B pair consumed
	add		AO , AO, #8			// one A pair consumed


.endm
806
807
.macro SAVE1x1

	// Scale the accumulator pair s8/s9 by alpha and store two floats at
	// CO1. The output is written without being read.
	// Clobbers s0, s1, s4 and s5. CO1 advances by 8 bytes.
	vldr		s0, ALPHA_R
	vldr		s1, ALPHA_I

	vldr		s4, FP_ZERO
	vmov.f32	s5, s4

	FMAC_R1		s4 , s0 , s8
	FMAC_I1		s5 , s0 , s9
	FMAC_R2		s4 , s1 , s9
	FMAC_I2		s5 , s1 , s8

	vstmia		CO1, { s4 - s5 }

	add		CO1, CO1, #8

.endm
826
827/**************************************************************************************
828* End of macro definitions
829**************************************************************************************/
830
831	PROLOGUE
832
833	.align 5
834
835	push	{r4 - r9, fp}
836	add	fp, sp, #24
837	sub	sp, sp, #STACKSIZE				// reserve stack
838
839#if !defined(__ARM_PCS_VFP)
840	vmov	OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
841	vldr	OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
842	ldr	OLD_A, OLD_A_SOFTFP
843#endif
844	str	OLD_M, M
845	str	OLD_N, N
846	str	OLD_K, K
847	str	OLD_A, A
848	vstr	OLD_ALPHA_R, ALPHA_R
849	vstr	OLD_ALPHA_I, ALPHA_I
850
851	sub	r3, fp, #128
852	vstm	r3, { s8 - s15} 				// store floating point registers
853
854        movs    r4, #0
855        str     r4, FP_ZERO
856        str     r4, FP_ZERO_1
857
858	ldr	r3, OLD_LDC
859	lsl	r3, r3, #3					// ldc = ldc * 4 * 2
860	str	r3, LDC
861
862        ldr     r3, OFFSET
863#ifndef LEFT
864        neg     r3 , r3
865#endif
866        str     r3 , KK
867
868	ldr	BC, B
869
870	ldr	J, N
871	asrs	J, J, #1					// J = J / 2
872	ble	_L1_BEGIN
873
/*
 * Two-column pass: one iteration per pair of output columns (J counts
 * N / 2 down to zero). Each iteration handles the rows two at a time
 * (2x2 tiles) and then a single leftover row (1x2 tile) when M is odd.
 *
 * Flag usage: tst, ands and flag-setting shifts do not write V, so the
 * branches after them use beq, or an explicit cmp before a signed
 * condition, instead of relying on a V flag left by earlier code.
 */
_L2_BEGIN:

	ldr	CO1, C						// CO1 = C
	ldr	r4 , LDC
	lsl	r4 , r4 , #1					// LDC * 2
	add	r3 , r4, CO1
	str	r3 , C						// C now points at the next column pair

#if defined(LEFT)
	ldr	r3 , OFFSET
	str	r3 , KK
#endif

	ldr	AO, A						// AO = A
	pld	[AO , #A_PRE-64]
	pld	[AO , #A_PRE-32]



_L2_M2_BEGIN:

	ldr	I, M
	asr	I, I, #1					// I = M / 2
	cmp	I, #0
	ble	_L2_M1_BEGIN

_L2_M2_20:

#if  (defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))

	mov	BO, BC
#else
	// skip the first KK k-steps of both panels
	mov	BO, BC
	ldr	r3 , KK
	lsls	r4 , r3 , #4					// 2 * 4 * 2 float values
	add	BO , BO , r4
	lsls	r4 , r3 , #4					// 2 * 4 * 2 float values
	add	AO , AO , r4

#endif

	// K1 = number of k-steps for this tile, also kept in KKK for TRMM
#ifndef TRMMKERNEL
	ldr	K1, K
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	ldr	K1, K
	ldr	r3, KK
	sub	K1, K1, r3
	str	K1, KKK
#else
	ldr	K1, KK
#ifdef LEFT
	add	K1, K1, #2					// number of values in AO
#else
	add	K1, K1, #2					// number of values in BO
#endif
	str	K1, KKK
#endif

	asrs	L , K1, #3					// L = K1 / 8
	cmp	L , #3
	blt	_L2_M2_30					// fewer than three groups of 8: short paths
	.align 5



	// first group of 8 k-steps; KERNEL2x2_I initialises the accumulators
	KERNEL2x2_I
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	sub	L, L, #2					// first and last group run outside the loop

_L2_M2_22:

	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	subs	L, L, #1
	bgt	_L2_M2_22

	// last group of 8 k-steps
	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_E

	b	 _L2_M2_44


_L2_M2_30:
	tst	L, #3
	beq	_L2_M2_40					// no full group of 8

	tst	L, #2
	beq	_L2_M2_32					// exactly one group

	// two groups of 8 k-steps
	KERNEL2x2_I
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2


	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_E

	b	 _L2_M2_44

_L2_M2_32:

	tst	L, #1
	beq	_L2_M2_40

	// one group of 8 k-steps
	KERNEL2x2_I
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_E

	b	 _L2_M2_44


_L2_M2_40:

	INIT2x2							// no KERNEL2x2_I ran: clear the accumulators


_L2_M2_44:

	ands	L , K1, #7					// L = K1 % 8
	beq	_L2_M2_100

_L2_M2_46:

	KERNEL2x2_SUB

	subs	L, L, #1
	bne	_L2_M2_46

_L2_M2_100:

	SAVE2x2

#if  (defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	// step both panels over the K - KKK k-steps that were not used
	ldr	r3 , K
	ldr	r4 , KKK
	sub	r3 , r3 , r4
	lsls	r4 , r3 , #4					// 2 * 4 * 2 float values
	add	BO , BO , r4
	lsls	r4 , r3 , #4					// 2 * 4 * 2 float values
	add	AO , AO , r4
#endif

#if defined(LEFT)
	ldr	r3 , KK
	add	r3 , r3 , #2					// number of values in AO
	str	r3 , KK
#endif


_L2_M2_END:

	subs	I, I, #1
	bne	_L2_M2_20


_L2_M1_BEGIN:

	ldr	I, M
	tst	I, #1						// odd M leaves one row
	beq	_L2_END

_L2_M1_20:

	INIT1x2

#if  (defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))

	mov	BO, BC
#else
	mov	BO, BC
	ldr	r3 , KK
	lsls	r4 , r3 , #4					// 2 * 4 * 2 float values
	add	BO , BO , r4
	lsls	r4 , r3 , #3					// 1 * 4 * 2 float values
	add	AO , AO , r4

#endif

#ifndef TRMMKERNEL
	ldr	K1, K
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	ldr	K1, K
	ldr	r3, KK
	sub	K1, K1, r3
	str	K1, KKK
#else
	ldr	K1, KK
#ifdef LEFT
	add	K1, K1, #1					// number of values in AO
#else
	add	K1, K1, #2					// number of values in BO
#endif
	str	K1, KKK
#endif

	asr	L , K1, #3					// L = K1 / 8
	cmp	L , #0
	ble	_L2_M1_40

_L2_M1_22:

	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB

	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB

	subs	L, L, #1
	bgt	_L2_M1_22


_L2_M1_40:

	ands	L , K1, #7					// L = K1 % 8
	beq	_L2_M1_100

_L2_M1_42:

	KERNEL1x2_SUB

	subs	L, L, #1
	bgt	_L2_M1_42

_L2_M1_100:

	SAVE1x2

#if  (defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	ldr	r3 , K
	ldr	r4 , KKK
	sub	r3 , r3 , r4
	lsls	r4 , r3 , #4					// 2 * 4 * 2 float values
	add	BO , BO , r4
	lsls	r4 , r3 , #3					// 1 * 4 * 2 float values
	add	AO , AO , r4
#endif

#if defined(LEFT)
	ldr	r3 , KK
	add	r3 , r3 , #1					// number of values in AO
	str	r3 , KK
#endif



_L2_END:

	mov	r3, BC
	ldr	r4, K
	lsl	r4, r4, #4					// k * 2 * 4 * 2
	add	r3, r3, r4					// B = B + K * 2 * 8
	mov	BC, r3

#if !defined(LEFT)
	ldr	r3 , KK
	add	r3 , r3 , #2					// number of values in BO
	str	r3 , KK
#endif

	subs	J , #1						// j--
	bgt	_L2_BEGIN
1183
1184
1185
1186/*********************************************************************************************/
1187
/*
 * Remainder-column pass: runs once when N is odd. The rows are handled
 * two at a time (2x1 tiles) and then a single leftover row (1x1 tile)
 * when M is odd.
 *
 * Flag usage: tst, ands and flag-setting shifts do not write V, so the
 * branches after them use beq, or an explicit cmp before a signed
 * condition, instead of relying on a V flag left by earlier code.
 */
_L1_BEGIN:

	ldr	J , N
	tst	J , #1
	beq	_L999						// even N: nothing left to do


	ldr	CO1, C						// CO1 = C
	ldr	r4 , LDC
	add	r3 , r4, CO1
	str	r3 , C						// store C

#if defined(LEFT)
	ldr	r3 , OFFSET
	str	r3 , KK
#endif

	ldr	AO, A						// AO = A

_L1_M2_BEGIN:

	ldr	I, M
	asr	I, I, #1					// I = M / 2
	cmp	I, #0
	ble	_L1_M1_BEGIN

_L1_M2_20:

#if  (defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))

	mov	BO, BC
#else
	// skip the first KK k-steps of both panels
	mov	BO, BC
	ldr	r3 , KK
	lsls	r4 , r3 , #3					// 1 * 4 * 2 float values
	add	BO , BO , r4
	lsls	r4 , r3 , #4					// 2 * 4 * 2 float values
	add	AO , AO , r4

#endif

	// K1 = number of k-steps for this tile, also kept in KKK for TRMM
#ifndef TRMMKERNEL
	ldr	K1, K
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	ldr	K1, K
	ldr	r3, KK
	sub	K1, K1, r3
	str	K1, KKK
#else
	ldr	K1, KK
#ifdef LEFT
	add	K1, K1, #2					// number of values in AO
#else
	add	K1, K1, #1					// number of values in BO
#endif
	str	K1, KKK
#endif

	asrs	L , K1, #3					// L = K1 / 8
	cmp	L , #3
	blt	_L1_M2_30					// fewer than three groups of 8: short paths
	.align 5



	// first group of 8 k-steps; KERNEL2x1_I initialises the accumulators
	KERNEL2x1_I
	KERNEL2x1_M2
	KERNEL2x1_M1
	KERNEL2x1_M2

	KERNEL2x1_M1
	KERNEL2x1_M2
	KERNEL2x1_M1
	KERNEL2x1_M2

	sub	L, L, #2					// first and last group run outside the loop

_L1_M2_22:

	KERNEL2x1_M1
	KERNEL2x1_M2
	KERNEL2x1_M1
	KERNEL2x1_M2

	KERNEL2x1_M1
	KERNEL2x1_M2
	KERNEL2x1_M1
	KERNEL2x1_M2

	subs	L, L, #1
	bgt	_L1_M2_22

	// last group of 8 k-steps
	KERNEL2x1_M1
	KERNEL2x1_M2
	KERNEL2x1_M1
	KERNEL2x1_M2

	KERNEL2x1_M1
	KERNEL2x1_M2
	KERNEL2x1_M1
	KERNEL2x1_E

	b	 _L1_M2_44


_L1_M2_30:
	tst	L, #3
	beq	_L1_M2_40					// no full group of 8

	tst	L, #2
	beq	_L1_M2_32					// exactly one group

	// two groups of 8 k-steps
	KERNEL2x1_I
	KERNEL2x1_M2
	KERNEL2x1_M1
	KERNEL2x1_M2

	KERNEL2x1_M1
	KERNEL2x1_M2
	KERNEL2x1_M1
	KERNEL2x1_M2

	KERNEL2x1_M1
	KERNEL2x1_M2
	KERNEL2x1_M1
	KERNEL2x1_M2


	KERNEL2x1_M1
	KERNEL2x1_M2
	KERNEL2x1_M1
	KERNEL2x1_E

	b	 _L1_M2_44

_L1_M2_32:

	tst	L, #1
	beq	_L1_M2_40

	// one group of 8 k-steps
	KERNEL2x1_I
	KERNEL2x1_M2
	KERNEL2x1_M1
	KERNEL2x1_M2

	KERNEL2x1_M1
	KERNEL2x1_M2
	KERNEL2x1_M1
	KERNEL2x1_E

	b	 _L1_M2_44


_L1_M2_40:

	INIT2x1							// no KERNEL2x1_I ran: clear the accumulators


_L1_M2_44:

	ands	L , K1, #7					// L = K1 % 8
	beq	_L1_M2_100

_L1_M2_46:

	KERNEL2x1_SUB

	subs	L, L, #1
	bne	_L1_M2_46

_L1_M2_100:

	SAVE2x1

#if  (defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	// step both panels over the K - KKK k-steps that were not used
	ldr	r3 , K
	ldr	r4 , KKK
	sub	r3 , r3 , r4
	lsls	r4 , r3 , #3					// 1 * 4 * 2 float values
	add	BO , BO , r4
	lsls	r4 , r3 , #4					// 2 * 4 * 2 float values
	add	AO , AO , r4
#endif

#if defined(LEFT)
	ldr	r3 , KK
	add	r3 , r3 , #2					// number of values in AO
	str	r3 , KK
#endif



_L1_M2_END:

	subs	I, I, #1
	bne	_L1_M2_20


_L1_M1_BEGIN:

	ldr	I, M
	tst	I, #1						// odd M leaves one row
	beq	_L1_END

_L1_M1_20:

	INIT1x1

#if  (defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))

	mov	BO, BC
#else
	mov	BO, BC
	ldr	r3 , KK
	lsls	r4 , r3 , #3					// 1 * 4 * 2 float values
	add	BO , BO , r4
	lsls	r4 , r3 , #3					// 1 * 4 * 2 float values
	add	AO , AO , r4

#endif

#ifndef TRMMKERNEL
	ldr	K1, K
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	ldr	K1, K
	ldr	r3, KK
	sub	K1, K1, r3
	str	K1, KKK
#else
	ldr	K1, KK
#ifdef LEFT
	add	K1, K1, #1					// number of values in AO
#else
	add	K1, K1, #1					// number of values in BO
#endif
	str	K1, KKK
#endif

	asr	L , K1, #3					// L = K1 / 8
	cmp	L , #0
	ble	_L1_M1_40

_L1_M1_22:

	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB

	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB

	subs	L, L, #1
	bgt	_L1_M1_22


_L1_M1_40:

	ands	L , K1, #7					// L = K1 % 8
	beq	_L1_M1_100

_L1_M1_42:

	KERNEL1x1_SUB

	subs	L, L, #1
	bgt	_L1_M1_42

_L1_M1_100:

	SAVE1x1


_L1_END:
1465
1466
1467
/* Common exit: restore the saved state and return 0 in r0. */
_L999:

	movs	r0, #0						// set return value

	sub	r3, fp, #128
	vldmia	r3, { s8 - s15 }				// reload the registers saved on entry

	sub	sp, fp, #24					// release the frame: sp = address of the saved r4
	pop	{r4 - r9, fp}
	bx	lr

	EPILOGUE
1479
1480