1/***************************************************************************
2Copyright (c) 2013, The OpenBLAS Project
3All rights reserved.
4Redistribution and use in source and binary forms, with or without
5modification, are permitted provided that the following conditions are
6met:
71. Redistributions of source code must retain the above copyright
8notice, this list of conditions and the following disclaimer.
92. Redistributions in binary form must reproduce the above copyright
10notice, this list of conditions and the following disclaimer in
11the documentation and/or other materials provided with the
12distribution.
133. Neither the name of the OpenBLAS project nor the names of
14its contributors may be used to endorse or promote products
15derived from this software without specific prior written permission.
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*****************************************************************************/
27
28/**************************************************************************************
29* 2013/10/16 Saar
30* 	 BLASTEST 		: OK
31* 	 CTEST			: OK
32* 	 TEST			: OK
33*
34**************************************************************************************/
35
36#define ASSEMBLER
37#include "common.h"
38
/* Bytes the prologue subtracts from sp for locals and the floating point
   register save area. */
#define STACKSIZE 256

/* Arguments as they arrive in registers; the prologue spills them to the
   stack slots below before r0-r3 are reused. */
#define	OLD_M	r0
#define	OLD_N	r1
#define	OLD_K	r2
#define	OLD_A	r3
#define OLD_ALPHA_R s0
#define OLD_ALPHA_I s1

/******************************************************
* [fp, #-128] up to (not including) [fp, #-32] is
* reserved for store and restore of floating point
* registers: the prologue saves s8-s31 there
* (24 words = 96 bytes starting at fp-128).
*******************************************************/

/* Local variables, addressed relative to fp. The prologue pushes seven
   words, sets fp = sp + 24 and then lowers sp by STACKSIZE, so sp ends up
   at fp - 280 and every slot below lies inside the reserved area. */
#define KKK     [fp, #-240]
#define KK      [fp, #-244 ]
#define A	[fp, #-248 ]
#define LDC	[fp, #-252 ]
#define M	[fp, #-256 ]
#define N	[fp, #-260 ]
#define K	[fp, #-264 ]

/* Two words the prologue fills with integer 0; the INIT macros load
   FP_ZERO as a single-precision zero. FP_ZERO and FP_ZERO_0 name the
   same slot. */
#define FP_ZERO [fp, #-236]
#define FP_ZERO_0 [fp, #-236]
#define FP_ZERO_1 [fp, #-232]

/* Spilled copies of alpha, reloaded by the SAVE macros. */
#define ALPHA_I	[fp, #-272]
#define ALPHA_R	[fp, #-280]

/* Stack-passed arguments. With fp = (sp at entry) - 4, the first stack
   argument is at [fp, #4]. When __ARM_PCS_VFP is not defined the prologue
   takes the first alpha word from r3 and reads the second alpha word and A
   from the stack, which shifts the remaining arguments by two words. */
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR_SOFTFP	r3
#define OLD_ALPHAI_SOFTFP	[fp, #4]
#define OLD_A_SOFTFP	[fp, #8 ]
#define B	[fp, #12 ]
#define C	[fp, #16 ]
#define OLD_LDC	[fp, #20 ]
#define OFFSET  [fp, #24 ]
#else
#define B	[fp, #4 ]
#define C	[fp, #8 ]
#define OLD_LDC	[fp, #12 ]
#define OFFSET  [fp, #16 ]
#endif

/* Loop counters. They alias r0-r2, i.e. the same registers as
   OLD_M / OLD_N / OLD_K. */
#define I	r0
#define J	r1
#define L	r2

/* Running read pointers into the A and B data. */
#define	AO	r5
#define	BO	r6

/* Output pointers: CO1 is the current position in C, CO2 = CO1 + LDC
   (computed inside SAVE2x2 / SAVE1x2). */
#define	CO1	r8
#define	CO2	r9

/* K1 holds the inner-loop length for the current tile; BC holds the start
   of the B data for the current column group. */
#define K1	r7
#define BC	r12

/* Prefetch distances in bytes used with pld. C_PRE is not referenced by
   the code visible in this file. */
#define A_PRE	96
#define B_PRE	96
#define C_PRE	64
100
/*
 * Instruction selection for the SAVE* macros, fixed at preprocessing time
 * by the variant symbols NN/NT/TN/TT, CN/CT, NC/TC; any other variant
 * takes the final #else branch.
 * NOTE(review): the variant names presumably encode conjugation of the
 * two operands - confirm against the build rules.
 *
 *   FADD_R / FADD_I    merge the two partial-product accumulators of one
 *                      result component. SAVE* invokes them as
 *                      "FADD_x sN, sN+8, sN", so fsubs produces
 *                      (sN+8) - sN and fadds produces (sN+8) + sN.
 *   FMAC_R1 / FMAC_I1  first alpha product, written to a fresh register:
 *                      fmuls gives a*b, vnmul.f32 gives -(a*b).
 *   FMAC_R2 / FMAC_I2  second alpha product, folded into that register:
 *                      fmacs adds a*b, vmls.f32 subtracts a*b.
 */
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)

	#define	FADD_R	fsubs
	#define	FADD_I	fadds

	#define	FMAC_R1	vnmul.f32
	#define	FMAC_R2	vmls.f32
	#define	FMAC_I1	fmuls
	#define	FMAC_I2	vmls.f32

#elif defined(CN) || defined(CT)

	#define	FADD_R	fadds
	#define	FADD_I	fsubs

	#define	FMAC_R1	fmuls
	#define	FMAC_R2	fmacs
	#define	FMAC_I1	vnmul.f32
	#define	FMAC_I2	fmacs

#elif defined(NC) || defined(TC)

	#define	FADD_R	fadds
	#define	FADD_I	fsubs

	#define	FMAC_R1	fmuls
	#define	FMAC_R2	vmls.f32
	#define	FMAC_I1	fmuls
	#define	FMAC_I2	fmacs

#else

	/* Remaining variants. */
	#define	FADD_R  fsubs
	#define	FADD_I	fadds

	#define	FMAC_R1	vnmul.f32
	#define	FMAC_R2	fmacs
	#define	FMAC_I1	vnmul.f32
	#define	FMAC_I2	vmls.f32

#endif
142
143
144
145/**************************************************************************************
146* Macro definitions
147**************************************************************************************/
148
.macro INIT2x2

	/* Zero all sixteen accumulators (s16-s31) of the 2x2 tile.
	   s16 is loaded from the zero word at FP_ZERO and copied to the
	   others, written here pair by pair (sN, sN+8). */
	vldr			s16 , FP_ZERO
	vmov.f32		s24, s16

	vmov.f32		s17, s16
	vmov.f32		s25, s16

	vmov.f32		s18, s16
	vmov.f32		s26, s16

	vmov.f32		s19, s16
	vmov.f32		s27, s16

	vmov.f32		s20, s16
	vmov.f32		s28, s16

	vmov.f32		s21, s16
	vmov.f32		s29, s16

	vmov.f32		s22, s16
	vmov.f32		s30, s16

	vmov.f32		s23, s16
	vmov.f32		s31, s16

.endm
169
.macro KERNEL2x2_I
	/* First step of the pipelined 2x2 loop.
	   Loads s0-s3 from AO and s8-s11 from BO and INITIALISES the
	   accumulators s16-s31 with plain products (no prior zeroing needed).
	   Interleaved with the multiplies it also preloads s4-s7 / s12-s15,
	   the operands consumed by the following KERNEL2x2_M2.
	   AO and BO each advance by 32 bytes. */
	pld	[ AO , #A_PRE ]
	pld	[ BO , #B_PRE ]
	vldmia.f32 AO!, { s0 - s1 }
	vldmia.f32 BO!, { s8 - s9 }

	/* (s0,s1) x (s8,s9) */
	vmul.f32	s16, s0, s8
	vmul.f32	s24, s1, s9
	vldmia.f32 AO!, { s2 - s3 }
	vmul.f32	s17, s0, s9
	vmul.f32	s25, s1, s8

	/* (s2,s3) x (s8,s9) */
	vldmia.f32 BO!, { s10 - s11 }
	vmul.f32	s18, s2, s8
	vmul.f32	s26, s3, s9
	vldmia.f32 AO!, { s4 - s5 }
	vmul.f32	s19, s2, s9
	vmul.f32	s27, s3, s8

	/* (s0,s1) x (s10,s11) */
	vldmia.f32 BO!, { s12 - s13 }
	vmul.f32	s20, s0, s10
	vmul.f32	s28, s1, s11
	vldmia.f32 AO!, { s6 - s7 }
	vmul.f32	s21, s0, s11
	vmul.f32	s29, s1, s10

	/* (s2,s3) x (s10,s11) */
	vldmia.f32 BO!, { s14 - s15 }
	vmul.f32	s22, s2, s10
	vmul.f32	s30, s3, s11
	vmul.f32	s23, s2, s11
	vmul.f32	s31, s3, s10

.endm
203
204
205
.macro KERNEL2x2_M1

	/* Pipelined middle step, phase 1: accumulate with the operand set
	   s0-s3 / s8-s11 while loading the other set (s4-s7 from AO,
	   s12-s15 from BO) for the next KERNEL2x2_M2 or KERNEL2x2_E.
	   AO and BO each advance by 16 bytes. */

	/* (s0,s1) x (s8,s9) */
	vmla.f32	s16, s0, s8
	vldmia.f32 AO!, { s4 - s5 }
	vmla.f32	s24, s1, s9
	vmla.f32	s17, s0, s9
	vldmia.f32 BO!, { s12 - s13 }
	vmla.f32	s25, s1, s8

	/* (s2,s3) x (s8,s9) */
	vmla.f32	s18, s2, s8
	vldmia.f32 AO!, { s6 - s7 }
	vmla.f32	s26, s3, s9
	vmla.f32	s19, s2, s9
	vldmia.f32 BO!, { s14 - s15 }
	vmla.f32	s27, s3, s8

	/* (s0,s1) x (s10,s11) */
	vmla.f32	s20, s0, s10
	vmla.f32	s28, s1, s11
	vmla.f32	s21, s0, s11
	vmla.f32	s29, s1, s10

	/* (s2,s3) x (s10,s11) */
	vmla.f32	s22, s2, s10
	vmla.f32	s30, s3, s11
	vmla.f32	s23, s2, s11
	vmla.f32	s31, s3, s10

.endm
233
.macro KERNEL2x2_M2
	/* Pipelined middle step, phase 2: accumulate with the operand set
	   s4-s7 / s12-s15 while reloading s0-s3 from AO and s8-s11 from BO
	   for the next KERNEL2x2_M1. Issues the prefetches for both streams.
	   AO and BO each advance by 16 bytes. */
	pld	[ AO , #A_PRE ]

	/* (s4,s5) x (s12,s13) */
	vmla.f32	s16, s4, s12
	pld	[ BO , #B_PRE ]
	vmla.f32	s24, s5, s13
	vmla.f32	s17, s4, s13
	vldmia.f32 AO!, { s0 - s1 }
	vmla.f32	s25, s5, s12

	/* (s6,s7) x (s12,s13) */
	vmla.f32	s18, s6, s12
	vmla.f32	s26, s7, s13
	vldmia.f32 BO!, { s8 - s9 }
	vmla.f32	s19, s6, s13
	vmla.f32	s27, s7, s12

	/* (s4,s5) x (s14,s15) */
	vldmia.f32 AO!, { s2 - s3 }
	vmla.f32	s20, s4, s14
	vmla.f32	s28, s5, s15
	vldmia.f32 BO!, { s10 - s11 }
	vmla.f32	s21, s4, s15
	vmla.f32	s29, s5, s14

	/* (s6,s7) x (s14,s15) */
	vmla.f32	s22, s6, s14
	vmla.f32	s30, s7, s15
	vmla.f32	s23, s6, s15
	vmla.f32	s31, s7, s14

.endm
263
264
.macro KERNEL2x2_E

	/* Pipeline drain: same arithmetic as KERNEL2x2_M2 on the operand set
	   s4-s7 / s12-s15, but without any load, prefetch or pointer update. */

	/* (s4,s5) x (s12,s13) */
	vmla.f32	s16, s4, s12
	vmla.f32	s24, s5, s13
	vmla.f32	s17, s4, s13
	vmla.f32	s25, s5, s12

	/* (s6,s7) x (s12,s13) */
	vmla.f32	s18, s6, s12
	vmla.f32	s26, s7, s13
	vmla.f32	s19, s6, s13
	vmla.f32	s27, s7, s12

	/* (s4,s5) x (s14,s15) */
	vmla.f32	s20, s4, s14
	vmla.f32	s28, s5, s15
	vmla.f32	s21, s4, s15
	vmla.f32	s29, s5, s14

	/* (s6,s7) x (s14,s15) */
	vmla.f32	s22, s6, s14
	vmla.f32	s30, s7, s15
	vmla.f32	s23, s6, s15
	vmla.f32	s31, s7, s14

.endm
288
.macro KERNEL2x2_SUB

	/* Stand-alone (non-pipelined) step: load s0-s3 from AO and s8-s11
	   from BO, accumulate into s16-s31. Requires the accumulators to be
	   valid on entry. AO and BO each advance by 16 bytes. */
	vldmia.f32 AO!, { s0 - s1 }
	vldmia.f32 BO!, { s8 - s9 }

	/* (s0,s1) x (s8,s9) */
	vmla.f32	s16, s0, s8
	vmla.f32	s24, s1, s9
	vldmia.f32 AO!, { s2 - s3 }
	vmla.f32	s17, s0, s9
	vmla.f32	s25, s1, s8

	/* (s2,s3) x (s8,s9) */
	vldmia.f32 BO!, { s10 - s11 }
	vmla.f32	s18, s2, s8
	vmla.f32	s26, s3, s9
	vmla.f32	s19, s2, s9
	vmla.f32	s27, s3, s8

	/* (s0,s1) x (s10,s11) */
	vmla.f32	s20, s0, s10
	vmla.f32	s28, s1, s11
	vmla.f32	s21, s0, s11
	vmla.f32	s29, s1, s10

	/* (s2,s3) x (s10,s11) */
	vmla.f32	s22, s2, s10
	vmla.f32	s30, s3, s11
	vmla.f32	s23, s2, s11
	vmla.f32	s31, s3, s10

.endm
317
318
319
320
.macro SAVE2x2

	/* Finish a 2x2 tile: merge the partial accumulators, scale by alpha
	   and store. The results are written to C without reading its
	   previous contents. Clobbers r3, s0, s1, s4-s11 and the
	   accumulators; CO1 advances by 16 bytes, CO2 = CO1 + LDC. */

	/* alpha: s0 <- ALPHA_R, s1 <- ALPHA_I */
	vldr		s0, ALPHA_R
	vldr		s1, ALPHA_I

	/* second output row/column pointer */
	ldr	r3  , LDC
	add	CO2 , CO1, r3

	/* merge sN+8 into sN for every component */
	FADD_R	s16, s24 , s16
	FADD_R	s18, s26 , s18
	FADD_R	s20, s28 , s20
	FADD_R	s22, s30 , s22

	FADD_I  s17, s25 , s17
	FADD_I  s19, s27 , s19
	FADD_I  s21, s29 , s21
	FADD_I  s23, s31 , s23

	/* element 0 -> s4, s5 */
	FMAC_R1 s4 , s0 , s16
	FMAC_R2 s4 , s1 , s17
	FMAC_I1 s5 , s0 , s17
	FMAC_I2	s5 , s1 , s16

	/* element 1 -> s6, s7 */
	FMAC_R1 s6 , s0 , s18
	FMAC_R2 s6 , s1 , s19
	FMAC_I1 s7 , s0 , s19
	FMAC_I2	s7 , s1 , s18

	/* element 2 -> s8, s9 */
	FMAC_R1 s8 , s0 , s20
	FMAC_R2 s8 , s1 , s21
	FMAC_I1 s9 , s0 , s21
	FMAC_I2	s9 , s1 , s20

	/* element 3 -> s10, s11 */
	FMAC_R1 s10, s0 , s22
	FMAC_R2 s10, s1 , s23
	FMAC_I1 s11, s0 , s23
	FMAC_I2	s11, s1 , s22

	vstmia.f32 CO1, { s4 - s7 }
	vstmia.f32 CO2, { s8 - s11 }

	add	CO1, CO1, #16

.endm
363
364/******************************************************************************/
365
.macro INIT1x2

	/* Zero the eight accumulators used by the 1x2 tile
	   (s16, s17, s20, s21 and their +8 partners). */
	vldr			s16 , FP_ZERO
	vmov.f32		s24, s16

	vmov.f32		s17, s16
	vmov.f32		s25, s16

	vmov.f32		s20, s16
	vmov.f32		s28, s16

	vmov.f32		s21, s16
	vmov.f32		s29, s16

.endm
378
.macro KERNEL1x2_I
	/* First step of a pipelined 1x2 loop: initialise the accumulators
	   with plain products of (s0,s1) and (s8-s11), then preload
	   s4,s5 / s12-s15 for the next step.
	   AO advances by 16 bytes, BO by 32 bytes in total.
	   Not referenced by the driver code visible in this file. */
	pld	[ AO , #A_PRE ]
	pld	[ BO , #B_PRE ]
	vldr	s0 , [ AO ]
	vldr	s1 , [ AO, #4 ]
	vldr	s8 , [ BO ]
	vldr	s9 , [ BO, #4 ]
	vldr	s10, [ BO, #8 ]
	vldr	s11, [ BO, #12 ]

	/* (s0,s1) x (s8,s9) */
	vmul.f32	s16, s0, s8
	vmul.f32	s24, s1, s9
	vmul.f32	s17, s0, s9
	vmul.f32	s25, s1, s8

	/* (s0,s1) x (s10,s11) */
	vmul.f32	s20, s0, s10
	vmul.f32	s28, s1, s11
	vmul.f32	s21, s0, s11
	vmul.f32	s29, s1, s10

	add	AO , AO, #8
	add	BO , BO, #16

	pld	[ BO , #B_PRE ]

	/* preload the second operand set */
	vldr	s4 , [ AO, #0 ]
	vldr	s5 , [ AO, #4 ]

	vldr	s12, [ BO ]
	vldr	s13, [ BO, #4 ]
	vldr	s14, [ BO, #8 ]
	vldr	s15, [ BO, #12 ]

	add	AO , AO, #8
	add	BO , BO, #16
.endm
415
416
417
.macro KERNEL1x2_M1
	/* Pipelined 1x2 step, phase 1: accumulate with s0,s1 / s8-s11, then
	   load s4,s5 / s12-s15 for the next step.
	   AO advances by 8 bytes, BO by 16 bytes.
	   Not referenced by the driver code visible in this file. */
	pld	[ BO , #B_PRE ]

	/* (s0,s1) x (s8,s9) */
	vmla.f32	s16, s0, s8
	vmla.f32	s24, s1, s9
	vmla.f32	s17, s0, s9
	vmla.f32	s25, s1, s8

	/* (s0,s1) x (s10,s11) */
	vmla.f32	s20, s0, s10
	vmla.f32	s28, s1, s11
	vmla.f32	s21, s0, s11
	vmla.f32	s29, s1, s10

	vldr	s4 , [ AO, #0 ]
	vldr	s5 , [ AO, #4 ]

	vldr	s12, [ BO ]
	vldr	s13, [ BO, #4 ]
	vldr	s14, [ BO, #8 ]
	vldr	s15, [ BO, #12 ]

	add	AO , AO, #8
	add	BO , BO, #16
.endm
442
.macro KERNEL1x2_M2
	/* Pipelined 1x2 step, phase 2: accumulate with s4,s5 / s12-s15, then
	   reload s0,s1 / s8-s11 for the next step.
	   AO advances by 8 bytes, BO by 16 bytes.
	   Not referenced by the driver code visible in this file. */
	pld	[ AO , #A_PRE ]
	pld	[ BO , #B_PRE ]

	/* (s4,s5) x (s12,s13) */
	vmla.f32	s16, s4, s12
	vmla.f32	s24, s5, s13
	vmla.f32	s17, s4, s13
	vmla.f32	s25, s5, s12

	/* (s4,s5) x (s14,s15) */
	vmla.f32	s20, s4, s14
	vmla.f32	s28, s5, s15
	vmla.f32	s21, s4, s15
	vmla.f32	s29, s5, s14

	vldr	s0 , [ AO, #0 ]
	vldr	s1 , [ AO, #4 ]

	vldr	s8 , [ BO ]
	vldr	s9 , [ BO, #4 ]
	vldr	s10, [ BO, #8 ]
	vldr	s11, [ BO, #12 ]

	add	AO , AO, #8
	add	BO , BO, #16
.endm
468
469
.macro KERNEL1x2_E

	/* Pipeline drain for the 1x2 tile: accumulate with s4,s5 / s12-s15;
	   no loads and no pointer updates.
	   Not referenced by the driver code visible in this file. */

	/* (s4,s5) x (s12,s13) */
	vmla.f32	s16, s4, s12
	vmla.f32	s24, s5, s13
	vmla.f32	s17, s4, s13
	vmla.f32	s25, s5, s12

	/* (s4,s5) x (s14,s15) */
	vmla.f32	s20, s4, s14
	vmla.f32	s28, s5, s15
	vmla.f32	s21, s4, s15
	vmla.f32	s29, s5, s14

.endm
483
.macro KERNEL1x2_SUB

	/* Stand-alone 1x2 step: load two floats from AO and four from BO,
	   accumulate into s16/s17/s20/s21 and their +8 partners.
	   AO advances by 8 bytes, BO by 16 bytes. */
	pld	[ AO , #A_PRE ]
	pld	[ BO , #B_PRE ]
	vldr	s0 , [ AO ]
	vldr	s1 , [ AO, #4 ]
	vldr	s8 , [ BO ]
	vldr	s9 , [ BO, #4 ]
	vldr	s10, [ BO, #8 ]
	vldr	s11, [ BO, #12 ]

	/* (s0,s1) x (s8,s9) */
	vmla.f32	s16, s0, s8
	vmla.f32	s24, s1, s9
	vmla.f32	s17, s0, s9
	vmla.f32	s25, s1, s8

	/* (s0,s1) x (s10,s11) */
	vmla.f32	s20, s0, s10
	vmla.f32	s28, s1, s11
	vmla.f32	s21, s0, s11
	vmla.f32	s29, s1, s10

	add	AO , AO, #8
	add	BO , BO, #16

.endm
509
510
511
512
.macro SAVE1x2

	/* Finish a 1x2 tile: merge the partial accumulators, scale by alpha
	   and store one element at CO1 and one at CO2 = CO1 + LDC. C is
	   written without being read. Clobbers r3, s0, s1, s4, s5, s8, s9
	   and the accumulators; CO1 advances by 8 bytes. */

	vldr		s0, ALPHA_R
	vldr		s1, ALPHA_I

	ldr	r3  , LDC
	add	CO2 , CO1, r3

	/* merge sN+8 into sN */
	FADD_R	s16, s24 , s16
	FADD_R	s20, s28 , s20
	FADD_I  s17, s25 , s17
	FADD_I  s21, s29 , s21

	/* element for CO1 -> s4, s5 */
	FMAC_R1 s4 , s0 , s16
	FMAC_R2 s4 , s1 , s17
	FMAC_I1 s5 , s0 , s17
	FMAC_I2	s5 , s1 , s16

	/* element for CO2 -> s8, s9 */
	FMAC_R1 s8 , s0 , s20
	FMAC_R2 s8 , s1 , s21
	FMAC_I1 s9 , s0 , s21
	FMAC_I2	s9 , s1 , s20

	vstmia.f32 CO1, { s4 - s5 }
	vstmia.f32 CO2, { s8 - s9  }

	add	CO1, CO1, #8

.endm
541
542/******************************************************************************/
543
.macro INIT2x1

	/* Zero the eight accumulators used by the 2x1 tile
	   (s16-s19 and their +8 partners s24-s27). */
	vldr			s16 , FP_ZERO
	vmov.f32		s24, s16

	vmov.f32		s17, s16
	vmov.f32		s25, s16

	vmov.f32		s18, s16
	vmov.f32		s26, s16

	vmov.f32		s19, s16
	vmov.f32		s27, s16

.endm
556
.macro KERNEL2x1_I
	/* First step of the pipelined 2x1 loop: initialise the accumulators
	   with plain products of (s0-s3) and (s8,s9), then preload
	   s4-s7 / s12,s13 for the following KERNEL2x1_M2.
	   AO advances by 32 bytes, BO by 16 bytes in total. */
	pld	[ AO , #A_PRE ]
	pld	[ BO , #B_PRE ]
	vldr	s0 , [ AO ]
	vldr	s1 , [ AO, #4 ]
	vldr	s2 , [ AO, #8 ]
	vldr	s3 , [ AO, #12 ]
	vldr	s8 , [ BO ]
	vldr	s9 , [ BO, #4 ]

	/* (s0,s1) x (s8,s9) */
	vmul.f32	s16, s0, s8
	vmul.f32	s24, s1, s9
	vmul.f32	s17, s0, s9
	vmul.f32	s25, s1, s8

	/* (s2,s3) x (s8,s9) */
	vmul.f32	s18, s2, s8
	vmul.f32	s26, s3, s9
	vmul.f32	s19, s2, s9
	vmul.f32	s27, s3, s8

	add	AO , AO, #16
	add	BO , BO, #8

	pld	[ BO , #B_PRE ]
	pld	[ AO , #A_PRE ]

	/* preload the second operand set */
	vldr	s4 , [ AO, #0 ]
	vldr	s5 , [ AO, #4 ]
	vldr	s6 , [ AO, #8 ]
	vldr	s7 , [ AO, #12 ]

	vldr	s12, [ BO ]
	vldr	s13, [ BO, #4 ]

	add	AO , AO, #16
	add	BO , BO, #8
.endm
594
595
596
.macro KERNEL2x1_M1
	/* Pipelined 2x1 step, phase 1: accumulate with s0-s3 / s8,s9, then
	   load s4-s7 / s12,s13 for the next step.
	   AO advances by 16 bytes, BO by 8 bytes. */
	pld	[ AO , #A_PRE ]
	pld	[ BO , #B_PRE ]

	/* (s0,s1) x (s8,s9) */
	vmla.f32	s16, s0, s8
	vmla.f32	s24, s1, s9
	vmla.f32	s17, s0, s9
	vmla.f32	s25, s1, s8

	/* (s2,s3) x (s8,s9) */
	vmla.f32	s18, s2, s8
	vmla.f32	s26, s3, s9
	vmla.f32	s19, s2, s9
	vmla.f32	s27, s3, s8

	vldr	s4 , [ AO, #0 ]
	vldr	s5 , [ AO, #4 ]
	vldr	s6 , [ AO, #8 ]
	vldr	s7 , [ AO, #12 ]

	vldr	s12, [ BO ]
	vldr	s13, [ BO, #4 ]

	add	AO , AO, #16
	add	BO , BO, #8
.endm
622
.macro KERNEL2x1_M2
	/* Pipelined 2x1 step, phase 2: accumulate with s4-s7 / s12,s13, then
	   reload s0-s3 / s8,s9 for the next step.
	   AO advances by 16 bytes, BO by 8 bytes. */
	pld	[ AO , #A_PRE ]
	pld	[ BO , #B_PRE ]

	/* (s4,s5) x (s12,s13) */
	vmla.f32	s16, s4, s12
	vmla.f32	s24, s5, s13
	vmla.f32	s17, s4, s13
	vmla.f32	s25, s5, s12

	/* (s6,s7) x (s12,s13) */
	vmla.f32	s18, s6, s12
	vmla.f32	s26, s7, s13
	vmla.f32	s19, s6, s13
	vmla.f32	s27, s7, s12

	vldr	s0 , [ AO, #0 ]
	vldr	s1 , [ AO, #4 ]
	vldr	s2 , [ AO, #8 ]
	vldr	s3 , [ AO, #12 ]

	vldr	s8 , [ BO ]
	vldr	s9 , [ BO, #4 ]

	add	AO , AO, #16
	add	BO , BO, #8
.endm
648
649
.macro KERNEL2x1_E

	/* Pipeline drain for the 2x1 tile: accumulate with s4-s7 / s12,s13;
	   no loads and no pointer updates. */

	/* (s4,s5) x (s12,s13) */
	vmla.f32	s16, s4, s12
	vmla.f32	s24, s5, s13
	vmla.f32	s17, s4, s13
	vmla.f32	s25, s5, s12

	/* (s6,s7) x (s12,s13) */
	vmla.f32	s18, s6, s12
	vmla.f32	s26, s7, s13
	vmla.f32	s19, s6, s13
	vmla.f32	s27, s7, s12

.endm
663
.macro KERNEL2x1_SUB

	/* Stand-alone 2x1 step: load four floats from AO and two from BO,
	   accumulate into s16-s19 and their +8 partners.
	   AO advances by 16 bytes, BO by 8 bytes. */
	pld	[ AO , #A_PRE ]
	pld	[ BO , #B_PRE ]
	vldr	s0 , [ AO ]
	vldr	s1 , [ AO, #4 ]
	vldr	s2 , [ AO, #8 ]
	vldr	s3 , [ AO, #12 ]
	vldr	s8 , [ BO ]
	vldr	s9 , [ BO, #4 ]

	/* (s0,s1) x (s8,s9) */
	vmla.f32	s16, s0, s8
	vmla.f32	s24, s1, s9
	vmla.f32	s17, s0, s9
	vmla.f32	s25, s1, s8

	/* (s2,s3) x (s8,s9) */
	vmla.f32	s18, s2, s8
	vmla.f32	s26, s3, s9
	vmla.f32	s19, s2, s9
	vmla.f32	s27, s3, s8

	add	AO , AO, #16
	add	BO , BO, #8

.endm
689
690
691
692
.macro SAVE2x1

	/* Finish a 2x1 tile: merge the partial accumulators, scale by alpha
	   and store two elements (four floats) at CO1. C is written without
	   being read. Clobbers s0, s1, s4-s7 and the accumulators; CO1
	   advances by 16 bytes. */

	vldr		s0, ALPHA_R
	vldr		s1, ALPHA_I

	/* merge sN+8 into sN */
	FADD_R	s16, s24 , s16
	FADD_R	s18, s26 , s18
	FADD_I  s17, s25 , s17
	FADD_I  s19, s27 , s19

	/* element 0 -> s4, s5 */
	FMAC_R1 s4 , s0 , s16
	FMAC_R2 s4 , s1 , s17
	FMAC_I1 s5 , s0 , s17
	FMAC_I2	s5 , s1 , s16

	/* element 1 -> s6, s7 */
	FMAC_R1 s6 , s0 , s18
	FMAC_R2 s6 , s1 , s19
	FMAC_I1 s7 , s0 , s19
	FMAC_I2	s7 , s1 , s18

	vstmia.f32 CO1, { s4 - s7 }

	add	CO1, CO1, #16

.endm
718
719/******************************************************************************/
720
.macro INIT1x1

	/* Zero the four accumulators used by the 1x1 tile
	   (s16, s17 and their +8 partners s24, s25). */
	vldr			s16 , FP_ZERO
	vmov.f32		s24, s16

	vmov.f32		s17, s16
	vmov.f32		s25, s16

.endm
729
.macro KERNEL1x1_I
	/* First step of a pipelined 1x1 loop: initialise the accumulators
	   with plain products of (s0,s1) and (s8,s9), then preload
	   s4,s5 / s12,s13 for the next step.
	   AO and BO each advance by 16 bytes in total.
	   Not referenced by the driver code visible in this file. */
	pld	[ AO , #A_PRE ]
	pld	[ BO , #B_PRE ]
	vldr	s0 , [ AO ]
	vldr	s1 , [ AO, #4 ]
	vldr	s8 , [ BO ]
	vldr	s9 , [ BO, #4 ]

	vmul.f32	s16, s0, s8
	vmul.f32	s24, s1, s9
	vmul.f32	s17, s0, s9
	vmul.f32	s25, s1, s8

	add	AO , AO, #8
	add	BO , BO, #8

	pld	[ BO , #B_PRE ]
	pld	[ AO , #A_PRE ]

	/* preload the second operand set */
	vldr	s4 , [ AO, #0 ]
	vldr	s5 , [ AO, #4 ]

	vldr	s12, [ BO ]
	vldr	s13, [ BO, #4 ]

	add	AO , AO, #8
	add	BO , BO, #8
.endm
758
759
760
.macro KERNEL1x1_M1

	/* Pipelined 1x1 step, phase 1: accumulate with s0,s1 / s8,s9, then
	   load s4,s5 / s12,s13. AO and BO each advance by 8 bytes.
	   Not referenced by the driver code visible in this file. */
	vmla.f32	s16, s0, s8
	vmla.f32	s24, s1, s9
	vmla.f32	s17, s0, s9
	vmla.f32	s25, s1, s8

	vldr	s4 , [ AO, #0 ]
	vldr	s5 , [ AO, #4 ]

	vldr	s12, [ BO ]
	vldr	s13, [ BO, #4 ]

	add	AO , AO, #8
	add	BO , BO, #8
.endm
777
.macro KERNEL1x1_M2

	/* Pipelined 1x1 step, phase 2: accumulate with s4,s5 / s12,s13, then
	   reload s0,s1 / s8,s9. AO and BO each advance by 8 bytes.
	   Not referenced by the driver code visible in this file. */
	vmla.f32	s16, s4, s12
	vmla.f32	s24, s5, s13
	vmla.f32	s17, s4, s13
	vmla.f32	s25, s5, s12

	vldr	s0 , [ AO, #0 ]
	vldr	s1 , [ AO, #4 ]

	vldr	s8 , [ BO ]
	vldr	s9 , [ BO, #4 ]

	add	AO , AO, #8
	add	BO , BO, #8
.endm
794
795
.macro KERNEL1x1_E

	/* Pipeline drain for the 1x1 tile: accumulate with s4,s5 / s12,s13;
	   no loads and no pointer updates.
	   Not referenced by the driver code visible in this file. */
	vmla.f32	s16, s4, s12
	vmla.f32	s24, s5, s13
	vmla.f32	s17, s4, s13
	vmla.f32	s25, s5, s12

.endm
804
.macro KERNEL1x1_SUB

	/* Stand-alone 1x1 step: load two floats from AO and two from BO and
	   accumulate the four cross products into s16, s24, s17, s25.
	   AO and BO each advance by 8 bytes. */
	vldr	s0 , [ AO ]
	vldr	s1 , [ AO, #4 ]
	vldr	s8 , [ BO ]
	vldr	s9 , [ BO, #4 ]

	vmla.f32	s16, s0, s8
	vmla.f32	s24, s1, s9
	vmla.f32	s17, s0, s9
	vmla.f32	s25, s1, s8

	add	AO , AO, #8
	add	BO , BO, #8

.endm
821
822
823
824
.macro SAVE1x1

	/* Finish a 1x1 tile: merge the partial accumulators, scale by alpha
	   and store one element (two floats) at CO1. C is written without
	   being read. Clobbers s0, s1, s4, s5, s16, s17; CO1 advances by
	   8 bytes. */

	vldr		s0, ALPHA_R
	vldr		s1, ALPHA_I

	/* merge sN+8 into sN */
	FADD_R	s16, s24 , s16
	FADD_I  s17, s25 , s17

	/* result -> s4, s5 */
	FMAC_R1 s4 , s0 , s16
	FMAC_R2 s4 , s1 , s17
	FMAC_I1 s5 , s0 , s17
	FMAC_I2	s5 , s1 , s16

	vstmia.f32 CO1, { s4 - s5 }

	add	CO1, CO1, #8

.endm
843
844/******************************************************************************/
845
846
847/**************************************************************************************
848* End of macro definitions
849**************************************************************************************/
850
851	PROLOGUE
852
853	.align 5
854
855	push	{r4 - r9, fp}
856	add	fp, sp, #24
857	sub	sp, sp, #STACKSIZE				// reserve stack
858
859#if !defined(__ARM_PCS_VFP)
860	vmov	OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
861	vldr	OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
862	ldr	OLD_A, OLD_A_SOFTFP
863#endif
864	str	OLD_M, M
865	str	OLD_N, N
866	str	OLD_K, K
867	str	OLD_A, A
868	vstr	OLD_ALPHA_R, ALPHA_R
869	vstr	OLD_ALPHA_I, ALPHA_I
870
871	sub	r3, fp, #128
872	vstm	r3, { s8 - s31} 				// store floating point registers
873
874        movs    r4, #0
875        str     r4, FP_ZERO
876        str     r4, FP_ZERO_1
877
878	ldr	r3, OLD_LDC
879	lsl	r3, r3, #3					// ldc = ldc * 4 * 2
880	str	r3, LDC
881
882        ldr     r3, OFFSET
883#ifndef LEFT
884        neg     r3 , r3
885#endif
886        str     r3 , KK
887
888	ldr	BC, B
889
890	ldr	J, N
891	asrs	J, J, #1					// J = J / 2
892	ble	_L1_BEGIN
893
// ---------------------------------------------------------------------------
// Column-pair loop: one pass per two columns of C, J passes in total.
//   CO1      output pointer for the current tile (CO2 is derived in SAVE*)
//   AO / BO  read pointers into A and into the B data starting at BC
//   I        number of 2-row tiles still to do in this pass
//   K1       inner-loop length of the current tile, L its unrolled counter
//
// Every conditional branch below follows an instruction that defines all the
// flags it tests: counters produced by a shift are compared against zero
// explicitly, and bit tests (tst / ands) branch with beq. The earlier
// "asrs/tst/ands + ble" form also depended on the V flag, which those
// instructions do not write.
// ---------------------------------------------------------------------------
_L2_BEGIN:

	ldr	CO1, C						// CO1 = C
	ldr	r4 , LDC
	lsl	r4 , r4 , #1					// LDC * 2
	add	r3 , r4, CO1
	str	r3 , C						// C for the next pass

#if defined(LEFT)
	ldr	r3 , OFFSET
	str	r3 , KK
#endif

	ldr	AO, A						// AO = A
	pld	[AO , #A_PRE-64]
	pld	[AO , #A_PRE-32]



_L2_M2_BEGIN:

	ldr	I, M
	asr	I, I, #1					// I = M / 2
	cmp	I, #0
	ble	_L2_M1_BEGIN

_L2_M2_20:

#if  (defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))

	mov	BO, BC
#else
	mov	BO, BC
	ldr	r3 , KK
	lsls	r4 , r3 , #4					// 2 * 4 * 2 float values
	add	BO , BO , r4
	lsls	r4 , r3 , #4					// 2 * 4 * 2 float values
	add	AO , AO , r4

#endif

#ifndef TRMMKERNEL
	ldr	K1, K
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	ldr	K1, K
	ldr	r3, KK
	sub	K1, K1, r3
	str	K1, KKK
#else
	ldr	K1, KK
#ifdef LEFT
	add	K1, K1, #2					// number of values in AO
#else
	add	K1, K1, #2					// number of values in BO
#endif
	str	K1, KKK
#endif

	asrs	L , K1, #3					// L = K1 / 8
	cmp	L , #3
	blt	_L2_M2_30					// fewer than 3 blocks of 8: special cases
	.align 5

	// L >= 3: first block of 8 (starts the pipeline) ...

	KERNEL2x2_I
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	sub	L, L, #2					// first and last block are outside the loop

_L2_M2_22:

	// ... L - 2 middle blocks of 8 ...

	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	subs	L, L, #1
	bgt	_L2_M2_22

	// ... and the last block of 8, which drains the pipeline.

	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_E

	b	 _L2_M2_44


_L2_M2_30:
	tst	L, #3						// L is 0, 1 or 2 here
	beq	_L2_M2_40					// L == 0: only the remainder loop runs

	tst	L, #2
	beq	_L2_M2_32					// L == 1

	// L == 2: sixteen steps

	KERNEL2x2_I
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2


	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_E

	b	 _L2_M2_44

_L2_M2_32:

	tst	L, #1
	beq	_L2_M2_40

	// L == 1: eight steps

	KERNEL2x2_I
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_E

	b	 _L2_M2_44


_L2_M2_40:

	// no unrolled block ran, so the accumulators must be zeroed explicitly
	INIT2x2


_L2_M2_44:

	ands	L , K1, #7					// L = K1 % 8
	beq	_L2_M2_100

_L2_M2_46:

	KERNEL2x2_SUB

	subs	L, L, #1
	bne	_L2_M2_46

_L2_M2_100:

	SAVE2x2

#if  (defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	ldr	r3 , K
	ldr	r4 , KKK
	sub	r3 , r3 , r4
	lsls	r4 , r3 , #4					// 2 * 4 * 2 float values
	add	BO , BO , r4
	lsls	r4 , r3 , #4					// 2 * 4 * 2 float values
	add	AO , AO , r4
#endif

#if defined(LEFT)
	ldr	r3 , KK
	add	r3 , r3 , #2					// number of values in AO
	str	r3 , KK
#endif


_L2_M2_END:

	subs	I, I, #1
	bne	_L2_M2_20


_L2_M1_BEGIN:

	ldr	I, M
	tst	I, #1						// M odd ?
	beq	_L2_END

_L2_M1_20:

	INIT1x2

#if  (defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))

	mov	BO, BC
#else
	mov	BO, BC
	ldr	r3 , KK
	lsls	r4 , r3 , #4					// 2 * 4 * 2 float values
	add	BO , BO , r4
	lsls	r4 , r3 , #3					// 1 * 4 * 2 float values
	add	AO , AO , r4

#endif

#ifndef TRMMKERNEL
	ldr	K1, K
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	ldr	K1, K
	ldr	r3, KK
	sub	K1, K1, r3
	str	K1, KKK
#else
	ldr	K1, KK
#ifdef LEFT
	add	K1, K1, #1					// number of values in AO
#else
	add	K1, K1, #2					// number of values in BO
#endif
	str	K1, KKK
#endif

	asr	L , K1, #3					// L = K1 / 8
	cmp	L , #0
	ble	_L2_M1_40

_L2_M1_22:

	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB

	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB

	subs	L, L, #1
	bgt	_L2_M1_22


_L2_M1_40:

	ands	L , K1, #7					// L = K1 % 8
	beq	_L2_M1_100

_L2_M1_42:

	KERNEL1x2_SUB

	subs	L, L, #1
	bgt	_L2_M1_42

_L2_M1_100:

	SAVE1x2

#if  (defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	ldr	r3 , K
	ldr	r4 , KKK
	sub	r3 , r3 , r4
	lsls	r4 , r3 , #4					// 2 * 4 * 2 float values
	add	BO , BO , r4
	lsls	r4 , r3 , #3					// 1 * 4 * 2 float values
	add	AO , AO , r4
#endif

#if defined(LEFT)
	ldr	r3 , KK
	add	r3 , r3 , #1					// number of values in AO
	str	r3 , KK
#endif



_L2_END:

	mov	r3, BC
	ldr	r4, K
	lsl	r4, r4, #4					// k * 2 * 4 * 2
	add	r3, r3, r4					// B = B + K * 2 * 8
	mov	BC, r3

#if !defined(LEFT)
	ldr	r3 , KK
	add	r3 , r3 , #2					// number of values in BO
	str	r3 , KK
#endif

	subs	J , #1						// j--
	bgt	_L2_BEGIN
1203
1204
1205
1206/*********************************************************************************************/
1207
// ---------------------------------------------------------------------------
// Tail for an odd N: one remaining column of C, processed in 2x1 tiles plus
// an optional final 1x1 tile. Register roles are the same as in the
// column-pair loop.
//
// Every conditional branch below follows an instruction that defines all the
// flags it tests: counters produced by a shift are compared against zero
// explicitly, and bit tests (tst / ands) branch with beq. The earlier
// "asrs/tst/ands + ble" form also depended on the V flag, which those
// instructions do not write.
// ---------------------------------------------------------------------------
_L1_BEGIN:

	ldr	J , N
	tst	J , #1						// N odd ?
	beq	_L999


	ldr	CO1, C						// CO1 = C
	ldr	r4 , LDC
	add	r3 , r4, CO1
	str	r3 , C						// store C

#if defined(LEFT)
	ldr	r3 , OFFSET
	str	r3 , KK
#endif

	ldr	AO, A						// AO = A

_L1_M2_BEGIN:

	ldr	I, M
	asr	I, I, #1					// I = M / 2
	cmp	I, #0
	ble	_L1_M1_BEGIN

_L1_M2_20:

#if  (defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))

	mov	BO, BC
#else
	mov	BO, BC
	ldr	r3 , KK
	lsls	r4 , r3 , #3					// 1 * 4 * 2 float values
	add	BO , BO , r4
	lsls	r4 , r3 , #4					// 2 * 4 * 2 float values
	add	AO , AO , r4

#endif

#ifndef TRMMKERNEL
	ldr	K1, K
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	ldr	K1, K
	ldr	r3, KK
	sub	K1, K1, r3
	str	K1, KKK
#else
	ldr	K1, KK
#ifdef LEFT
	add	K1, K1, #2					// number of values in AO
#else
	add	K1, K1, #1					// number of values in BO
#endif
	str	K1, KKK
#endif

	asrs	L , K1, #3					// L = K1 / 8
	cmp	L , #3
	blt	_L1_M2_30					// fewer than 3 blocks of 8: special cases
	.align 5

	// L >= 3: first block of 8 (starts the pipeline) ...

	KERNEL2x1_I
	KERNEL2x1_M2
	KERNEL2x1_M1
	KERNEL2x1_M2

	KERNEL2x1_M1
	KERNEL2x1_M2
	KERNEL2x1_M1
	KERNEL2x1_M2

	sub	L, L, #2					// first and last block are outside the loop

_L1_M2_22:

	// ... L - 2 middle blocks of 8 ...

	KERNEL2x1_M1
	KERNEL2x1_M2
	KERNEL2x1_M1
	KERNEL2x1_M2

	KERNEL2x1_M1
	KERNEL2x1_M2
	KERNEL2x1_M1
	KERNEL2x1_M2

	subs	L, L, #1
	bgt	_L1_M2_22

	// ... and the last block of 8, which drains the pipeline.

	KERNEL2x1_M1
	KERNEL2x1_M2
	KERNEL2x1_M1
	KERNEL2x1_M2

	KERNEL2x1_M1
	KERNEL2x1_M2
	KERNEL2x1_M1
	KERNEL2x1_E

	b	 _L1_M2_44


_L1_M2_30:
	tst	L, #3						// L is 0, 1 or 2 here
	beq	_L1_M2_40					// L == 0: only the remainder loop runs

	tst	L, #2
	beq	_L1_M2_32					// L == 1

	// L == 2: sixteen steps

	KERNEL2x1_I
	KERNEL2x1_M2
	KERNEL2x1_M1
	KERNEL2x1_M2

	KERNEL2x1_M1
	KERNEL2x1_M2
	KERNEL2x1_M1
	KERNEL2x1_M2

	KERNEL2x1_M1
	KERNEL2x1_M2
	KERNEL2x1_M1
	KERNEL2x1_M2


	KERNEL2x1_M1
	KERNEL2x1_M2
	KERNEL2x1_M1
	KERNEL2x1_E

	b	 _L1_M2_44

_L1_M2_32:

	tst	L, #1
	beq	_L1_M2_40

	// L == 1: eight steps

	KERNEL2x1_I
	KERNEL2x1_M2
	KERNEL2x1_M1
	KERNEL2x1_M2

	KERNEL2x1_M1
	KERNEL2x1_M2
	KERNEL2x1_M1
	KERNEL2x1_E

	b	 _L1_M2_44


_L1_M2_40:

	// no unrolled block ran, so the accumulators must be zeroed explicitly
	INIT2x1


_L1_M2_44:

	ands	L , K1, #7					// L = K1 % 8
	beq	_L1_M2_100

_L1_M2_46:

	KERNEL2x1_SUB

	subs	L, L, #1
	bne	_L1_M2_46

_L1_M2_100:

	SAVE2x1

#if  (defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	ldr	r3 , K
	ldr	r4 , KKK
	sub	r3 , r3 , r4
	lsls	r4 , r3 , #3					// 1 * 4 * 2 float values
	add	BO , BO , r4
	lsls	r4 , r3 , #4					// 2 * 4 * 2 float values
	add	AO , AO , r4
#endif

#if defined(LEFT)
	ldr	r3 , KK
	add	r3 , r3 , #2					// number of values in AO
	str	r3 , KK
#endif



_L1_M2_END:

	subs	I, I, #1
	bne	_L1_M2_20


_L1_M1_BEGIN:

	ldr	I, M
	tst	I, #1						// M odd ?
	beq	_L1_END

_L1_M1_20:

	INIT1x1

#if  (defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))

	mov	BO, BC
#else
	mov	BO, BC
	ldr	r3 , KK
	lsls	r4 , r3 , #3					// 1 * 4 * 2 float values
	add	BO , BO , r4
	lsls	r4 , r3 , #3					// 1 * 4 * 2 float values
	add	AO , AO , r4

#endif

#ifndef TRMMKERNEL
	ldr	K1, K
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	ldr	K1, K
	ldr	r3, KK
	sub	K1, K1, r3
	str	K1, KKK
#else
	ldr	K1, KK
#ifdef LEFT
	add	K1, K1, #1					// number of values in AO
#else
	add	K1, K1, #1					// number of values in BO
#endif
	str	K1, KKK
#endif

	asr	L , K1, #3					// L = K1 / 8
	cmp	L , #0
	ble	_L1_M1_40

_L1_M1_22:

	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB

	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB

	subs	L, L, #1
	bgt	_L1_M1_22


_L1_M1_40:

	ands	L , K1, #7					// L = K1 % 8
	beq	_L1_M1_100

_L1_M1_42:

	KERNEL1x1_SUB

	subs	L, L, #1
	bgt	_L1_M1_42

_L1_M1_100:

	// last tile of the last column: no pointer or KK bookkeeping follows
	SAVE1x1


_L1_END:
1485
1486
1487
_L999:

	// Common exit: return 0, restore the VFP registers saved by the
	// prologue, drop the local frame and restore r4-r9 / fp.

	movs	r0, #0						// return value = 0

	sub	r3, fp, #128					// base of the VFP save area
	vldm	r3, { s8 - s31}					// reload s8-s31

	sub	sp, fp, #24					// sp -> saved r4
	pop	{r4 - r9, fp}
	bx	lr

	EPILOGUE
1499
1500