1/***************************************************************************
2Copyright (c) 2013, The OpenBLAS Project
3All rights reserved.
4Redistribution and use in source and binary forms, with or without
5modification, are permitted provided that the following conditions are
6met:
71. Redistributions of source code must retain the above copyright
8notice, this list of conditions and the following disclaimer.
92. Redistributions in binary form must reproduce the above copyright
10notice, this list of conditions and the following disclaimer in
11the documentation and/or other materials provided with the
12distribution.
133. Neither the name of the OpenBLAS project nor the names of
14its contributors may be used to endorse or promote products
15derived from this software without specific prior written permission.
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*****************************************************************************/
27
28/**************************************************************************************
29* 2013/11/28 Saar
30* 	 BLASTEST 		: OK
31* 	 CTEST			: OK
32* 	 TEST			: OK
33*
34**************************************************************************************/
35
#define ASSEMBLER
#include "common.h"

/* Bytes of scratch space the prologue reserves below the saved registers. */
#define STACKSIZE 256

/******************************************************
* Arguments as they arrive in registers
*******************************************************/

#define OLD_M		r0
#define OLD_N		r1
#define OLD_K		r2
#define OLD_A		r3
#define OLD_ALPHA	s0

/******************************************************
* Local variables, addressed relative to fp.
*
* The prologue pushes seven registers, sets fp = sp + 24
* and then lowers sp by STACKSIZE, so sp ends up at
* fp - 280 and ALPHA below is the lowest slot in use.
*
* [fp, #-128] - [fp, #-64] is reserved for store and
* restore of floating point registers (this kernel
* stores s8 - s15 starting at fp - 128).
*******************************************************/

#define LDC		[fp, #-252]
#define M		[fp, #-256]
#define N		[fp, #-260]
#define K		[fp, #-264]
#define A		[fp, #-268]

/* Zero constant kept in memory; the INIT macros load accumulators from it. */
#define FP_ZERO		[fp, #-240]
#define FP_ZERO_0	[fp, #-240]
#define FP_ZERO_1	[fp, #-236]

#define ALPHA		[fp, #-280]

/******************************************************
* Arguments passed on the caller stack (positive fp
* offsets). Without the VFP calling convention alpha
* comes in r3, which pushes A onto the stack and moves
* the remaining arguments up by one word.
*******************************************************/

#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHA_SOFTFP	r3
#define OLD_A_SOFTFP		[fp, #4]
#define B			[fp, #8]
#define C			[fp, #12]
#define OLD_LDC			[fp, #16]
#else
#define B			[fp, #4]
#define C			[fp, #8]
#define OLD_LDC			[fp, #12]
#endif

/******************************************************
* Working registers
*******************************************************/

/* Loop counters; they reuse r0 - r2 once M, N and K are saved on the stack. */
#define I	r0
#define J	r1
#define L	r2

/* Running pointers into the A and B data. */
#define AO	r5
#define BO	r6

/* Pointers to the first and second column of C being updated. */
#define CO1	r8
#define CO2	r9

/* K held in a register, and the start of the current B panel. */
#define K1	r7
#define BC	r12

/* Prefetch distances in bytes. */
#define A_PRE	96
#define B_PRE	96
#define C_PRE	64
93
94/**************************************************************************************
95* Macro definitions
96**************************************************************************************/
97
.macro INIT4x2

	// Clear the eight accumulators of the 4x2 tile:
	// s8 - s11 hold the first column, s12 - s15 the second.
	vldr			s8 , FP_ZERO
	vmov.f32		s12, s8

	vmov.f32		s9 , s8
	vmov.f32		s13, s8

	vmov.f32		s10, s8
	vmov.f32		s14, s8

	vmov.f32		s11, s8
	vmov.f32		s15, s8

.endm
110
111
112
.macro KERNEL4x2_SUB

	// One k step of the 4x2 tile. The loads post-increment AO by 16 bytes
	// and BO by 8 bytes.
	vldmia.f32	BO! , { s4 - s5 }
	vldmia.f32	AO! , { s0 - s3 }

	// Each accumulator is independent: acc(row, col) += a(row) * b(col)
	vmla.f32	s8  , s0, s4
	vmla.f32	s12 , s0, s5

	vmla.f32	s9  , s1, s4
	vmla.f32	s13 , s1, s5

	vmla.f32	s10 , s2, s4
	vmla.f32	s14 , s2, s5

	vmla.f32	s11 , s3, s4
	vmla.f32	s15 , s3, s5

.endm
129
.macro SAVE4x2

	// Write back the 4x2 tile: C += alpha * accumulator, column by column.
	// r3 is used as scratch for the column stride in bytes.
	ldr		r3 , LDC
	vldr		s0 , ALPHA
	add		CO2, CO1, r3				// second column = first + LDC

	// first column
	vldmia.f32	CO1, { s4 - s7 }

	vmla.f32	s4 , s0, s8
	vmla.f32	s5 , s0, s9
	vmla.f32	s6 , s0, s10
	vmla.f32	s7 , s0, s11

	vstmia.f32	CO1!, { s4 - s7 }			// writeback moves CO1 on by 16 bytes

	// second column
	vldmia.f32	CO2, { s4 - s7 }

	vmla.f32	s4 , s0, s12
	vmla.f32	s5 , s0, s13
	vmla.f32	s6 , s0, s14
	vmla.f32	s7 , s0, s15

	vstmia.f32	CO2, { s4 - s7 }

.endm
170
171
172/******************************************************************************/
173
.macro INIT2x2

	// Clear the four accumulators of the 2x2 tile
	// (s8, s9: first column; s12, s13: second column).
	vldr			s8 , FP_ZERO
	vmov.f32		s12, s8
	vmov.f32		s9 , s8
	vmov.f32		s13, s8

.endm
182
.macro KERNEL2x2_SUB

	// One k step of the 2x2 tile. Both pointers advance by 8 bytes through
	// the load writeback.
	vldmia.f32	AO! , { s0 - s1 }
	vldmia.f32	BO! , { s4 - s5 }

	vmla.f32	s8  , s0, s4
	vmla.f32	s12 , s0, s5

	vmla.f32	s9  , s1, s4
	vmla.f32	s13 , s1, s5

.endm
201
.macro SAVE2x2

	// Write back the 2x2 tile: C += alpha * accumulator, column by column.
	// r3 is used as scratch for the column stride in bytes.
	ldr		r3 , LDC
	vldr		s0 , ALPHA
	add		CO2, CO1, r3				// second column = first + LDC

	// first column
	vldmia.f32	CO1, { s4 - s5 }

	vmla.f32	s4 , s0, s8
	vmla.f32	s5 , s0, s9

	vstmia.f32	CO1!, { s4 - s5 }			// writeback moves CO1 on by 8 bytes

	// second column
	vldmia.f32	CO2, { s4 - s5 }

	vmla.f32	s4 , s0, s12
	vmla.f32	s5 , s0, s13

	vstmia.f32	CO2, { s4 - s5 }

.endm
230
231
232/******************************************************************************/
233
.macro INIT1x2

	// Clear the two accumulators of the 1x2 tile (one per column).
	vldr			s8 , FP_ZERO
	vmov.f32		s12, s8

.endm
240
.macro KERNEL1x2_SUB

	// One k step of the 1x2 tile: one value from AO, two values from BO.
	vldr		s0 , [ AO ]
	vldmia.f32	BO! , { s4 - s5 }			// BO += 8
	add		AO , AO, #4

	vmla.f32	s8  , s0, s4
	vmla.f32	s12 , s0, s5

.endm
256
.macro SAVE1x2

	// Write back the 1x2 tile: C += alpha * accumulator for one element in
	// each of the two columns. r3 is scratch for the column stride in bytes.
	ldr	r3  , LDC
	vldr	s0  , ALPHA
	add	CO2 , CO1, r3					// second column = first + LDC

	// first column
	vldr		s4 , [CO1]
	vmla.f32	s4 , s0, s8
	vstr		s4 , [CO1]

	// second column
	vldr		s4 , [CO2]
	vmla.f32	s4 , s0, s12
	vstr		s4 , [CO2]

	add	CO1 , CO1, #4

.endm
279
280
281
282/******************************************************************************/
283
.macro INIT4x1

	// Clear the four accumulators of the 4x1 tile.
	vldr			s8 , FP_ZERO
	vmov.f32		s11, s8
	vmov.f32		s10, s8
	vmov.f32		s9 , s8

.endm
292
293
294
.macro KERNEL4x1_SUB

	// One k step of the 4x1 tile: four values from AO, one value from BO.
	vldmia.f32	AO! , { s0 - s3 }			// AO += 16
	vldr		s4 , [ BO ]
	add		BO , BO, #4

	vmla.f32	s8  , s0, s4
	vmla.f32	s9  , s1, s4
	vmla.f32	s10 , s2, s4
	vmla.f32	s11 , s3, s4

.endm
313
.macro SAVE4x1

	// Write back the 4x1 tile: C += alpha * accumulator for four elements
	// of a single column.
	vldr		s0 , ALPHA

	vldmia.f32	CO1, { s4 - s7 }

	vmla.f32	s4 , s0, s8
	vmla.f32	s5 , s0, s9
	vmla.f32	s6 , s0, s10
	vmla.f32	s7 , s0, s11

	vstmia.f32	CO1!, { s4 - s7 }			// writeback moves CO1 on by 16 bytes

.endm
336
337/******************************************************************************/
338
.macro INIT2x1

	// Clear the two accumulators of the 2x1 tile.
	vldr			s9 , FP_ZERO
	vmov.f32		s8 , s9

.endm
345
.macro KERNEL2x1_SUB

	// One k step of the 2x1 tile: two values from AO, one value from BO.
	vldmia.f32	AO! , { s0 - s1 }			// AO += 8
	vldr		s4 , [ BO ]
	add		BO , BO, #4

	vmla.f32	s8  , s0, s4
	vmla.f32	s9  , s1, s4

.endm
360
.macro SAVE2x1

	// Write back the 2x1 tile: C += alpha * accumulator for two elements
	// of a single column.
	vldr		s0 , ALPHA

	vldmia.f32	CO1, { s4 - s5 }

	vmla.f32	s4 , s0, s8
	vmla.f32	s5 , s0, s9

	vstmia.f32	CO1!, { s4 - s5 }			// writeback moves CO1 on by 8 bytes

.endm
377
378
379/******************************************************************************/
380
.macro INIT1x1

	// Clear the single accumulator of the 1x1 tile.
	vldr			s8 , FP_ZERO

.endm
386
.macro KERNEL1x1_SUB

	// One k step of the 1x1 tile: acc += a * b, then step both pointers
	// to the next element.
	vldr		s0 , [ AO ]
	vldr		s4 , [ BO ]

	add		AO , AO, #4
	add		BO , BO, #4

	vmla.f32	s8 , s0, s4

.endm
399
.macro SAVE1x1

	// Write back the 1x1 tile: C += alpha * accumulator for one element.
	vldr		s4 , [CO1]
	vldr		s0 , ALPHA

	vmla.f32	s4 , s0, s8

	vstr		s4 , [CO1]
	add		CO1, CO1, #4

.endm
413
414
415/**************************************************************************************
416* End of macro definitions
417**************************************************************************************/
418
419	PROLOGUE
420
421	.align 5
422
423	push	{r4 - r9, fp}
424	add	fp, sp, #24
425	sub	sp, sp, #STACKSIZE				// reserve stack
426
427#if !defined(__ARM_PCS_VFP)
428	vmov	OLD_ALPHA, OLD_ALPHA_SOFTFP
429	ldr	OLD_A, OLD_A_SOFTFP
430#endif
431	str	OLD_M, M
432	str	OLD_N, N
433	str	OLD_K, K
434	str	OLD_A, A
435	vstr	OLD_ALPHA, ALPHA
436
437	sub	r3, fp, #128
438	vstm	r3, { s8 - s15} 				// store floating point registers
439
440        movs    r4, #0
441        str     r4, FP_ZERO
442        str     r4, FP_ZERO_1
443
444	ldr	r3, OLD_LDC
445	lsl	r3, r3, #2					// ldc = ldc * 4
446	str	r3, LDC
447
448	ldr	K1, K
449	ldr	BC, B
450
451	ldr	J, N
452	asrs	J, J, #1					// J = J / 2
453	ble	sgemm_kernel_L1_BEGIN
454
455
456/*********************************************************************************************/
457
458sgemm_kernel_L2_BEGIN:
459
460	ldr	CO1, C						// CO1 = C
461	ldr	r4 , LDC
462	lsl	r4 , r4 , #1					// LDC * 2
463	add	r3 , r4, CO1
464	str	r3 , C						// store C
465
466	ldr	AO, A						// AO = A
467
468sgemm_kernel_L2_M4_BEGIN:
469
470	ldr	I, M
471	asrs	I, I, #2					// I = I / 4
472	ble	sgemm_kernel_L2_M2_BEGIN
473
474sgemm_kernel_L2_M4_20:
475
476	INIT4x2
477
478	mov	BO, BC
479	asrs	L , K1, #3					// L = L / 8
480	ble	sgemm_kernel_L2_M4_40
481	.align 5
482
483sgemm_kernel_L2_M4_22:
484
485	pld [ AO, #A_PRE ]
486	pld [ BO, #B_PRE ]
487	KERNEL4x2_SUB
488	KERNEL4x2_SUB
489	pld [ AO, #A_PRE ]
490	KERNEL4x2_SUB
491	KERNEL4x2_SUB
492
493	pld [ AO, #A_PRE ]
494	pld [ BO, #B_PRE ]
495	KERNEL4x2_SUB
496	KERNEL4x2_SUB
497	pld [ AO, #A_PRE ]
498	KERNEL4x2_SUB
499	KERNEL4x2_SUB
500
501	subs	L, L, #1
502	bgt	sgemm_kernel_L2_M4_22
503
504
505sgemm_kernel_L2_M4_40:
506
507	ands	L , K1, #7					// L = L % 8
508	ble	sgemm_kernel_L2_M4_100
509
510sgemm_kernel_L2_M4_42:
511
512	KERNEL4x2_SUB
513
514	subs	L, L, #1
515	bgt	sgemm_kernel_L2_M4_42
516
517sgemm_kernel_L2_M4_100:
518
519	SAVE4x2
520
521sgemm_kernel_L2_M4_END:
522
523	subs	I, I, #1
524	bgt	sgemm_kernel_L2_M4_20
525
526
527sgemm_kernel_L2_M2_BEGIN:
528
529	ldr	I, M
530	tst	I , #3
531	ble	sgemm_kernel_L2_END
532
533	tst	I, #2					// I = I / 2
534	ble	sgemm_kernel_L2_M1_BEGIN
535
536sgemm_kernel_L2_M2_20:
537
538	INIT2x2
539
540	mov	BO, BC
541	asrs	L , K1, #3					// L = L / 8
542	ble	sgemm_kernel_L2_M2_40
543
544sgemm_kernel_L2_M2_22:
545
546	KERNEL2x2_SUB
547	KERNEL2x2_SUB
548	KERNEL2x2_SUB
549	KERNEL2x2_SUB
550
551	KERNEL2x2_SUB
552	KERNEL2x2_SUB
553	KERNEL2x2_SUB
554	KERNEL2x2_SUB
555
556	subs	L, L, #1
557	bgt	sgemm_kernel_L2_M2_22
558
559
560sgemm_kernel_L2_M2_40:
561
562	ands	L , K1, #7					// L = L % 8
563	ble	sgemm_kernel_L2_M2_100
564
565sgemm_kernel_L2_M2_42:
566
567	KERNEL2x2_SUB
568
569	subs	L, L, #1
570	bgt	sgemm_kernel_L2_M2_42
571
572sgemm_kernel_L2_M2_100:
573
574	SAVE2x2
575
576sgemm_kernel_L2_M2_END:
577
578
579sgemm_kernel_L2_M1_BEGIN:
580
581	tst	I, #1					// I = I % 2
582	ble	sgemm_kernel_L2_END
583
584sgemm_kernel_L2_M1_20:
585
586	INIT1x2
587
588	mov	BO, BC
589	asrs	L , K1, #3					// L = L / 8
590	ble	sgemm_kernel_L2_M1_40
591
592sgemm_kernel_L2_M1_22:
593	KERNEL1x2_SUB
594	KERNEL1x2_SUB
595	KERNEL1x2_SUB
596	KERNEL1x2_SUB
597
598	KERNEL1x2_SUB
599	KERNEL1x2_SUB
600	KERNEL1x2_SUB
601	KERNEL1x2_SUB
602
603	subs	L, L, #1
604	bgt	sgemm_kernel_L2_M1_22
605
606
607sgemm_kernel_L2_M1_40:
608
609	ands	L , K1, #7					// L = L % 8
610	ble	sgemm_kernel_L2_M1_100
611
612sgemm_kernel_L2_M1_42:
613
614	KERNEL1x2_SUB
615
616	subs	L, L, #1
617	bgt	sgemm_kernel_L2_M1_42
618
619sgemm_kernel_L2_M1_100:
620
621	SAVE1x2
622
623
624sgemm_kernel_L2_END:
625
626	mov	r3, BC
627	mov	r4, K1
628	lsl	r4, r4, #3					// k * 2 * 4
629	add	r3, r3, r4					// B = B + K * 2 * 4
630	mov	BC, r3
631
632	subs	J , #1						// j--
633	bgt	sgemm_kernel_L2_BEGIN
634
635/*********************************************************************************************/
636
637sgemm_kernel_L1_BEGIN:
638
639	ldr	J , N
640	tst	J , #1
641	ble	sgemm_kernel_L999
642
643
644	ldr	CO1, C						// CO1 = C
645	ldr	r4 , LDC
646	add	r3 , r4, CO1
647	str	r3 , C						// store C
648
649	ldr	AO, A						// AO = A
650
651
652
653sgemm_kernel_L1_M4_BEGIN:
654
655	ldr	I, M
656	asrs	I, I, #2					// I = I / 4
657	ble	sgemm_kernel_L1_M2_BEGIN
658
659sgemm_kernel_L1_M4_20:
660
661	INIT4x1
662
663	mov	BO, BC
664	asrs	L , K1, #3					// L = L / 8
665	ble	sgemm_kernel_L1_M4_40
666	.align 5
667
668sgemm_kernel_L1_M4_22:
669	KERNEL4x1_SUB
670	KERNEL4x1_SUB
671	KERNEL4x1_SUB
672	KERNEL4x1_SUB
673
674	KERNEL4x1_SUB
675	KERNEL4x1_SUB
676	KERNEL4x1_SUB
677	KERNEL4x1_SUB
678
679	subs	L, L, #1
680	bgt	sgemm_kernel_L1_M4_22
681
682
683sgemm_kernel_L1_M4_40:
684
685	ands	L , K1, #7					// L = L % 8
686	ble	sgemm_kernel_L1_M4_100
687
688sgemm_kernel_L1_M4_42:
689
690	KERNEL4x1_SUB
691
692	subs	L, L, #1
693	bgt	sgemm_kernel_L1_M4_42
694
695sgemm_kernel_L1_M4_100:
696
697	SAVE4x1
698
699sgemm_kernel_L1_M4_END:
700
701	subs	I, I, #1
702	bgt	sgemm_kernel_L1_M4_20
703
704
705sgemm_kernel_L1_M2_BEGIN:
706
707	ldr	I, M
708	tst	I , #3
709	ble	sgemm_kernel_L1_END
710
711	tst	I, #2					// I = I / 2
712	ble	sgemm_kernel_L1_M1_BEGIN
713
714sgemm_kernel_L1_M2_20:
715
716	INIT2x1
717
718	mov	BO, BC
719	asrs	L , K1, #3					// L = L / 8
720	ble	sgemm_kernel_L1_M2_40
721
722sgemm_kernel_L1_M2_22:
723
724	KERNEL2x1_SUB
725	KERNEL2x1_SUB
726	KERNEL2x1_SUB
727	KERNEL2x1_SUB
728
729	KERNEL2x1_SUB
730	KERNEL2x1_SUB
731	KERNEL2x1_SUB
732	KERNEL2x1_SUB
733
734	subs	L, L, #1
735	bgt	sgemm_kernel_L1_M2_22
736
737
738sgemm_kernel_L1_M2_40:
739
740	ands	L , K1, #7					// L = L % 8
741	ble	sgemm_kernel_L1_M2_100
742
743sgemm_kernel_L1_M2_42:
744
745	KERNEL2x1_SUB
746
747	subs	L, L, #1
748	bgt	sgemm_kernel_L1_M2_42
749
750sgemm_kernel_L1_M2_100:
751
752	SAVE2x1
753
754sgemm_kernel_L1_M2_END:
755
756
757sgemm_kernel_L1_M1_BEGIN:
758
759	tst	I, #1					// I = I % 2
760	ble	sgemm_kernel_L1_END
761
762sgemm_kernel_L1_M1_20:
763
764	INIT1x1
765
766	mov	BO, BC
767	asrs	L , K1, #3					// L = L / 8
768	ble	sgemm_kernel_L1_M1_40
769
770sgemm_kernel_L1_M1_22:
771
772	KERNEL1x1_SUB
773	KERNEL1x1_SUB
774	KERNEL1x1_SUB
775	KERNEL1x1_SUB
776
777	KERNEL1x1_SUB
778	KERNEL1x1_SUB
779	KERNEL1x1_SUB
780	KERNEL1x1_SUB
781
782	subs	L, L, #1
783	bgt	sgemm_kernel_L1_M1_22
784
785
786sgemm_kernel_L1_M1_40:
787
788	ands	L , K1, #7					// L = L % 8
789	ble	sgemm_kernel_L1_M1_100
790
791sgemm_kernel_L1_M1_42:
792
793	KERNEL1x1_SUB
794
795	subs	L, L, #1
796	bgt	sgemm_kernel_L1_M1_42
797
798sgemm_kernel_L1_M1_100:
799
800	SAVE1x1
801
802
803sgemm_kernel_L1_END:
804
805
806sgemm_kernel_L999:
807
808	sub	r3, fp, #128
809	vldm	r3, { s8 - s15}					// restore floating point registers
810
811	movs	r0, #0						// set return value
812	sub	sp, fp, #24
813	pop	{r4 - r9, fp}
814	bx	lr
815
816	EPILOGUE
817
818