1/*******************************************************************************
2Copyright (c) 2015, The OpenBLAS Project
3All rights reserved.
4Redistribution and use in source and binary forms, with or without
5modification, are permitted provided that the following conditions are
6met:
71. Redistributions of source code must retain the above copyright
8notice, this list of conditions and the following disclaimer.
92. Redistributions in binary form must reproduce the above copyright
10notice, this list of conditions and the following disclaimer in
11the documentation and/or other materials provided with the
12distribution.
133. Neither the name of the OpenBLAS project nor the names of
14its contributors may be used to endorse or promote products
15derived from this software without specific prior written permission.
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*******************************************************************************/
27
28#define ASSEMBLER
29#include "common.h"
30
31/*                   X0          X1          X2          s0        X3        x4       x5           x6               x7 */
32/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */
33
/*
 * General-purpose register aliases.
 *
 * x0-x7 hold the incoming arguments (see the CNAME prototype above),
 * x8-x17 are scratch.  tempK is kept in x19 rather than x18: AAPCS64
 * reserves x18 as the platform register, so it must not carry kernel
 * state on targets that claim it.  x19 is callee-saved and is already
 * spilled and reloaded by the stp/ldp pairs of this routine.
 */
#define origM		x0
#define origN		x1
#define origK		x2
#define origPA		x3
#define origPB		x4
#define pC		x5
#define LDC		x6
#define offset		x7
#define counterL	x8
#define counterI	x9
#define counterJ	x10
#define pB		x11
#define pCRow0		x12
#define pCRow1		x13
#define pCRow2		x14
#define pA		x15
#define temp		x16
#define tempOffset	x17
#define tempK		x19

/* All four alpha aliases receive the same scalar, copied from s0 on entry. */
#define alpha0		s10
#define alphaV0		v10.s[0]
#define alpha1		s11
#define alphaV1		v11.s[0]
#define alpha2		s14
#define alphaV2		v14.s[0]
#define alpha3		s15
#define alphaV3		v15.s[0]

// 00 origM
// 01 origN
// 02 origK
// 03 origPA
// 04 origPB
// 05 pC
// 06 origLDC -> LDC
// 07 offset
// 08 counterL
// 09 counterI
// 10 counterJ
// 11 pB
// 12 pCRow0
// 13 pCRow1
// 14 pCRow2
// 15 pA
// 16 temp
// 17 tempOffset
// 18 platform register (AAPCS64), not used for kernel state
// 19 must save tempK
// 20 must save
// 21 must save
// 22 must save
// 23 must save
// 24 must save
// 25 must save
// 26 must save
// 27 must save
// 28 must save
// 29 frame
// 30 link
// 31 sp
95
96//v00 ALPHA -> pA00, pA01
97//v01 pA02, pA03
98//v02
99//v03
100//v04 pA10, pA11
101//v05 pA12, pA13
102//v06
103//v07
104//v08 must save pB00, pB01
105//v09 must save pB02, pB03
106//v10 must save ALPHA0
107//v11 must save ALPHA1
108//v12 must save pB10, pB11
109//v13 must save pB12, pB13
110//v14 must save ALPHA2
111//v15 must save ALPHA3
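//v16 C00, C01 (d16 is spilled by the prologue, although AAPCS64 only requires v8-v15 to be preserved)
//v17 C02, C03 (d17 is spilled by the prologue, although AAPCS64 only requires v8-v15 to be preserved)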
114//v18
115//v19
116//v20 C10, C11
117//v21 C12, C13
118//v22
119//v23
120//v24 C20, C21
121//v25 C22, C23
122//v26
123//v27
124//v28 C30, C31
125//v29 C32, C33
126//v30
127//v31
128
129/*******************************************************************************
130* Macro definitions
131*******************************************************************************/
132
/*
 * INIT4x4: clear the eight accumulator registers of the 4x4 tile
 * (v16/v17, v20/v21, v24/v25, v28/v29).  Each register ends up all-zero.
 */
.macro INIT4x4
	movi	v16.2s, #0
	movi	v17.2s, #0
	movi	v20.2s, #0
	movi	v21.2s, #0
	movi	v24.2s, #0
	movi	v25.2s, #0
	movi	v28.2s, #0
	movi	v29.2s, #0
.endm
143
/*
 * KERNEL4x4_I: opening step of the software-pipelined 4x4 loop.
 * Loads four floats from pA (v0, v1) and four from pB (v8, v9), then
 * writes all eight accumulators with fmul, so no INIT4x4 is needed
 * before it.  Finally preloads the next four floats of each operand
 * into v4/v5 and v12/v13 for a following KERNEL4x4_M2 or KERNEL4x4_E.
 * pA and pB each advance by 32 bytes in total.
 */
.macro KERNEL4x4_I
	ld1	{v8.2s, v9.2s}, [pB]
	add	pB, pB, #16
	ld1	{v0.2s, v1.2s}, [pA]
	add	pA, pA, #16

	fmul	v16.2s, v0.2s, v8.s[0]
	fmul	v29.2s, v1.2s, v9.s[1]

	fmul	v20.2s, v0.2s, v8.s[1]
	fmul	v25.2s, v1.2s, v9.s[0]

	fmul	v24.2s, v0.2s, v9.s[0]
	fmul	v21.2s, v1.2s, v8.s[1]

	fmul	v28.2s, v0.2s, v9.s[1]
	fmul	v17.2s, v1.2s, v8.s[0]

	// Preload the operands consumed by the next pipeline stage.
	ld1	{v12.2s, v13.2s}, [pB]
	add	pB, pB, #16
	ld1	{v4.2s, v5.2s}, [pA]
	add	pA, pA, #16
.endm
167
/*
 * KERNEL4x4_M1: pipeline stage that accumulates from the v0/v1 (pA)
 * and v8/v9 (pB) operand set while loading the other set (v4/v5,
 * v12/v13) for the next stage.  Loads are interleaved with the fmla
 * instructions.  Advances pA and pB by 16 bytes each and issues a
 * prefetch 512 bytes ahead of pB.
 */
.macro KERNEL4x4_M1
	fmla	v16.2s, v0.2s, v8.s[0]
	fmla	v29.2s, v1.2s, v9.s[1]

	ld1	{v12.2s, v13.2s}, [pB]		// For next round
	add	pB, pB, #16

	fmla	v20.2s, v0.2s, v8.s[1]
	fmla	v25.2s, v1.2s, v9.s[0]

	ld1	{v4.2s, v5.2s}, [pA]		// For next round
	add	pA, pA, #16

	fmla	v24.2s, v0.2s, v9.s[0]
	fmla	v21.2s, v1.2s, v8.s[1]

	prfm	PLDL1KEEP, [pB, #512]

	fmla	v28.2s, v0.2s, v9.s[1]
	fmla	v17.2s, v1.2s, v8.s[0]
.endm
189
/*
 * KERNEL4x4_M2: counterpart of KERNEL4x4_M1.  Accumulates from the
 * v4/v5 (pA) and v12/v13 (pB) operand set while reloading v0/v1 and
 * v8/v9 for the next stage.  Advances pA and pB by 16 bytes each and
 * issues a prefetch 512 bytes ahead of pA.
 */
.macro KERNEL4x4_M2
	fmla	v16.2s, v4.2s, v12.s[0]
	fmla	v29.2s, v5.2s, v13.s[1]

	ld1	{v8.2s, v9.2s}, [pB]		// For next round
	add	pB, pB, #16

	fmla	v20.2s, v4.2s, v12.s[1]
	fmla	v25.2s, v5.2s, v13.s[0]

	ld1	{v0.2s, v1.2s}, [pA]		// For next round
	add	pA, pA, #16

	fmla	v24.2s, v4.2s, v13.s[0]
	fmla	v21.2s, v5.2s, v12.s[1]

	prfm	PLDL1KEEP, [pA, #512]

	fmla	v28.2s, v4.2s, v13.s[1]
	fmla	v17.2s, v5.2s, v12.s[0]
.endm
211
/*
 * KERNEL4x4_E: closing stage of the pipelined 4x4 loop.  Accumulates
 * from the already loaded v4/v5 and v12/v13 operand set; performs no
 * loads and leaves pA and pB untouched.
 */
.macro KERNEL4x4_E
	fmla	v16.2s, v4.2s, v12.s[0]
	fmla	v29.2s, v5.2s, v13.s[1]

	fmla	v20.2s, v4.2s, v12.s[1]
	fmla	v25.2s, v5.2s, v13.s[0]

	fmla	v24.2s, v4.2s, v13.s[0]
	fmla	v21.2s, v5.2s, v12.s[1]

	fmla	v28.2s, v4.2s, v13.s[1]
	fmla	v17.2s, v5.2s, v12.s[0]
.endm
225
/*
 * KERNEL4x4_SUB: one stand-alone K step of the 4x4 tile.
 * Reads four floats from pA (v0, v1) and four from pB (v8, v9), adds
 * the sixteen products into the accumulators and advances both
 * pointers by 16 bytes (post-indexed loads).
 */
.macro KERNEL4x4_SUB
	ld1	{v8.2s, v9.2s}, [pB], #16
	ld1	{v0.2s, v1.2s}, [pA], #16

	// products with pB element 0
	fmla	v16.2s, v0.2s, v8.s[0]
	fmla	v17.2s, v1.2s, v8.s[0]

	// products with pB element 1
	fmla	v20.2s, v0.2s, v8.s[1]
	fmla	v21.2s, v1.2s, v8.s[1]

	// products with pB element 2
	fmla	v24.2s, v0.2s, v9.s[0]
	fmla	v25.2s, v1.2s, v9.s[0]

	// products with pB element 3
	fmla	v28.2s, v0.2s, v9.s[1]
	fmla	v29.2s, v1.2s, v9.s[1]
.endm
244
/*
 * SAVE4x4: scale the 4x4 accumulators by alpha and store them to four
 * lines of C that are LDC bytes apart, starting at pCRow0.  C is
 * overwritten, not read.  On exit pCRow0 has advanced by 16 bytes;
 * pCRow1, pCRow2, v8, v9, v12 and v13 are clobbered.
 */
.macro SAVE4x4
	add	pCRow1, pCRow0, LDC		// address of line 1, taken before pCRow0 moves

	// line 0
	fmul	v8.2s, v16.2s, alphaV0
	fmul	v9.2s, v17.2s, alphaV1
	st1	{v8.2s, v9.2s}, [pCRow0], #16

	// line 1
	add	pCRow2, pCRow1, LDC
	fmul	v12.2s, v20.2s, alphaV2
	fmul	v13.2s, v21.2s, alphaV3
	st1	{v12.2s, v13.2s}, [pCRow1]

	// line 2
	add	pCRow1, pCRow2, LDC
	fmul	v8.2s, v24.2s, alphaV0
	fmul	v9.2s, v25.2s, alphaV1
	st1	{v8.2s, v9.2s}, [pCRow2]

	// line 3
	fmul	v12.2s, v28.2s, alphaV2
	fmul	v13.2s, v29.2s, alphaV3
	st1	{v12.2s, v13.2s}, [pCRow1]
.endm
267
268/******************************************************************************/
269
/* INIT2x4: clear the four accumulators of the 2x4 tile (v16, v20, v24, v28). */
.macro INIT2x4
	movi	v16.2s, #0
	movi	v20.2s, #0
	movi	v24.2s, #0
	movi	v28.2s, #0
.endm
276
/*
 * KERNEL2x4_SUB: one K step of the 2x4 tile.  Reads two floats from
 * pA (v0) and four from pB (v8, v9), accumulates the eight products
 * and advances pA by 8 bytes and pB by 16 bytes.
 */
.macro KERNEL2x4_SUB
	ld1	{v0.2s}, [pA], #8
	ld1	{v8.2s, v9.2s}, [pB], #16

	fmla	v16.2s, v0.2s, v8.s[0]
	fmla	v20.2s, v0.2s, v8.s[1]
	fmla	v24.2s, v0.2s, v9.s[0]
	fmla	v28.2s, v0.2s, v9.s[1]
.endm
288
/*
 * SAVE2x4: scale the 2x4 accumulators by alpha and store two floats to
 * each of four lines of C spaced LDC bytes apart, starting at pCRow0.
 * pCRow0 advances by 8 bytes; pCRow1, pCRow2, v8 and v12 are clobbered.
 */
.macro SAVE2x4
	add	pCRow1, pCRow0, LDC		// line 1 address, taken before pCRow0 moves

	fmul	v8.2s, v16.2s, alphaV0
	st1	{v8.2s}, [pCRow0], #8		// line 0

	add	pCRow2, pCRow1, LDC
	fmul	v12.2s, v20.2s, alphaV1
	st1	{v12.2s}, [pCRow1]		// line 1

	add	pCRow1, pCRow2, LDC
	fmul	v8.2s, v24.2s, alphaV2
	st1	{v8.2s}, [pCRow2]		// line 2

	fmul	v12.2s, v28.2s, alphaV3
	st1	{v12.2s}, [pCRow1]		// line 3
.endm
307
308/******************************************************************************/
309
/* INIT1x4: clear the two accumulators of the 1x4 tile (v16, v20). */
.macro INIT1x4
	movi	v16.2s, #0
	movi	v20.2s, #0
.endm
314
/*
 * KERNEL1x4_SUB: one K step of the 1x4 tile.  Reads one float from pA
 * (s0) and four from pB (v8, v9); v16 accumulates the products with
 * pB elements 0-1 and v20 those with elements 2-3.  pA advances by
 * 4 bytes, pB by 16 bytes.
 */
.macro KERNEL1x4_SUB
	ld1	{v8.2s, v9.2s}, [pB], #16
	ldr	s0, [pA], #4

	fmla	v16.2s, v8.2s, v0.s[0]
	fmla	v20.2s, v9.2s, v0.s[0]
.endm
325
/*
 * SAVE1x4: scale the 1x4 accumulators by alpha and store one float to
 * each of four lines of C spaced LDC bytes apart, starting at pCRow0.
 * pCRow0 advances by 4 bytes; pCRow1, pCRow2, v8 and v12 are clobbered.
 */
.macro SAVE1x4
	fmul	v8.2s, v16.2s, alphaV0		// lanes go to lines 0 and 1
	fmul	v12.2s, v20.2s, alphaV1		// lanes go to lines 2 and 3

	add	pCRow1, pCRow0, LDC
	st1	{v8.s}[0], [pCRow0], #4
	st1	{v8.s}[1], [pCRow1]

	add	pCRow2, pCRow1, LDC
	add	pCRow1, pCRow2, LDC
	st1	{v12.s}[0], [pCRow2]
	st1	{v12.s}[1], [pCRow1]
.endm
340
341/******************************************************************************/
342
/* INIT4x2: clear the four accumulators of the 4x2 tile (v16, v17, v20, v21). */
.macro INIT4x2
	movi	v16.2s, #0
	movi	v17.2s, #0
	movi	v20.2s, #0
	movi	v21.2s, #0
.endm
349
/*
 * KERNEL4x2_SUB: one K step of the 4x2 tile.  Reads four floats from
 * pA (v0, v1) and two from pB (v8), accumulates the eight products and
 * advances pA by 16 bytes and pB by 8 bytes.
 */
.macro KERNEL4x2_SUB
	ld1	{v0.2s, v1.2s}, [pA], #16
	ld1	{v8.2s}, [pB], #8

	// products with pB element 0
	fmla	v16.2s, v0.2s, v8.s[0]
	fmla	v17.2s, v1.2s, v8.s[0]

	// products with pB element 1
	fmla	v20.2s, v0.2s, v8.s[1]
	fmla	v21.2s, v1.2s, v8.s[1]
.endm
361
/*
 * SAVE4x2: scale the 4x2 accumulators by alpha and store four floats
 * to each of two lines of C spaced LDC bytes apart, starting at pCRow0.
 * pCRow0 advances by 16 bytes; pCRow1, v8, v9, v12 and v13 are clobbered.
 */
.macro SAVE4x2
	add	pCRow1, pCRow0, LDC		// line 1 address, taken before pCRow0 moves

	fmul	v8.2s, v16.2s, alphaV0
	fmul	v9.2s, v17.2s, alphaV1
	st1	{v8.2s, v9.2s}, [pCRow0], #16

	fmul	v12.2s, v20.2s, alphaV2
	fmul	v13.2s, v21.2s, alphaV3
	st1	{v12.2s, v13.2s}, [pCRow1]
.endm
374
375/******************************************************************************/
376
/* INIT2x2: clear the two accumulators of the 2x2 tile (v16, v20). */
.macro INIT2x2
	movi	v16.2s, #0
	movi	v20.2s, #0
.endm
381
/*
 * KERNEL2x2_SUB: one K step of the 2x2 tile.  Reads two floats from pA
 * (v0) and two from pB (v8), accumulates the four products and
 * advances both pointers by 8 bytes.
 */
.macro KERNEL2x2_SUB
	ld1	{v0.2s}, [pA], #8
	ld1	{v8.2s}, [pB], #8

	fmla	v16.2s, v0.2s, v8.s[0]
	fmla	v20.2s, v0.2s, v8.s[1]
.endm
392
/*
 * SAVE2x2: scale the 2x2 accumulators by alpha and store two floats to
 * each of two lines of C spaced LDC bytes apart, starting at pCRow0.
 * pCRow0 advances by 8 bytes; pCRow1, v8 and v12 are clobbered.
 */
.macro SAVE2x2
	add	pCRow1, pCRow0, LDC		// line 1 address, taken before pCRow0 moves

	fmul	v8.2s, v16.2s, alphaV0
	st1	{v8.2s}, [pCRow0], #8

	fmul	v12.2s, v20.2s, alphaV1
	st1	{v12.2s}, [pCRow1]
.endm
403
404/******************************************************************************/
405
/* INIT1x2: clear the single accumulator of the 1x2 tile (v16). */
.macro INIT1x2
	movi	v16.2s, #0
.endm
409
/*
 * KERNEL1x2_SUB: one K step of the 1x2 tile.  Reads one float from pA
 * (s0) and two from pB (v8), accumulates both products into v16 and
 * advances pA by 4 bytes and pB by 8 bytes.
 */
.macro KERNEL1x2_SUB
	ldr	s0, [pA], #4
	ld1	{v8.2s}, [pB], #8

	fmla	v16.2s, v8.2s, v0.s[0]
.endm
419
/*
 * SAVE1x2: scale the 1x2 accumulator by alpha and store one float to
 * each of two lines of C spaced LDC bytes apart, starting at pCRow0.
 * pCRow0 advances by 4 bytes; pCRow1 and v8 are clobbered.
 */
.macro SAVE1x2
	fmul	v8.2s, v16.2s, alphaV0
	add	pCRow1, pCRow0, LDC		// line 1 address, taken before pCRow0 moves

	st1	{v8.s}[0], [pCRow0], #4
	st1	{v8.s}[1], [pCRow1]
.endm
428
429/******************************************************************************/
430
/* INIT4x1: clear the two accumulators of the 4x1 tile (v16, v17). */
.macro INIT4x1
	movi	v16.2s, #0
	movi	v17.2s, #0
.endm
435
/*
 * KERNEL4x1_SUB: one K step of the 4x1 tile.  Reads four floats from
 * pA (v0, v1) and one from pB (s8), accumulates the four products and
 * advances pA by 16 bytes and pB by 4 bytes.
 */
.macro KERNEL4x1_SUB
	ld1	{v0.2s, v1.2s}, [pA], #16
	ldr	s8, [pB], #4

	fmla	v16.2s, v0.2s, v8.s[0]
	fmla	v17.2s, v1.2s, v8.s[0]
.endm
446
/*
 * SAVE4x1: scale the 4x1 accumulators by alpha and store four floats
 * at pCRow0, which then advances by 16 bytes.  v8 and v9 are clobbered.
 */
.macro SAVE4x1
	fmul	v8.2s, v16.2s, alphaV0
	fmul	v9.2s, v17.2s, alphaV1
	st1	{v8.2s, v9.2s}, [pCRow0], #16
.endm
454
455
456
457
458/******************************************************************************/
459
/* INIT2x1: clear the single accumulator of the 2x1 tile (v16). */
.macro INIT2x1
	movi	v16.2s, #0
.endm
463
/*
 * KERNEL2x1_SUB: one K step of the 2x1 tile.  Reads two floats from pA
 * (v0) and one from pB (s8), accumulates both products into v16 and
 * advances pA by 8 bytes and pB by 4 bytes.
 */
.macro KERNEL2x1_SUB
	ld1	{v0.2s}, [pA], #8
	ldr	s8, [pB], #4

	fmla	v16.2s, v0.2s, v8.s[0]
.endm
473
/*
 * SAVE2x1: scale the 2x1 accumulator by alpha and store two floats at
 * pCRow0, which then advances by 8 bytes.  v8 is clobbered.
 */
.macro SAVE2x1
	fmul	v8.2s, v16.2s, alphaV0
	st1	{v8.2s}, [pCRow0], #8
.endm
480
481/******************************************************************************/
482
/* INIT1x1: clear the scalar accumulator of the 1x1 tile (s16, whole v16 zeroed). */
.macro INIT1x1
	movi	v16.2s, #0
.endm
486
/*
 * KERNEL1x1_SUB: one K step of the 1x1 tile.  Reads one float from pA
 * (s0) and one from pB (s8), performs s16 += s0 * s8 with a fused
 * multiply-add and advances both pointers by 4 bytes.
 */
.macro KERNEL1x1_SUB
	ldr	s0, [pA], #4
	ldr	s8, [pB], #4

	fmadd	s16, s0, s8, s16
.endm
496
/*
 * SAVE1x1: scale the scalar accumulator by alpha and store it at
 * pCRow0, which then advances by 4 bytes.  s8 is clobbered.
 */
.macro SAVE1x1
	fmul	s8, s16, alpha0
	str	s8, [pCRow0], #4
.endm
503
504/*******************************************************************************
505* End of macro definitions
506*******************************************************************************/
507
508	PROLOGUE
509
510.Lstrmm_kernel_begin:
511
512	.align 5
513	add	sp, sp, #-(11 * 16)
514	stp	d8, d9, [sp, #(0 * 16)]
515	stp	d10, d11, [sp, #(1 * 16)]
516	stp	d12, d13, [sp, #(2 * 16)]
517	stp	d14, d15, [sp, #(3 * 16)]
518	stp	d16, d17, [sp, #(4 * 16)]
519	stp	x18, x19, [sp, #(5 * 16)]
520	stp	x20, x21, [sp, #(6 * 16)]
521	stp	x22, x23, [sp, #(7 * 16)]
522	stp	x24, x25, [sp, #(8 * 16)]
523	stp	x26, x27, [sp, #(9 * 16)]
524	str	x28, [sp, #(10 * 16)]
525
526	fmov	alpha0, s0
527	fmov	alpha1, s0
528	fmov	alpha2, s0
529	fmov	alpha3, s0
530
531	lsl	LDC, LDC, #2			// ldc = ldc * 4
532
533#if !defined(LEFT)
534	neg	tempOffset, offset
535#endif
536
537	mov	pB, origPB
538
539	mov	counterJ, origN
540	asr 	counterJ, counterJ, #2		// J = J / 4
541	cmp 	counterJ, #0
542	ble	.Lstrmm_kernel_L2_BEGIN
543
544/******************************************************************************/
545
/*
 * Outer loop: one pass per group of four LDC-strided lines of C
 * (counterJ = origN / 4 passes).  pCRow0 walks along the current group
 * while pC is moved on to the next group.
 */
.Lstrmm_kernel_L4_BEGIN:
	mov	pCRow0, pC			// pCRow0 = C
	add	pC, pC, LDC, lsl #2		// pC += 4 * LDC (LDC is in bytes here)

#if defined(LEFT)
	mov	tempOffset, offset
#endif

	mov	pA, origPA			// pA = start of A array

/* 4x4 tiles: origM / 4 iterations. */
.Lstrmm_kernel_L4_M4_BEGIN:

	mov	counterI, origM
	asr 	counterI, counterI, #2		// counterI = counterI / 4
	cmp 	counterI, #0
	ble	.Lstrmm_kernel_L4_M2_BEGIN

.Lstrmm_kernel_L4_M4_20:

	// Position pB (and pA): either at the start of the B panel, or
	// tempOffset K steps in (16 bytes per step for both operands).
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	pB, origPB
#else
	mov	pB, origPB
	lsl	temp, tempOffset, #4
	add	pB, pB, temp
	add	pA, pA, temp
#endif

	// tempK = number of K steps to run for this tile.
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	tempK, origK, tempOffset
#elif defined(LEFT)
	add	tempK, tempOffset, #4
#else
	add	tempK, tempOffset, #4
#endif

	asr 	counterL , tempK, #1		// L = K / 2
	cmp	counterL , #2			// is there at least 4 to do?
	blt	.Lstrmm_kernel_L4_M4_32

	// Pipelined path: I + M2 cover the first two K steps.
	KERNEL4x4_I				// do one in the K
	KERNEL4x4_M2				// do another in the K

	subs	counterL, counterL, #2
	ble	.Lstrmm_kernel_L4_M4_22a
	.align 5

	// Steady state: two K steps per iteration.
.Lstrmm_kernel_L4_M4_22:

	KERNEL4x4_M1
	KERNEL4x4_M2

	subs	counterL, counterL, #1
	bgt	.Lstrmm_kernel_L4_M4_22

	// Drain: final two K steps, the last one without loads.
.Lstrmm_kernel_L4_M4_22a:

	KERNEL4x4_M1
	KERNEL4x4_E

	b	 .Lstrmm_kernel_L4_M4_44

	// Short path (counterL < 2): run one I + E pair if bit 0 is set.
.Lstrmm_kernel_L4_M4_32:

	tst	counterL, #1
	ble	.Lstrmm_kernel_L4_M4_40		// taken when bit 0 is clear

	KERNEL4x4_I
	KERNEL4x4_E

	b	.Lstrmm_kernel_L4_M4_44

	// No pair was executed: the accumulators still need zeroing.
.Lstrmm_kernel_L4_M4_40:

	INIT4x4

.Lstrmm_kernel_L4_M4_44:

	ands	counterL , tempK, #1		// odd tempK: one K step left over
	ble	.Lstrmm_kernel_L4_M4_100

.Lstrmm_kernel_L4_M4_46:

	KERNEL4x4_SUB

.Lstrmm_kernel_L4_M4_100:

	SAVE4x4

	// Step pA and pB over the (origK - tempOffset - 4) K steps that
	// were not executed for this tile (16 bytes per step each).
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub	tempK, origK, tempOffset
#if defined(LEFT)
	sub	tempK, tempK, #4
#else
	sub	tempK, tempK, #4
#endif
	lsl	temp, tempK, #4
	add	pA, pA, temp
	add	pB, pB, temp
#endif
#if defined(LEFT)
	add	tempOffset, tempOffset, #4
#endif

.Lstrmm_kernel_L4_M4_END:
	subs	counterI, counterI, #1
	bne	.Lstrmm_kernel_L4_M4_20
653
/*
 * Remainder of M inside a group of four C lines: at most one 2x4 tile
 * (bit 1 of origM), handled here, followed by at most one 1x4 tile.
 */
.Lstrmm_kernel_L4_M2_BEGIN:

	mov	counterI, origM
	tst	counterI , #3			// origM % 4 == 0: nothing left in M
	ble	.Lstrmm_kernel_L4_END

	tst	counterI, #2			// bit 1 clear: no 2-wide tile
	ble	.Lstrmm_kernel_L4_M1_BEGIN

.Lstrmm_kernel_L4_M2_20:

	INIT2x4

	// Position the operands: start of the B panel, or tempOffset K
	// steps in (8 bytes per step in A, 16 bytes per step in B).
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	pB, origPB
#else
	mov	pB, origPB
	lsl	temp, tempOffset, #3
	add	pA, pA, temp
	lsl	temp, tempOffset, #4
	add	pB, pB, temp
#endif

	// tempK = number of K steps to run for this tile.
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	tempK, origK, tempOffset
#elif defined(LEFT)
	add	tempK, tempOffset, #2
#else
	add	tempK, tempOffset, #4
#endif

	asr 	counterL , tempK, #3		// counterL = tempK / 8
	cmp	counterL , #0
	ble	.Lstrmm_kernel_L4_M2_40

	// Main loop, unrolled eight K steps per iteration.
.Lstrmm_kernel_L4_M2_22:

	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB

	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB

	subs	counterL, counterL, #1
	bgt	.Lstrmm_kernel_L4_M2_22


.Lstrmm_kernel_L4_M2_40:

	ands	counterL , tempK, #7		// counterL = tempK % 8
	ble	.Lstrmm_kernel_L4_M2_100

	// Tail loop, one K step per iteration.
.Lstrmm_kernel_L4_M2_42:

	KERNEL2x4_SUB

	subs	counterL, counterL, #1
	bgt	.Lstrmm_kernel_L4_M2_42

.Lstrmm_kernel_L4_M2_100:

	SAVE2x4

	// Step pA and pB over the K steps that were not executed.
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub	tempK, origK, tempOffset
#if defined(LEFT)
	sub	tempK, tempK, #2
#else
	sub	tempK, tempK, #4
#endif
	lsl	temp, tempK, #3
	add	pA, pA, temp
	lsl	temp, tempK, #4
	add	pB, pB, temp
#endif
#if defined(LEFT)
	add	tempOffset, tempOffset, #2
#endif


.Lstrmm_kernel_L4_M2_END:
739
740
/*
 * Last 1x4 tile of the current group (bit 0 of origM), then the end
 * of the outer-loop body: advance origPB and loop on counterJ.
 */
.Lstrmm_kernel_L4_M1_BEGIN:

	tst	counterI, #1			// bit 0 clear: no 1-wide tile
	ble	.Lstrmm_kernel_L4_END

.Lstrmm_kernel_L4_M1_20:

	INIT1x4

	// Position the operands: start of the B panel, or tempOffset K
	// steps in (16 bytes per step in B, 4 bytes per step in A).
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	pB, origPB
#else
	mov	pB, origPB
	lsl	temp, tempOffset, #4
	add	pB, pB, temp
	lsl	temp, tempOffset, #2
	add	pA, pA, temp
#endif

	// tempK = number of K steps to run for this tile.
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	tempK, origK, tempOffset
#elif defined(LEFT)
	add	tempK, tempOffset, #1
#else
	add	tempK, tempOffset, #4
#endif

	asr 	counterL , tempK, #3		// counterL = tempK / 8
	cmp	counterL , #0
	ble	.Lstrmm_kernel_L4_M1_40

	// Main loop, unrolled eight K steps per iteration.
.Lstrmm_kernel_L4_M1_22:
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB

	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB

	subs	counterL, counterL, #1
	bgt	.Lstrmm_kernel_L4_M1_22


.Lstrmm_kernel_L4_M1_40:

	ands	counterL , tempK, #7		// counterL = tempK % 8
	ble	.Lstrmm_kernel_L4_M1_100

	// Tail loop, one K step per iteration.
.Lstrmm_kernel_L4_M1_42:

	KERNEL1x4_SUB

	subs	counterL, counterL, #1
	bgt	.Lstrmm_kernel_L4_M1_42

.Lstrmm_kernel_L4_M1_100:

	SAVE1x4

	// Step pA and pB over the K steps that were not executed.
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub	tempK, origK, tempOffset
#if defined(LEFT)
	sub	tempK, tempK, #1
#else
	sub	tempK, tempK, #4
#endif
	lsl	temp, tempK, #2
	add	pA, pA, temp
	lsl	temp, tempK, #4
	add	pB, pB, temp
#endif
#if defined(LEFT)
	add	tempOffset, tempOffset, #1
#endif


.Lstrmm_kernel_L4_END:
	add	origPB, origPB, origK, lsl #4	// B = B + K * 4 * 4

#if !defined(LEFT)
	add	tempOffset, tempOffset, #4
#endif

	subs	counterJ, counterJ , #1		// j--
	bgt	.Lstrmm_kernel_L4_BEGIN
829
830
831/******************************************************************************/
832
/*
 * Remainder of N: at most one group of two C lines (bit 1 of origN),
 * handled from here to .Lstrmm_kernel_L2_END, then at most one single
 * line.  This part covers the 4x2 tiles.
 */
.Lstrmm_kernel_L2_BEGIN:   // fewer than 4 left in N direction

	mov	counterJ , origN
	tst	counterJ , #3			// origN % 4 == 0: all done
	ble	.Lstrmm_kernel_L999

	tst	counterJ , #2			// bit 1 clear: no group of two
	ble	.Lstrmm_kernel_L1_BEGIN

	mov	pCRow0, pC			// pCRow0 = pC

	add	pC,pC,LDC, lsl #1		// pC += 2 * LDC

#if defined(LEFT)
	mov	tempOffset, offset
#endif

	mov	pA, origPA			// pA = A

.Lstrmm_kernel_L2_M4_BEGIN:

	mov	counterI, origM
	asr 	counterI, counterI, #2		// counterI = counterI / 4
	cmp	counterI,#0
	ble	.Lstrmm_kernel_L2_M2_BEGIN

.Lstrmm_kernel_L2_M4_20:

	INIT4x2

	// Position the operands: start of the B panel, or tempOffset K
	// steps in (8 bytes per step in B, 16 bytes per step in A).
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	pB, origPB
#else
	mov	pB, origPB
	lsl	temp, tempOffset, #3
	add	pB, pB, temp
	lsl	temp, tempOffset, #4
	add	pA, pA, temp
#endif

	// tempK = number of K steps to run for this tile.
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	tempK, origK, tempOffset
#elif defined(LEFT)
	add	tempK, tempOffset, #4
#else
	add	tempK, tempOffset, #2
#endif

	asr	counterL , tempK, #3		// counterL = tempK / 8
	cmp	counterL,#0
	ble	.Lstrmm_kernel_L2_M4_40
	.align 5

	// Main loop, unrolled eight K steps per iteration.
.Lstrmm_kernel_L2_M4_22:
	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB

	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB

	subs	counterL, counterL, #1
	bgt	.Lstrmm_kernel_L2_M4_22


.Lstrmm_kernel_L2_M4_40:

	ands	counterL , tempK, #7		// counterL = tempK % 8
	ble	.Lstrmm_kernel_L2_M4_100

	// Tail loop, one K step per iteration.
.Lstrmm_kernel_L2_M4_42:

	KERNEL4x2_SUB

	subs	counterL, counterL, #1
	bgt	.Lstrmm_kernel_L2_M4_42

.Lstrmm_kernel_L2_M4_100:

	SAVE4x2

	// Step pA and pB over the K steps that were not executed.
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub	tempK, origK, tempOffset
#if defined(LEFT)
	sub	tempK, tempK, #4
#else
	sub	tempK, tempK, #2
#endif
	lsl	temp, tempK, #4
	add	pA, pA, temp
	lsl	temp, tempK, #3
	add	pB, pB, temp
#endif
#if defined(LEFT)
	add	tempOffset, tempOffset, #4
#endif

.Lstrmm_kernel_L2_M4_END:

	subs	counterI, counterI, #1
	bgt	.Lstrmm_kernel_L2_M4_20
937
938
/*
 * Remainder of M inside the group of two C lines: at most one 2x2
 * tile (bit 1 of origM), handled here, followed by at most one 1x2 tile.
 */
.Lstrmm_kernel_L2_M2_BEGIN:

	mov	counterI, origM
	tst	counterI , #3			// origM % 4 == 0: nothing left in M
	ble	.Lstrmm_kernel_L2_END

	tst	counterI, #2			// bit 1 clear: no 2-wide tile
	ble	.Lstrmm_kernel_L2_M1_BEGIN

.Lstrmm_kernel_L2_M2_20:

	INIT2x2

	// Position the operands: start of the B panel, or tempOffset K
	// steps in (8 bytes per step in both B and A).
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	pB, origPB
#else
	mov	pB, origPB
	lsl	temp, tempOffset, #3
	add	pB, pB, temp
	lsl	temp, tempOffset, #3
	add	pA, pA, temp
#endif

	// tempK = number of K steps to run for this tile.
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	tempK, origK, tempOffset
#elif defined(LEFT)
	add	tempK, tempOffset, #2
#else
	add	tempK, tempOffset, #2
#endif

	asr	counterL , tempK, #3		// counterL = tempK / 8
        cmp	counterL,#0
	ble	.Lstrmm_kernel_L2_M2_40

	// Main loop, unrolled eight K steps per iteration.
.Lstrmm_kernel_L2_M2_22:

	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB

	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB

	subs	counterL, counterL, #1
	bgt	.Lstrmm_kernel_L2_M2_22


.Lstrmm_kernel_L2_M2_40:

	ands	counterL , tempK, #7		// counterL = tempK % 8
	ble	.Lstrmm_kernel_L2_M2_100

	// Tail loop, one K step per iteration.
.Lstrmm_kernel_L2_M2_42:

	KERNEL2x2_SUB

	subs	counterL, counterL, #1
	bgt	.Lstrmm_kernel_L2_M2_42

.Lstrmm_kernel_L2_M2_100:

	SAVE2x2
	// Step pA and pB over the K steps that were not executed.
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub	tempK, origK, tempOffset
#if defined(LEFT)
	sub	tempK, tempK, #2
#else
	sub	tempK, tempK, #2
#endif
	lsl	temp, tempK, #3
	add	pA, pA, temp
	lsl	temp, tempK, #3
	add	pB, pB, temp
#endif
#if defined(LEFT)
	add	tempOffset, tempOffset, #2
#endif

.Lstrmm_kernel_L2_M2_END:
1022
1023
/*
 * Last 1x2 tile of the group of two C lines (bit 0 of origM), then the
 * end of the group: bump tempOffset (non-LEFT builds) and advance
 * origPB past the two-wide B panel.
 */
.Lstrmm_kernel_L2_M1_BEGIN:

	tst	counterI, #1			// bit 0 clear: no 1-wide tile
	ble	.Lstrmm_kernel_L2_END

.Lstrmm_kernel_L2_M1_20:

	INIT1x2

	// Position the operands: start of the B panel, or tempOffset K
	// steps in (8 bytes per step in B, 4 bytes per step in A).
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	pB, origPB
#else
	mov	pB, origPB
	lsl	temp, tempOffset, #3
	add	pB, pB, temp
	lsl	temp, tempOffset, #2
	add	pA, pA, temp
#endif

	// tempK = number of K steps to run for this tile.
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	tempK, origK, tempOffset
#elif defined(LEFT)
	add	tempK, tempOffset, #1
#else
	add	tempK, tempOffset, #2
#endif

	asr 	counterL , tempK, #3		// counterL = tempK / 8
        cmp     counterL, #0
	ble	.Lstrmm_kernel_L2_M1_40

	// Main loop, unrolled eight K steps per iteration.
.Lstrmm_kernel_L2_M1_22:
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB

	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB

	subs	counterL, counterL, #1
	bgt	.Lstrmm_kernel_L2_M1_22


.Lstrmm_kernel_L2_M1_40:

	ands	counterL , tempK, #7		// counterL = tempK % 8
	ble	.Lstrmm_kernel_L2_M1_100

	// Tail loop, one K step per iteration.
.Lstrmm_kernel_L2_M1_42:

	KERNEL1x2_SUB

	subs	counterL, counterL, #1
	bgt	.Lstrmm_kernel_L2_M1_42

.Lstrmm_kernel_L2_M1_100:

	SAVE1x2

	// Step pA and pB over the K steps that were not executed.
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub	tempK, origK, tempOffset
#if defined(LEFT)
	sub	tempK, tempK, #1
#else
	sub	tempK, tempK, #2
#endif
	lsl	temp, tempK, #2
	add	pA, pA, temp
	lsl	temp, tempK, #3
	add	pB, pB, temp
#endif
#if defined(LEFT)
	add	tempOffset, tempOffset, #1
#endif

.Lstrmm_kernel_L2_END:
#if !defined(LEFT)
	add	tempOffset, tempOffset, #2
#endif
	add	origPB, origPB, origK, lsl #3	// B = B + K * 2 * 4
1107
1108/******************************************************************************/
1109
/*
 * Final single line of C (bit 0 of origN).  This part covers the 4x1
 * tiles; the 2x1 and 1x1 remainders follow.
 */
.Lstrmm_kernel_L1_BEGIN:

	mov	counterJ , origN
	tst	counterJ , #1			// bit 0 clear: no single line left
	ble	.Lstrmm_kernel_L999 // done


	mov	pCRow0, pC			// pCRow0 = C
	add	pC , pC , LDC			// Update pC to point to next

#if defined(LEFT)
	mov	tempOffset, offset
#endif

	mov	pA, origPA			// pA = A

.Lstrmm_kernel_L1_M4_BEGIN:

	mov	counterI, origM
	asr 	counterI, counterI, #2		// counterI = counterI / 4
	cmp	counterI, #0
	ble	.Lstrmm_kernel_L1_M2_BEGIN

.Lstrmm_kernel_L1_M4_20:

	INIT4x1

	// Position the operands: start of the B panel, or tempOffset K
	// steps in (4 bytes per step in B, 16 bytes per step in A).
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	pB, origPB
#else
	mov	pB, origPB
	lsl	temp, tempOffset, #2
	add	pB, pB, temp
	lsl	temp, tempOffset, #4
	add	pA, pA, temp
#endif

	// tempK = number of K steps to run for this tile.
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	tempK, origK, tempOffset
#elif defined(LEFT)
	add	tempK, tempOffset, #4
#else
	add	tempK, tempOffset, #1
#endif

	asr	counterL , tempK, #3		// counterL = tempK / 8
	cmp	counterL , #0
	ble	.Lstrmm_kernel_L1_M4_40
	.align 5

	// Main loop, unrolled eight K steps per iteration.
.Lstrmm_kernel_L1_M4_22:
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB

	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB

	subs	counterL, counterL, #1
	bgt	.Lstrmm_kernel_L1_M4_22


.Lstrmm_kernel_L1_M4_40:

	ands	counterL , tempK, #7		// counterL = tempK % 8
	ble	.Lstrmm_kernel_L1_M4_100

	// Tail loop, one K step per iteration.
.Lstrmm_kernel_L1_M4_42:

	KERNEL4x1_SUB

	subs	counterL, counterL, #1
	bgt	.Lstrmm_kernel_L1_M4_42

.Lstrmm_kernel_L1_M4_100:

	SAVE4x1

	// Step pA and pB over the K steps that were not executed.
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub	tempK, origK, tempOffset
#if defined(LEFT)
	sub	tempK, tempK, #4
#else
	sub	tempK, tempK, #1
#endif
	lsl	temp, tempK, #4
	add	pA, pA, temp
	lsl	temp, tempK, #2
	add	pB, pB, temp
#endif
#if defined(LEFT)
	add	tempOffset, tempOffset, #4
#endif

.Lstrmm_kernel_L1_M4_END:

	subs	counterI, counterI, #1
	bgt	.Lstrmm_kernel_L1_M4_20
1211
1212
/*
 * Remainder of M on the final single C line: at most one 2x1 tile
 * (bit 1 of origM), handled here, followed by at most one 1x1 tile.
 */
.Lstrmm_kernel_L1_M2_BEGIN:

	mov	counterI, origM
	tst	counterI , #3			// origM % 4 == 0: nothing left in M
	ble	.Lstrmm_kernel_L1_END

	tst	counterI, #2			// bit 1 clear: no 2-wide tile
	ble	.Lstrmm_kernel_L1_M1_BEGIN

.Lstrmm_kernel_L1_M2_20:

	INIT2x1

	// Position the operands: start of the B panel, or tempOffset K
	// steps in (4 bytes per step in B, 8 bytes per step in A).
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	pB, origPB
#else
	mov	pB, origPB
	lsl	temp, tempOffset, #2
	add	pB, pB, temp
	lsl	temp, tempOffset, #3
	add	pA, pA, temp
#endif

	// tempK = number of K steps to run for this tile.
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	tempK, origK, tempOffset
#elif defined(LEFT)
	add	tempK, tempOffset, #2
#else
	add	tempK, tempOffset, #1
#endif

	asr 	counterL , tempK, #3		// counterL = tempK / 8
	cmp	counterL , #0
	ble	.Lstrmm_kernel_L1_M2_40

	// Main loop, unrolled eight K steps per iteration.
.Lstrmm_kernel_L1_M2_22:

	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB

	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB

	subs	counterL, counterL, #1
	bgt	.Lstrmm_kernel_L1_M2_22


.Lstrmm_kernel_L1_M2_40:

	ands	counterL , tempK, #7		// counterL = tempK % 8
	ble	.Lstrmm_kernel_L1_M2_100

	// Tail loop, one K step per iteration.
.Lstrmm_kernel_L1_M2_42:

	KERNEL2x1_SUB

	subs	counterL, counterL, #1
	bgt	.Lstrmm_kernel_L1_M2_42

.Lstrmm_kernel_L1_M2_100:

	SAVE2x1

	// Step pA and pB over the K steps that were not executed.
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub	tempK, origK, tempOffset
#if defined(LEFT)
	sub	tempK, tempK, #2
#else
	sub	tempK, tempK, #1
#endif
	lsl	temp, tempK, #3
	add	pA, pA, temp
	lsl	temp, tempK, #2
	add	pB, pB, temp
#endif
#if defined(LEFT)
	add	tempOffset, tempOffset, #2
#endif


.Lstrmm_kernel_L1_M2_END:
1298
1299
/*
 * Last 1x1 tile (bit 0 of origM on the final single C line).  This is
 * the final tile of the routine: control falls through to
 * .Lstrmm_kernel_L999 afterwards.
 */
.Lstrmm_kernel_L1_M1_BEGIN:

	tst	counterI, #1			// bit 0 clear: no 1-wide tile
	ble	.Lstrmm_kernel_L1_END

.Lstrmm_kernel_L1_M1_20:

	INIT1x1

	// Position the operands: start of the B panel, or tempOffset K
	// steps in (4 bytes per step in both B and A).
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	pB, origPB
#else
	mov	pB, origPB
	lsl	temp, tempOffset, #2
	add	pB, pB, temp
	lsl	temp, tempOffset, #2
	add	pA, pA, temp
#endif

	// tempK = number of K steps to run for this tile.
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	tempK, origK, tempOffset
#elif defined(LEFT)
	add	tempK, tempOffset, #1
#else
	add	tempK, tempOffset, #1
#endif

	asr 	counterL , tempK, #3		// counterL = tempK / 8
	cmp	counterL , #0
	ble	.Lstrmm_kernel_L1_M1_40

	// Main loop, unrolled eight K steps per iteration.
.Lstrmm_kernel_L1_M1_22:
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB

	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB

	subs	counterL, counterL, #1
	bgt	.Lstrmm_kernel_L1_M1_22


.Lstrmm_kernel_L1_M1_40:

	ands	counterL , tempK, #7		// counterL = tempK % 8
	ble	.Lstrmm_kernel_L1_M1_100

	// Tail loop, one K step per iteration.
.Lstrmm_kernel_L1_M1_42:

	KERNEL1x1_SUB

	subs	counterL, counterL, #1
	bgt	.Lstrmm_kernel_L1_M1_42

.Lstrmm_kernel_L1_M1_100:

	SAVE1x1

	// Disabled bookkeeping: nothing after this point reads pA, pB
	// or tempOffset again.
#if 0
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub	tempK, origK, tempOffset
#if defined(LEFT)
	sub	tempK, tempK, #1
#else
	sub	tempK, tempK, #1
#endif
	lsl	temp, tempK, #2
	add	pA, pA, temp
	lsl	temp, tempK, #2
	add	pB, pB, temp
#endif
#if defined(LEFT)
	add	tempOffset, tempOffset, #1
#endif
#endif

.Lstrmm_kernel_L1_END:

	// Disabled for the same reason: tempOffset is not used below.
#if 0
#if !defined(LEFT)
	add	tempOffset, tempOffset, #1
#endif
#endif
1387
/*
 * Common exit: reload every register spilled on entry from the
 * 11 x 16 byte frame, release the frame and return 0 in x0.
 */
.Lstrmm_kernel_L999:
	ldr	x28, [sp, #(10 * 16)]
	ldp	x26, x27, [sp, #(9 * 16)]
	ldp	x24, x25, [sp, #(8 * 16)]
	ldp	x22, x23, [sp, #(7 * 16)]
	ldp	x20, x21, [sp, #(6 * 16)]
	ldp	x18, x19, [sp, #(5 * 16)]
	ldp	d16, d17, [sp, #(4 * 16)]
	ldp	d14, d15, [sp, #(3 * 16)]
	ldp	d12, d13, [sp, #(2 * 16)]
	ldp	d10, d11, [sp, #(1 * 16)]
	ldp	d8, d9, [sp, #(0 * 16)]

	add	sp, sp, #(11 * 16)		// release the frame
	mov	x0, #0				// return value
	ret

	EPILOGUE
1405
1406