1/*******************************************************************************
2Copyright (c) 2015, The OpenBLAS Project
3All rights reserved.
4Redistribution and use in source and binary forms, with or without
5modification, are permitted provided that the following conditions are
6met:
71. Redistributions of source code must retain the above copyright
8notice, this list of conditions and the following disclaimer.
92. Redistributions in binary form must reproduce the above copyright
10notice, this list of conditions and the following disclaimer in
11the documentation and/or other materials provided with the
12distribution.
133. Neither the name of the OpenBLAS project nor the names of
14its contributors may be used to endorse or promote products
15derived from this software without specific prior written permission.
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*******************************************************************************/
27
28#define ASSEMBLER
29#include "common.h"
30
31/*                   X0          X1          X2          s0        X3        x4       x5           x6 */
32/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */
33
34#define origM		x0
35#define origN		x1
36#define origK		x2
37#define origPA		x3
38#define origPB		x4
39#define pC		x5
40#define LDC		x6
41#define temp		x7
42#define counterL	x8
43#define counterI	x9
44#define counterJ	x10
45#define pB		x11
46#define pCRow0		x12
47#define pCRow1		x13
48#define pCRow2		x14
49#define pA_0		x15
50#define pA_1		x16
51#define pA_2		x17
52#define pA_3		x18
53
54
55#define alpha0		s10
56#define alphaV0		v10.s[0]
57#define alpha1		s11
58#define alphaV1		v11.s[0]
59#define alpha2		s14
60#define alphaV2		v14.s[0]
61#define alpha3		s15
62#define alphaV3		v15.s[0]
63
64// 00 origM
65// 01 origN
66// 02 origK
67// 03 origPA
68// 04 origPB
69// 05 pC
70// 06 origLDC -> LDC
71// 07 offset -> temp
72// 08 counterL
73// 09 counterI
74// 10 counterJ
75// 11 pB
76// 12 pCRow0
77// 13 pCRow1
78// 14 pCRow2
79// 15 pA_0
80// 16 pA_1
81// 17 pA_2
82// 18 must save pA_3
83// 19 must save
84// 20 must save
85// 21 must save
86// 22 must save
87// 23 must save
88// 24 must save
89// 25 must save
90// 26 must save
91// 27 must save
92// 28 must save
93// 29 frame
94// 30 link
95// 31 sp
96
97/***************************** FOR 16x4 ***************************************/
98//v00 ALPHA -> pA00_0, pA01_0, pA02_0, pA03_0
99//v01 pA10_0, pA11_0, pA12_0, pA13_0
100//v02 pA00_1, pA01_1, pA02_1, pA03_1
101//v03 pA10_1, pA11_1, pA12_1, pA13_1
102//v04 pA00_2, pA01_2, pA02_2, pA03_2
103//v05 pA10_2, pA11_2, pA12_2, pA13_2
104//v06 pA00_3, pA01_3, pA02_3, pA03_3
105//v07 pA10_3, pA11_3, pA12_3, pA13_3
106//v08 must save pB00, pB01, pB02, pB03
107//v09 must save
108//v10 must save ALPHA0
109//v11 must save ALPHA1
110//v12 must save pB10, pB11, pB12, pB13
111//v13 must save
112//v14 must save ALPHA2
113//v15 must save ALPHA3
114//v16 must save C00_0, C01_0, C02_0, C03_0
115//v17 must save C10_0, C11_0, C12_0, C13_0
116//v18 C20_0, C21_0, C22_0, C23_0
117//v19 C30_0, C31_0, C32_0, C33_0
118//v20 C00_1, C01_1, C02_1, C03_1
119//v21 C10_1, C11_1, C12_1, C13_1
120//v22 C20_1, C21_1, C22_1, C23_1
121//v23 C30_1, C31_1, C32_1, C33_1
122//v24 C00_2, C01_2, C02_2, C03_2
123//v25 C10_2, C11_2, C12_2, C13_2
124//v26 C20_2, C21_2, C22_2, C23_2
125//v27 C30_2, C31_2, C32_2, C33_2
126//v28 C00_3, C01_3, C02_3, C03_3
127//v29 C10_3, C11_3, C12_3, C13_3
128//v30 C20_3, C21_3, C22_3, C23_3
129//v31 C30_3, C31_3, C32_3, C33_3
130
131/***************************** EXCEPT FOR 16x4 ********************************/
132//v00 ALPHA -> pA00, pA01
133//v01 pA02, pA03
134//v02 ppA00, ppA01
135//v03 ppA02, ppA03
136//v04 pA10, pA11
137//v05 pA12, pA13
138//v06 ppA10, ppA11
139//v07 ppA12, ppA13
140//v08 must save pB00, pB01
141//v09 must save pB02, pB03
142//v10 must save ALPHA0
143//v11 must save ALPHA1
144//v12 must save pB10, pB11
145//v13 must save pB12, pB13
146//v14 must save ALPHA2
147//v15 must save ALPHA3
148//v16 must save C00, C01
149//v17 must save C02, C03
150//v18 ppC00, ppC01
151//v19 ppC02, ppC03
152//v20 C10, C11
153//v21 C12, C13
154//v22 ppC10, ppC11
155//v23 ppC12, ppC13
156//v24 C20, C21
157//v25 C22, C23
158//v26 ppC20, ppC21
159//v27 ppC22, ppC23
160//v28 C30, C31
161//v29 C32, C33
162//v30 ppC30, ppC31
163//v31 ppC32, ppC33
164
165/*******************************************************************************
166* Macro definitions
167*******************************************************************************/
168
/*
 * Zero the sixteen accumulator registers (v16-v31) used by the 16x4
 * tile.  Writing the s-view with wzr clears the whole vector register.
 */
.macro INIT16x4
	fmov	s16, wzr
	fmov	s17, wzr
	fmov	s18, wzr
	fmov	s19, wzr
	fmov	s20, wzr
	fmov	s21, wzr
	fmov	s22, wzr
	fmov	s23, wzr
	fmov	s24, wzr
	fmov	s25, wzr
	fmov	s26, wzr
	fmov	s27, wzr
	fmov	s28, wzr
	fmov	s29, wzr
	fmov	s30, wzr
	fmov	s31, wzr
.endm
187
/*
 * Software-pipelined 16x4 micro-kernel: first k-step.
 * Uses fmul (not fmla) so the v16-v31 accumulators need no prior
 * initialisation, and pre-loads the operands for the NEXT k-step
 * (v1,v3,v5,v7 from the four A sub-panels, v12 from B) so that
 * KERNEL16x4_M2 can start multiplying immediately.
 */
.macro KERNEL16x4_I
	ld1	{v8.4s}, [pB]		// B[0..3] for this k
	add	pB, pB, #16

	ld1	{v0.4s}, [pA_0]		// A rows 0..3
	add	pA_0, pA_0, #16

	fmul	v16.4s, v0.4s, v8.s[0]
	fmul	v20.4s, v0.4s, v8.s[1]

	ld1	{v2.4s}, [pA_1]		// A rows 4..7
	add	pA_1, pA_1, #16

	fmul	v24.4s, v0.4s, v8.s[2]
	fmul	v28.4s, v0.4s, v8.s[3]

	ld1	{v4.4s}, [pA_2]		// A rows 8..11
	add	pA_2, pA_2, #16

	fmul	v17.4s, v2.4s, v8.s[0]
	fmul	v21.4s, v2.4s, v8.s[1]

	ld1	{v6.4s}, [pA_3]		// A rows 12..15
	add	pA_3, pA_3, #16

	fmul	v25.4s, v2.4s, v8.s[2]
	fmul	v29.4s, v2.4s, v8.s[3]

	ld1	{v12.4s}, [pB]		// for next round
	add	pB, pB, #16

	fmul	v18.4s, v4.4s, v8.s[0]
	fmul	v19.4s, v6.4s, v8.s[0]

	ld1	{v1.4s}, [pA_0]		// for next round
	add	pA_0, pA_0, #16

	fmul	v22.4s, v4.4s, v8.s[1]
	fmul	v23.4s, v6.4s, v8.s[1]

	ld1	{v3.4s}, [pA_1]		// for next round
	add	pA_1, pA_1, #16

	fmul	v26.4s, v4.4s, v8.s[2]
	fmul	v27.4s, v6.4s, v8.s[2]

	ld1	{v5.4s}, [pA_2]		// for next round
	add	pA_2, pA_2, #16

	fmul	v30.4s, v4.4s, v8.s[3]
	fmul	v31.4s, v6.4s, v8.s[3]

	ld1	{v7.4s}, [pA_3]		// for next round
	add	pA_3, pA_3, #16
.endm
243
/*
 * Pipelined 16x4 step "M2": accumulates with the odd-numbered operand
 * set (v1,v3,v5,v7 and v12) loaded by the previous step, while loading
 * the even set (v0,v2,v4,v6 and v8) for the following KERNEL16x4_M1.
 * Also prefetches A (pA_2/pA_3) and B ~512 bytes ahead.
 */
.macro KERNEL16x4_M2
	fmla	v16.4s, v1.4s, v12.s[0]
	fmla	v17.4s, v3.4s, v12.s[0]

	ld1	{v8.4s}, [pB]		// for next round
	add	pB, pB, #16

	fmla	v18.4s, v5.4s, v12.s[0]
	fmla	v19.4s, v7.4s, v12.s[0]

	ld1	{v0.4s}, [pA_0]		// for next round
	add	pA_0, pA_0, #16

	fmla	v20.4s, v1.4s, v12.s[1]
	fmla	v21.4s, v3.4s, v12.s[1]

	ld1	{v2.4s}, [pA_1]		// for next round
	add	pA_1, pA_1, #16

	fmla	v22.4s, v5.4s, v12.s[1]
	fmla	v23.4s, v7.4s, v12.s[1]

	ld1	{v4.4s}, [pA_2]		// for next round
	add	pA_2, pA_2, #16

	fmla	v24.4s, v1.4s, v12.s[2]
	fmla	v25.4s, v3.4s, v12.s[2]

	ld1	{v6.4s}, [pA_3]		// for next round
	add	pA_3, pA_3, #16

	fmla	v26.4s, v5.4s, v12.s[2]
	fmla	v27.4s, v7.4s, v12.s[2]

	prfm	PLDL1KEEP, [pA_2, #512]

	fmla	v28.4s, v1.4s, v12.s[3]
	fmla	v29.4s, v3.4s, v12.s[3]

	prfm	PLDL1KEEP, [pA_3, #512]

	fmla	v30.4s, v5.4s, v12.s[3]
	fmla	v31.4s, v7.4s, v12.s[3]

	prfm	PLDL1KEEP, [pB, #512]
.endm
290
/*
 * Pipelined 16x4 step "M1": accumulates with the even-numbered operand
 * set (v0,v2,v4,v6 and v8), while loading the odd set (v1,v3,v5,v7 and
 * v12) for the following KERNEL16x4_M2 or KERNEL16x4_E.
 * Prefetches A (pA_0/pA_1) ~512 bytes ahead.
 */
.macro KERNEL16x4_M1
	fmla	v16.4s, v0.4s, v8.s[0]
	fmla	v17.4s, v2.4s, v8.s[0]

	ld1	{v12.4s}, [pB]		// for next round
	add	pB, pB, #16

	fmla	v18.4s, v4.4s, v8.s[0]
	fmla	v19.4s, v6.4s, v8.s[0]

	ld1	{v1.4s}, [pA_0]		// for next round
	add	pA_0, pA_0, #16

	fmla	v20.4s, v0.4s, v8.s[1]
	fmla	v21.4s, v2.4s, v8.s[1]

	ld1	{v3.4s}, [pA_1]		// for next round
	add	pA_1, pA_1, #16

	fmla	v22.4s, v4.4s, v8.s[1]
	fmla	v23.4s, v6.4s, v8.s[1]

	ld1	{v5.4s}, [pA_2]		// for next round
	add	pA_2, pA_2, #16

	fmla	v24.4s, v0.4s, v8.s[2]
	fmla	v25.4s, v2.4s, v8.s[2]

	ld1	{v7.4s}, [pA_3]		// for next round
	add	pA_3, pA_3, #16

	fmla	v26.4s, v4.4s, v8.s[2]
	fmla	v27.4s, v6.4s, v8.s[2]

	prfm	PLDL1KEEP, [pA_0, #512]

	fmla	v28.4s, v0.4s, v8.s[3]
	fmla	v29.4s, v2.4s, v8.s[3]

	prfm	PLDL1KEEP, [pA_1, #512]

	fmla	v30.4s, v4.4s, v8.s[3]
	fmla	v31.4s, v6.4s, v8.s[3]
.endm
335
/*
 * Pipeline drain: consume the last pre-loaded odd operand set
 * (v1,v3,v5,v7 and v12) without issuing any further loads.
 * Must follow a KERNEL16x4_I or KERNEL16x4_M1.
 */
.macro KERNEL16x4_E
	fmla	v16.4s, v1.4s, v12.s[0]
	fmla	v17.4s, v3.4s, v12.s[0]
	fmla	v18.4s, v5.4s, v12.s[0]
	fmla	v19.4s, v7.4s, v12.s[0]
	fmla	v20.4s, v1.4s, v12.s[1]
	fmla	v21.4s, v3.4s, v12.s[1]
	fmla	v22.4s, v5.4s, v12.s[1]
	fmla	v23.4s, v7.4s, v12.s[1]
	fmla	v24.4s, v1.4s, v12.s[2]
	fmla	v25.4s, v3.4s, v12.s[2]
	fmla	v26.4s, v5.4s, v12.s[2]
	fmla	v27.4s, v7.4s, v12.s[2]
	fmla	v28.4s, v1.4s, v12.s[3]
	fmla	v29.4s, v3.4s, v12.s[3]
	fmla	v30.4s, v5.4s, v12.s[3]
	fmla	v31.4s, v7.4s, v12.s[3]
.endm
354
/*
 * Unpipelined 16x4 micro-kernel: one complete k-step.
 * Loads 4 B values and 16 A values (4 from each sub-panel pointer)
 * and accumulates the full 16x4 outer product into v16-v31.
 * Used for the odd K remainder after the pipelined loop.
 */
.macro KERNEL16x4_SUB
	ld1	{v8.4s}, [pB]
	add	pB, pB, #16

	ld1	{v0.4s}, [pA_0]
	add	pA_0, pA_0, #16

	fmla	v16.4s, v0.4s, v8.s[0]
	fmla	v20.4s, v0.4s, v8.s[1]
	fmla	v24.4s, v0.4s, v8.s[2]
	fmla	v28.4s, v0.4s, v8.s[3]

	ld1	{v2.4s}, [pA_1]
	add	pA_1, pA_1, #16

	fmla	v17.4s, v2.4s, v8.s[0]
	fmla	v21.4s, v2.4s, v8.s[1]
	fmla	v25.4s, v2.4s, v8.s[2]
	fmla	v29.4s, v2.4s, v8.s[3]

	ld1	{v4.4s}, [pA_2]
	add	pA_2, pA_2, #16

	fmla	v18.4s, v4.4s, v8.s[0]
	fmla	v22.4s, v4.4s, v8.s[1]
	fmla	v26.4s, v4.4s, v8.s[2]
	fmla	v30.4s, v4.4s, v8.s[3]

	ld1	{v6.4s}, [pA_3]
	add	pA_3, pA_3, #16

	fmla	v19.4s, v6.4s, v8.s[0]
	fmla	v23.4s, v6.4s, v8.s[1]
	fmla	v27.4s, v6.4s, v8.s[2]
	fmla	v31.4s, v6.4s, v8.s[3]
.endm
391
/*
 * Write back a 16x4 tile: for each of the 4 columns of C, load the 16
 * existing C values, do C += alpha * accumulator, and store them back.
 * pCRow1 walks down the columns (stride LDC); pCRow0 advances by
 * 16 floats (64 bytes) for the next tile in the M direction.
 * alphaV0..alphaV3 all hold the same alpha (set once in the prologue).
 */
.macro SAVE16x4
	mov	pCRow1, pCRow0

	ld1	{v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow1]
	fmla	v0.4s, v16.4s, alphaV0
	fmla	v1.4s, v17.4s, alphaV1
	fmla	v2.4s, v18.4s, alphaV2
	fmla	v3.4s, v19.4s, alphaV3
	st1 	{v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow1]

	add	pCRow1, pCRow1, LDC		// next column of C

	ld1	{v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
	fmla	v4.4s, v20.4s, alphaV0
	fmla	v5.4s, v21.4s, alphaV1
	fmla	v6.4s, v22.4s, alphaV2
	fmla	v7.4s, v23.4s, alphaV3
	st1 	{v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]

	add	pCRow1, pCRow1, LDC		// next column of C

	ld1	{v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow1]
	fmla	v0.4s, v24.4s, alphaV0
	fmla	v1.4s, v25.4s, alphaV1
	fmla	v2.4s, v26.4s, alphaV2
	fmla	v3.4s, v27.4s, alphaV3
	st1 	{v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow1]

	add	pCRow1, pCRow1, LDC		// next column of C

	ld1	{v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
	fmla	v4.4s, v28.4s, alphaV0
	fmla	v5.4s, v29.4s, alphaV1
	fmla	v6.4s, v30.4s, alphaV2
	fmla	v7.4s, v31.4s, alphaV3
	st1 	{v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]

	add	pCRow0, pCRow0, #64		// advance 16 floats in M
.endm
431
432/******************************************************************************/
433
/*
 * Zero the sixteen 2-lane accumulators (v16-v31) used by the 8x4
 * tile.  Writing the s-view with wzr clears the whole vector register.
 */
.macro INIT8x4
	fmov	s16, wzr
	fmov	s17, wzr
	fmov	s18, wzr
	fmov	s19, wzr
	fmov	s20, wzr
	fmov	s21, wzr
	fmov	s22, wzr
	fmov	s23, wzr
	fmov	s24, wzr
	fmov	s25, wzr
	fmov	s26, wzr
	fmov	s27, wzr
	fmov	s28, wzr
	fmov	s29, wzr
	fmov	s30, wzr
	fmov	s31, wzr
.endm
452
/*
 * 8x4 micro-kernel, one k-step, using 2-lane (2s) arithmetic.
 * Accumulator layout follows the "EXCEPT FOR 16x4" table above.
 * The fmla ordering is deliberately scrambled — presumably to avoid
 * back-to-back writes to neighbouring accumulators; confirm against
 * the target core's pipeline before reordering.
 */
.macro KERNEL8x4_SUB
	ld1	{v8.2s, v9.2s}, [pB]		// B[0..3] for this k
	add	pB, pB, #16
	ld1	{v0.2s, v1.2s}, [pA_0]		// A rows 0..3
	add	pA_0, pA_0, #16

	fmla	v16.2s, v0.2s, v8.s[0]
	fmla	v29.2s, v1.2s, v9.s[1]
	fmla	v20.2s, v0.2s, v8.s[1]
	fmla	v25.2s, v1.2s, v9.s[0]

	ld1	{v2.2s, v3.2s}, [pA_1]		// A rows 4..7
	add	pA_1, pA_1, #16

	fmla	v24.2s, v0.2s, v9.s[0]
	fmla	v21.2s, v1.2s, v8.s[1]
	fmla	v28.2s, v0.2s, v9.s[1]
	fmla	v17.2s, v1.2s, v8.s[0]

	fmla	v18.2s, v2.2s, v8.s[0]
	fmla	v31.2s, v3.2s, v9.s[1]
	fmla	v22.2s, v2.2s, v8.s[1]
	fmla	v27.2s, v3.2s, v9.s[0]

	fmla	v26.2s, v2.2s, v9.s[0]
	fmla	v23.2s, v3.2s, v8.s[1]
	fmla	v30.2s, v2.2s, v9.s[1]
	fmla	v19.2s, v3.2s, v8.s[0]
.endm
482
/*
 * Write back an 8x4 tile: C += alpha * accumulators.
 * Each column of 8 C values is handled as two 4-float halves; pCRow1
 * and pCRow2 ping-pong between "current column, second half" and
 * "next column, first half" so loads/stores of adjacent columns
 * interleave.  pCRow0 advances by 8 floats (32 bytes) afterwards.
 */
.macro SAVE8x4
	mov	pCRow1, pCRow0

	ld1	{v0.2s, v1.2s}, [pCRow1]	// col 0, rows 0..3
	fmla	v0.2s, v16.2s, alphaV0
	fmla	v1.2s, v17.2s, alphaV1
	st1 	{v0.2s, v1.2s}, [pCRow1]

	add	pCRow2, pCRow1, LDC		// pCRow2 -> col 1
	add	pCRow1, pCRow1, #16		// pCRow1 -> col 0, rows 4..7

	ld1	{v2.2s, v3.2s}, [pCRow1]
	fmla	v2.2s, v18.2s, alphaV2
	fmla	v3.2s, v19.2s, alphaV3
	st1 	{v2.2s, v3.2s}, [pCRow1]

	ld1	{v4.2s, v5.2s}, [pCRow2]	// col 1, rows 0..3
	fmla	v4.2s, v20.2s, alphaV0
	fmla	v5.2s, v21.2s, alphaV1
	st1 	{v4.2s, v5.2s}, [pCRow2]

	add	pCRow1, pCRow2, LDC		// pCRow1 -> col 2
	add	pCRow2, pCRow2, #16		// pCRow2 -> col 1, rows 4..7

	ld1	{v6.2s, v7.2s}, [pCRow2]
	fmla	v6.2s, v22.2s, alphaV2
	fmla	v7.2s, v23.2s, alphaV3
	st1 	{v6.2s, v7.2s}, [pCRow2]

	ld1	{v0.2s, v1.2s}, [pCRow1]	// col 2, rows 0..3
	fmla	v0.2s, v24.2s, alphaV0
	fmla	v1.2s, v25.2s, alphaV1
	st1 	{v0.2s, v1.2s}, [pCRow1]

	add	pCRow2, pCRow1, LDC		// pCRow2 -> col 3
	add	pCRow1, pCRow1, #16		// pCRow1 -> col 2, rows 4..7

	ld1	{v2.2s, v3.2s}, [pCRow1]
	fmla	v2.2s, v26.2s, alphaV2
	fmla	v3.2s, v27.2s, alphaV3
	st1 	{v2.2s, v3.2s}, [pCRow1]

	ld1	{v4.2s, v5.2s}, [pCRow2]	// col 3, rows 0..3
	fmla	v4.2s, v28.2s, alphaV0
	fmla	v5.2s, v29.2s, alphaV1
	st1 	{v4.2s, v5.2s}, [pCRow2]

	add	pCRow2, pCRow2, #16		// col 3, rows 4..7

	ld1	{v6.2s, v7.2s}, [pCRow2]
	fmla	v6.2s, v30.2s, alphaV2
	fmla	v7.2s, v31.2s, alphaV3
	st1 	{v6.2s, v7.2s}, [pCRow2]

	add	pCRow0, pCRow0, #32		// advance 8 floats in M
.endm
539
540/******************************************************************************/
541
/*
 * Zero the eight 2-lane accumulators used by the 4x4 tile
 * (v16,v17 / v20,v21 / v24,v25 / v28,v29 — one pair per column).
 */
.macro INIT4x4
	fmov	s16, wzr
	fmov	s17, wzr
	fmov	s20, wzr
	fmov	s21, wzr
	fmov	s24, wzr
	fmov	s25, wzr
	fmov	s28, wzr
	fmov	s29, wzr
.endm
552
/*
 * 4x4 micro-kernel, one k-step: 4 A values (v0,v1) times 4 B values
 * (v8,v9) accumulated into the 4x4 tile.  The fmla pairs are ordered
 * to alternate accumulators, presumably to break dependency chains.
 */
.macro KERNEL4x4_SUB
	ld1	{v8.2s, v9.2s}, [pB]
	add	pB, pB, #16
	ld1	{v0.2s, v1.2s}, [pA_0]
	add	pA_0, pA_0, #16

	fmla	v16.2s, v0.2s, v8.s[0]
	fmla	v29.2s, v1.2s, v9.s[1]

	fmla	v20.2s, v0.2s, v8.s[1]
	fmla	v25.2s, v1.2s, v9.s[0]

	fmla	v24.2s, v0.2s, v9.s[0]
	fmla	v21.2s, v1.2s, v8.s[1]

	fmla	v28.2s, v0.2s, v9.s[1]
	fmla	v17.2s, v1.2s, v8.s[0]
.endm
571
/*
 * Write back a 4x4 tile: for each of the 4 columns, load 4 C values,
 * C += alpha * accumulator, store.  pCRow1/pCRow2 alternate as the
 * column pointer (stride LDC); pCRow0 advances by 4 floats.
 */
.macro SAVE4x4
	ld1	{v8.2s, v9.2s}, [pCRow0]	// column 0
	fmla	v8.2s, v16.2s, alphaV0
	fmla	v9.2s, v17.2s, alphaV1
	st1 	{v8.2s, v9.2s}, [pCRow0]

	add	pCRow1, pCRow0, LDC		// column 1

	ld1	{v12.2s, v13.2s}, [pCRow1]
	fmla	v12.2s, v20.2s, alphaV2
	fmla	v13.2s, v21.2s, alphaV3
	st1 	{v12.2s, v13.2s}, [pCRow1]

	add	pCRow2, pCRow1, LDC		// column 2

	ld1	{v8.2s, v9.2s}, [pCRow2]
	fmla	v8.2s, v24.2s, alphaV0
	fmla	v9.2s, v25.2s, alphaV1
	st1 	{v8.2s, v9.2s}, [pCRow2]

	add	pCRow1, pCRow2, LDC		// column 3

	ld1	{v12.2s, v13.2s}, [pCRow1]
	fmla	v12.2s, v28.2s, alphaV2
	fmla	v13.2s, v29.2s, alphaV3
	st1 	{v12.2s, v13.2s}, [pCRow1]

	add	pCRow0, pCRow0, #16		// advance 4 floats in M
.endm
601
602/******************************************************************************/
603
/*
 * Zero the four 2-lane accumulators used by the 2x4 tile
 * (v16, v20, v24, v28 — one per column of C).
 */
.macro INIT2x4
	fmov	s16, wzr
	fmov	s20, wzr
	fmov	s24, wzr
	fmov	s28, wzr
.endm
610
/*
 * 2x4 micro-kernel, one k-step: 2 A values (v0) times 4 B values
 * (v8,v9), one accumulator register per column of C.
 */
.macro KERNEL2x4_SUB
	ld1	{v8.2s, v9.2s}, [pB]
	add	pB, pB, #16
	ld1	{v0.2s}, [pA_0]
	add	pA_0, pA_0, #8

	fmla	v16.2s, v0.2s, v8.s[0]
	fmla	v20.2s, v0.2s, v8.s[1]
	fmla	v24.2s, v0.2s, v9.s[0]
	fmla	v28.2s, v0.2s, v9.s[1]
.endm
622
/*
 * Write back a 2x4 tile: 2 C values per column, 4 columns.
 * pCRow1/pCRow2 alternate as the column pointer (stride LDC);
 * pCRow0 advances by 2 floats.
 */
.macro SAVE2x4
	ld1	{v8.2s}, [pCRow0]		// column 0
	fmla	v8.2s, v16.2s, alphaV0
	st1	{v8.2s}, [pCRow0]

	add	pCRow1, pCRow0, LDC		// column 1

	ld1	{v12.2s}, [pCRow1]
	fmla	v12.2s, v20.2s, alphaV1
	st1	{v12.2s}, [pCRow1]

	add	pCRow2, pCRow1, LDC		// column 2

	ld1	{v8.2s}, [pCRow2]
	fmla	v8.2s, v24.2s, alphaV2
	st1	{v8.2s}, [pCRow2]

	add	pCRow1, pCRow2, LDC		// column 3

	ld1	{v12.2s}, [pCRow1]
	fmla	v12.2s, v28.2s, alphaV3
	st1	{v12.2s}, [pCRow1]

	add	pCRow0, pCRow0, #8		// advance 2 floats in M
.endm
648
649/******************************************************************************/
650
/*
 * Zero the two accumulators used by the 1x4 tile: v16 holds columns
 * 0-1 and v20 holds columns 2-3 (two lanes each).
 */
.macro INIT1x4
	fmov	s16, wzr
	fmov	s20, wzr
.endm
655
/*
 * 1x4 micro-kernel, one k-step: broadcast the single A value (s0)
 * against the 4 B values; v16 accumulates columns 0-1, v20 columns 2-3.
 */
.macro KERNEL1x4_SUB
	ldr	s0, [pA_0]
	add	pA_0, pA_0, #4

	ld1	{v8.2s, v9.2s}, [pB]
	add	pB, pB, #16

	fmla	v16.2s, v8.2s, v0.s[0]
	fmla	v20.2s, v9.2s, v0.s[0]
.endm
666
/*
 * Write back a 1x4 tile: one C value in each of 4 columns.
 * Lane loads/stores gather two column entries into one vector so a
 * single fmla applies alpha to both.  pCRow0 advances by 1 float.
 */
.macro SAVE1x4
	add	pCRow1, pCRow0, LDC		// column 1

	ld1	{v8.s}[0], [pCRow0]		// lane 0 = col 0, lane 1 = col 1
	ld1	{v8.s}[1], [pCRow1]
	fmla	v8.2s, v16.2s, alphaV0
	st1	{v8.s}[0], [pCRow0]
	st1	{v8.s}[1], [pCRow1]

	add	pCRow2, pCRow1, LDC		// column 2
	add	pCRow1, pCRow2, LDC		// column 3

	ld1	{v12.s}[0], [pCRow2]		// lane 0 = col 2, lane 1 = col 3
	ld1	{v12.s}[1], [pCRow1]
	fmla	v12.2s, v20.2s, alphaV1
	st1	{v12.s}[0], [pCRow2]
	st1	{v12.s}[1], [pCRow1]

	add	pCRow0, pCRow0, #4		// advance 1 float in M
.endm
687
688/******************************************************************************/
689
/*
 * Zero the four 2-lane accumulators used by the 4x2 tile
 * (v16,v17 for column 0; v20,v21 for column 1).
 */
.macro INIT4x2
	fmov	s16, wzr
	fmov	s17, wzr
	fmov	s20, wzr
	fmov	s21, wzr
.endm
696
/*
 * 4x2 micro-kernel, one k-step: 4 A values (v0,v1) times 2 B values
 * (v8); v16/v17 accumulate column 0, v20/v21 column 1.
 */
.macro KERNEL4x2_SUB
	ld1	{v8.2s}, [pB]
	add	pB, pB, #8
	ld1	{v0.2s, v1.2s}, [pA_0]
	add	pA_0, pA_0, #16

	fmla	v16.2s, v0.2s, v8.s[0]
	fmla	v17.2s, v1.2s, v8.s[0]
	fmla	v20.2s, v0.2s, v8.s[1]
	fmla	v21.2s, v1.2s, v8.s[1]
.endm
708
/*
 * Write back a 4x2 tile: 4 C values per column, 2 columns.
 * pCRow0 advances by 4 floats.
 */
.macro SAVE4x2
	ld1	{v8.2s, v9.2s}, [pCRow0]	// column 0
	fmla	v8.2s, v16.2s, alphaV0
	fmla	v9.2s, v17.2s, alphaV1
	st1	{v8.2s, v9.2s}, [pCRow0]

	add	pCRow1, pCRow0, LDC		// column 1

	ld1	{v12.2s, v13.2s}, [pCRow1]
	fmla	v12.2s, v20.2s, alphaV2
	fmla	v13.2s, v21.2s, alphaV3
	st1	{v12.2s, v13.2s}, [pCRow1]

	add	pCRow0, pCRow0, #16		// advance 4 floats in M
.endm
724
725/******************************************************************************/
726
/*
 * Zero the two 2-lane accumulators used by the 2x2 tile
 * (v16 = column 0, v20 = column 1).
 */
.macro INIT2x2
	fmov	s16, wzr
	fmov	s20, wzr
.endm
731
/*
 * 2x2 micro-kernel, one k-step: 2 A values (v0) times 2 B values (v8);
 * v16 accumulates column 0, v20 column 1.
 */
.macro KERNEL2x2_SUB
	ld1	{v8.2s}, [pB]
	add	pB, pB, #8

	ld1	{v0.2s}, [pA_0]
	add	pA_0, pA_0, #8

	fmla	v16.2s, v0.2s, v8.s[0]
	fmla	v20.2s, v0.2s, v8.s[1]
.endm
742
/*
 * Write back a 2x2 tile: 2 C values per column, 2 columns.
 * pCRow0 advances by 2 floats.
 */
.macro SAVE2x2
	ld1	{v8.2s}, [pCRow0]		// column 0
	fmla	v8.2s, v16.2s, alphaV0
	st1	{v8.2s}, [pCRow0]

	add	pCRow1 , pCRow0, LDC		// column 1

	ld1	{v12.2s}, [pCRow1]
	fmla	v12.2s, v20.2s, alphaV1
	st1	{v12.2s}, [pCRow1]

	add	pCRow0, pCRow0, #8		// advance 2 floats in M
.endm
756
757/******************************************************************************/
758
// Zero the single accumulator used by the 1x2 tile
// (v16 lane 0 = column 0, lane 1 = column 1).
.macro INIT1x2
	fmov		s16, wzr
.endm
762
/*
 * 1x2 micro-kernel, one k-step: broadcast the single A value (s0)
 * against 2 B values; both column results accumulate in v16.
 */
.macro KERNEL1x2_SUB
	ld1	{v8.2s} , [pB]
	add	pB , pB, #8

	ldr	s0 , [pA_0]
	add	pA_0, pA_0, #4

	fmla	v16.2s, v8.2s, v0.s[0]
.endm
772
/*
 * Write back a 1x2 tile: one C value in each of 2 columns, gathered
 * into the lanes of v8 so one fmla applies alpha to both.
 * pCRow0 advances by 1 float.
 */
.macro SAVE1x2
	add	pCRow1 , pCRow0, LDC		// column 1

	ld1	{v8.s}[0], [pCRow0]		// lane 0 = col 0, lane 1 = col 1
	ld1	{v8.s}[1], [pCRow1]
	fmla	v8.2s, v16.2s, alphaV0
	st1	{v8.s}[0], [pCRow0]
	st1	{v8.s}[1], [pCRow1]

	add	pCRow0, pCRow0, #4		// advance 1 float in M
.endm
784
785/******************************************************************************/
786
/*
 * Zero the two 2-lane accumulators used by the 4x1 tile
 * (v16 = rows 0-1, v17 = rows 2-3).
 */
.macro INIT4x1
	fmov	s16, wzr
	fmov	s17, wzr
.endm
791
/*
 * 4x1 micro-kernel, one k-step: 4 A values (v0,v1) times the single
 * B value (s8), accumulated into v16/v17.
 */
.macro KERNEL4x1_SUB
	ldr	s8, [pB]
	add	pB , pB, #4

	ld1	{v0.2s, v1.2s}, [pA_0]
	add	pA_0 , pA_0, #16

	fmla	v16.2s, v0.2s, v8.s[0]
	fmla	v17.2s, v1.2s, v8.s[0]
.endm
802
/*
 * Write back a 4x1 tile: 4 C values in the single column.
 * pCRow0 advances by 4 floats.
 */
.macro SAVE4x1
	ld1	{v8.2s, v9.2s}, [pCRow0]
	fmla	v8.2s, v16.2s, alphaV0
	fmla	v9.2s, v17.2s, alphaV1
	st1	{v8.2s, v9.2s}, [pCRow0]

	add	pCRow0, pCRow0, #16		// advance 4 floats in M
.endm
811
812
813
814
815/******************************************************************************/
816
// Zero the single 2-lane accumulator used by the 2x1 tile.
.macro INIT2x1
	fmov		s16, wzr
.endm
820
/*
 * 2x1 micro-kernel, one k-step: 2 A values (v0) times the single
 * B value (s8), accumulated into v16.
 */
.macro KERNEL2x1_SUB
	ldr	s8, [pB]
	add	pB , pB, #4

	ld1	{v0.2s}, [pA_0]
	add	pA_0 , pA_0, #8

	fmla	v16.2s, v0.2s, v8.s[0]
.endm
830
/*
 * Write back a 2x1 tile: 2 C values in the single column.
 * pCRow0 advances by 2 floats.
 */
.macro SAVE2x1
	ld1	{v8.2s}, [pCRow0]
	fmla	v8.2s, v16.2s, alphaV0
	st1	{v8.2s}, [pCRow0]

	add	pCRow0, pCRow0, #8		// advance 2 floats in M
.endm
838
839/******************************************************************************/
840
// Zero the scalar accumulator used by the 1x1 tile.
.macro INIT1x1
	fmov	s16, wzr
.endm
844
/*
 * 1x1 micro-kernel, one k-step: scalar fused multiply-add
 * s16 += A[k] * B[k].
 */
.macro KERNEL1x1_SUB
	ldr	s8, [pB]
	add	pB , pB, #4

	ldr	s0, [pA_0]
	add	pA_0 , pA_0, #4

	fmadd 	s16, s0, s8, s16
.endm
854
/*
 * Write back a 1x1 tile: C[0] += alpha * accumulator (scalar fmadd).
 * pCRow0 advances by 1 float.
 */
.macro SAVE1x1
	ldr	s8, [pCRow0]
	fmadd	s8, s16, alpha0, s8
	str 	s8, [pCRow0]

	add	pCRow0, pCRow0, #4		// advance 1 float in M
.endm
862
863/*******************************************************************************
864* End of macro definitions
865*******************************************************************************/
866
	PROLOGUE

	// Entry: save callee-saved registers (d8-d15, x19-x28 per AAPCS64)
	// in a 176-byte frame.  d16/d17 and x18 are saved too even though
	// they are not callee-saved under AAPCS64.
	// NOTE(review): x18 (used here as pA_3) is the platform-reserved
	// register on some OSes (e.g. Apple, Windows) — confirm the
	// supported targets before relying on it.
	.align 5
	add	sp, sp, #-(11 * 16)
	stp	d8, d9, [sp, #(0 * 16)]
	stp	d10, d11, [sp, #(1 * 16)]
	stp	d12, d13, [sp, #(2 * 16)]
	stp	d14, d15, [sp, #(3 * 16)]
	stp	d16, d17, [sp, #(4 * 16)]
	stp	x18, x19, [sp, #(5 * 16)]
	stp	x20, x21, [sp, #(6 * 16)]
	stp	x22, x23, [sp, #(7 * 16)]
	stp	x24, x25, [sp, #(8 * 16)]
	stp	x26, x27, [sp, #(9 * 16)]
	str	x28, [sp, #(10 * 16)]

	// Copy alpha (arrives in s0) into four surviving registers
	// (s10, s11, s14, s15) used by the SAVE macros.
	fmov	alpha0, s0
	fmov	alpha1, s0
	fmov	alpha2, s0
	fmov	alpha3, s0

	lsl	LDC, LDC, #2			// ldc = ldc * 4 (elements -> bytes)

	mov	pB, origPB

	// Outer loop over N in blocks of 4 columns.
	mov	counterJ, origN
	asr 	counterJ, counterJ, #2		// J = J / 4
	cmp 	counterJ, #0
	ble	.Lsgemm_kernel_L2_BEGIN		// N < 4: go to the 2/1-column tail
897/******************************************************************************/
898
// Process 4 columns of C per iteration of the J loop.  Within it, M is
// walked in tiles of 16, 8, 4, 2 and 1 rows; K is the innermost loop.
.Lsgemm_kernel_L4_BEGIN:
	mov	pCRow0, pC			// pCRow0 = C
	add	pC, pC, LDC, lsl #2		// pC += 4 columns

	// The 16-row A panel is addressed as four 4-row sub-panels,
	// each K*4 floats (= K*16 bytes) apart.
	lsl	temp, origK, #4			// k * 4 * 4
	mov	pA_0, origPA			// pA_0 = start of A array
	add	pA_1, temp, pA_0
	add	pA_2, temp, pA_1
	add	pA_3, temp, pA_2

.Lsgemm_kernel_L4_M16_BEGIN:

	mov	counterI, origM
	asr 	counterI, counterI, #4		// counterI = counterI / 16
	cmp 	counterI, #0
	ble	.Lsgemm_kernel_L4_M8_BEGIN

// 16x4 tile: software-pipelined K loop, two k-steps per iteration.
.Lsgemm_kernel_L4_M16_20:

	mov	pB, origPB
	asr 	counterL , origK, #1		// L = K / 2
	cmp	counterL , #2			// at least 2 pipelined pairs (K >= 4)?
	blt	.Lsgemm_kernel_L4_M16_32

	KERNEL16x4_I				// do one in the K
	KERNEL16x4_M2				// do another in the K

	subs	counterL, counterL, #2
	ble	.Lsgemm_kernel_L4_M16_22a
	.align 5

.Lsgemm_kernel_L4_M16_22:

	KERNEL16x4_M1
	KERNEL16x4_M2

	subs	counterL, counterL, #1
	bgt	.Lsgemm_kernel_L4_M16_22


.Lsgemm_kernel_L4_M16_22a:
	// Drain the pipeline: one more M1 then the load-free E step.
	KERNEL16x4_M1
	KERNEL16x4_E

	b	 .Lsgemm_kernel_L4_M16_44

// K < 4: either one pipelined pair (I + E) or nothing but the remainder.
.Lsgemm_kernel_L4_M16_32:

	tst	counterL, #1
	ble	.Lsgemm_kernel_L4_M16_40

	KERNEL16x4_I

	KERNEL16x4_E

	b	.Lsgemm_kernel_L4_M16_44


.Lsgemm_kernel_L4_M16_40:

	INIT16x4				// nothing computed yet: zero accumulators

.Lsgemm_kernel_L4_M16_44:

	ands	counterL , origK, #1		// odd-K remainder (at most 1 step)
	ble	.Lsgemm_kernel_L4_M16_100

.Lsgemm_kernel_L4_M16_46:

	KERNEL16x4_SUB

.Lsgemm_kernel_L4_M16_100:

	SAVE16x4

.Lsgemm_kernel_L4_M16_END:
	// pA_0 was advanced by one sub-panel (temp) during the K loop;
	// add 3 more to reach the next 16-row panel, then respace
	// pA_1..pA_3.
	lsl	temp, origK, #4			// k * 4 * 4 = Four rows of A
	add	pA_0, pA_0, temp
	add	pA_0, pA_0, temp
	add	pA_0, pA_0, temp
	add	pA_1, pA_0, temp
	add	pA_2, pA_1, temp
	add	pA_3, pA_2, temp
	subs	counterI, counterI, #1
	bne	.Lsgemm_kernel_L4_M16_20

// M remainder (< 16 rows): 8-, 4-, 2-, then 1-row tiles.
.Lsgemm_kernel_L4_M8_BEGIN:
	mov	counterI, origM
	tst	counterI , #15
	ble	.Lsgemm_kernel_L4_END

	tst	counterI, #8
	ble	.Lsgemm_kernel_L4_M4_BEGIN

.Lsgemm_kernel_L4_M8_20:

	INIT8x4

	mov	pB, origPB
	asr 	counterL, origK, #3		// counterL = counterL / 8
	cmp	counterL, #0
	ble	.Lsgemm_kernel_L4_M8_40

// Unrolled 8 k-steps per iteration.
.Lsgemm_kernel_L4_M8_22:

	KERNEL8x4_SUB
	KERNEL8x4_SUB
	KERNEL8x4_SUB
	KERNEL8x4_SUB

	KERNEL8x4_SUB
	KERNEL8x4_SUB
	KERNEL8x4_SUB
	KERNEL8x4_SUB

	subs	counterL, counterL, #1
	bgt	.Lsgemm_kernel_L4_M8_22


.Lsgemm_kernel_L4_M8_40:

	ands	counterL , origK, #7		// counterL = counterL % 8
	ble	.Lsgemm_kernel_L4_M8_100

.Lsgemm_kernel_L4_M8_42:

	KERNEL8x4_SUB

	subs	counterL, counterL, #1
	bgt	.Lsgemm_kernel_L4_M8_42

.Lsgemm_kernel_L4_M8_100:

	SAVE8x4

.Lsgemm_kernel_L4_M8_END:
	lsl	temp, origK, #4			// skip the consumed K*4-float A panel
	add	pA_0, pA_0, temp

.Lsgemm_kernel_L4_M4_BEGIN:
	mov	counterI, origM
	tst	counterI , #7
	ble	.Lsgemm_kernel_L4_END

	tst	counterI, #4
	ble	.Lsgemm_kernel_L4_M2_BEGIN

.Lsgemm_kernel_L4_M4_20:

	INIT4x4

	mov	pB, origPB
	asr 	counterL, origK, #3		// counterL = counterL / 8
	cmp	counterL, #0
	ble	.Lsgemm_kernel_L4_M4_40

.Lsgemm_kernel_L4_M4_22:

	KERNEL4x4_SUB
	KERNEL4x4_SUB
	KERNEL4x4_SUB
	KERNEL4x4_SUB

	KERNEL4x4_SUB
	KERNEL4x4_SUB
	KERNEL4x4_SUB
	KERNEL4x4_SUB

	subs	counterL, counterL, #1
	bgt	.Lsgemm_kernel_L4_M4_22


.Lsgemm_kernel_L4_M4_40:

	ands	counterL , origK, #7		// counterL = counterL % 8
	ble	.Lsgemm_kernel_L4_M4_100

.Lsgemm_kernel_L4_M4_42:

	KERNEL4x4_SUB

	subs	counterL, counterL, #1
	bgt	.Lsgemm_kernel_L4_M4_42

.Lsgemm_kernel_L4_M4_100:

	SAVE4x4

.Lsgemm_kernel_L4_M4_END:


.Lsgemm_kernel_L4_M2_BEGIN:

	mov	counterI, origM
	tst	counterI , #3
	ble	.Lsgemm_kernel_L4_END

	tst	counterI, #2			// 2-row tile needed?
	ble	.Lsgemm_kernel_L4_M1_BEGIN

.Lsgemm_kernel_L4_M2_20:

	INIT2x4

	mov	pB, origPB
	asr 	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL , #0
	ble	.Lsgemm_kernel_L4_M2_40

.Lsgemm_kernel_L4_M2_22:

	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB

	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB
	KERNEL2x4_SUB

	subs	counterL, counterL, #1
	bgt	.Lsgemm_kernel_L4_M2_22


.Lsgemm_kernel_L4_M2_40:

	ands	counterL , origK, #7		// counterL = counterL % 8
	ble	.Lsgemm_kernel_L4_M2_100

.Lsgemm_kernel_L4_M2_42:

	KERNEL2x4_SUB

	subs	counterL, counterL, #1
	bgt	.Lsgemm_kernel_L4_M2_42

.Lsgemm_kernel_L4_M2_100:

	SAVE2x4

.Lsgemm_kernel_L4_M2_END:


.Lsgemm_kernel_L4_M1_BEGIN:

	tst	counterI, #1			// final single row?
	ble	.Lsgemm_kernel_L4_END

.Lsgemm_kernel_L4_M1_20:

	INIT1x4

	mov	pB, origPB
	asr 	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL , #0
	ble	.Lsgemm_kernel_L4_M1_40

.Lsgemm_kernel_L4_M1_22:
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB

	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB

	subs	counterL, counterL, #1
	bgt	.Lsgemm_kernel_L4_M1_22


.Lsgemm_kernel_L4_M1_40:

	ands	counterL , origK, #7		// counterL = counterL % 8
	ble	.Lsgemm_kernel_L4_M1_100

.Lsgemm_kernel_L4_M1_42:

	KERNEL1x4_SUB

	subs	counterL, counterL, #1
	bgt	.Lsgemm_kernel_L4_M1_42

.Lsgemm_kernel_L4_M1_100:

	SAVE1x4


.Lsgemm_kernel_L4_END:
	// Advance B past the 4 consumed columns (K * 4 floats).
	lsl	temp, origK, #4
	add	origPB, origPB, temp		// B = B + K * 4 * 4

	subs	counterJ, counterJ , #1		// j--
	bgt	.Lsgemm_kernel_L4_BEGIN
1197
1198
1199/******************************************************************************/
1200
// N remainder: process 2 columns of C (same M tiling with 4/2/1-row
// tiles; the 16- and 8-row kernels are not used here).
.Lsgemm_kernel_L2_BEGIN:   // less than 2 left in N direction

	mov	counterJ , origN
	tst	counterJ , #3
	ble	.Lsgemm_kernel_L999		// N was a multiple of 4: done

	tst	counterJ , #2
	ble	.Lsgemm_kernel_L1_BEGIN

	mov	pCRow0, pC			// pCRow0 = pC

	add	pC,pC,LDC, lsl #1		// pC += 2 columns

	mov	pA_0, origPA			// pA_0 = A



.Lsgemm_kernel_L2_M4_BEGIN:

	mov	counterI, origM
	asr 	counterI, counterI, #2		// counterI = counterI / 4
	cmp	counterI,#0
	ble	.Lsgemm_kernel_L2_M2_BEGIN

.Lsgemm_kernel_L2_M4_20:

	INIT4x2

	mov	pB, origPB
	asr	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL,#0
	ble	.Lsgemm_kernel_L2_M4_40
	.align 5

.Lsgemm_kernel_L2_M4_22:
	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB

	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB

	subs	counterL, counterL, #1
	bgt	.Lsgemm_kernel_L2_M4_22


.Lsgemm_kernel_L2_M4_40:

	ands	counterL , origK, #7		// counterL = counterL % 8
	ble	.Lsgemm_kernel_L2_M4_100

.Lsgemm_kernel_L2_M4_42:

	KERNEL4x2_SUB

	subs	counterL, counterL, #1
	bgt	.Lsgemm_kernel_L2_M4_42

.Lsgemm_kernel_L2_M4_100:

	SAVE4x2

.Lsgemm_kernel_L2_M4_END:

	subs	counterI, counterI, #1
	bgt	.Lsgemm_kernel_L2_M4_20


.Lsgemm_kernel_L2_M2_BEGIN:

	mov	counterI, origM
	tst	counterI , #3
	ble	.Lsgemm_kernel_L2_END

	tst	counterI, #2			// 2-row tile needed?
	ble	.Lsgemm_kernel_L2_M1_BEGIN

.Lsgemm_kernel_L2_M2_20:

	INIT2x2

	mov	pB, origPB
	asr	counterL , origK, #3		// counterL = counterL / 8
        cmp	counterL,#0
	ble	.Lsgemm_kernel_L2_M2_40

.Lsgemm_kernel_L2_M2_22:

	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB

	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB
	KERNEL2x2_SUB

	subs	counterL, counterL, #1
	bgt	.Lsgemm_kernel_L2_M2_22


.Lsgemm_kernel_L2_M2_40:

	ands	counterL , origK, #7		// counterL = counterL % 8
	ble	.Lsgemm_kernel_L2_M2_100

.Lsgemm_kernel_L2_M2_42:

	KERNEL2x2_SUB

	subs	counterL, counterL, #1
	bgt	.Lsgemm_kernel_L2_M2_42

.Lsgemm_kernel_L2_M2_100:

	SAVE2x2

.Lsgemm_kernel_L2_M2_END:


.Lsgemm_kernel_L2_M1_BEGIN:

	tst	counterI, #1			// final single row?
	ble	.Lsgemm_kernel_L2_END

.Lsgemm_kernel_L2_M1_20:

	INIT1x2

	mov	pB, origPB
	asr 	counterL , origK, #3		// counterL = counterL / 8
        cmp     counterL, #0
	ble	.Lsgemm_kernel_L2_M1_40

.Lsgemm_kernel_L2_M1_22:
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB

	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB

	subs	counterL, counterL, #1
	bgt	.Lsgemm_kernel_L2_M1_22


.Lsgemm_kernel_L2_M1_40:

	ands	counterL , origK, #7		// counterL = counterL % 8
	ble	.Lsgemm_kernel_L2_M1_100

.Lsgemm_kernel_L2_M1_42:

	KERNEL1x2_SUB

	subs	counterL, counterL, #1
	bgt	.Lsgemm_kernel_L2_M1_42

.Lsgemm_kernel_L2_M1_100:

	SAVE1x2


.Lsgemm_kernel_L2_END:
	// Advance B past the 2 consumed columns (K * 2 floats).
	add	origPB, origPB, origK, lsl #3	// B = B + K * 2 * 4
1373
1374/******************************************************************************/
1375
// N remainder: process the final single column of C.
.Lsgemm_kernel_L1_BEGIN:

	mov	counterJ , origN
	tst	counterJ , #1
	ble	.Lsgemm_kernel_L999 // done


	mov	pCRow0, pC			// pCRow0 = C
	add	pC , pC , LDC			// Update pC to point to next

	mov	pA_0, origPA			// pA_0 = A



.Lsgemm_kernel_L1_M4_BEGIN:

	mov	counterI, origM
	asr 	counterI, counterI, #2		// counterI = counterI / 4
	cmp	counterI, #0
	ble	.Lsgemm_kernel_L1_M2_BEGIN

.Lsgemm_kernel_L1_M4_20:

	INIT4x1

	mov	pB, origPB
	asr	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL , #0
	ble	.Lsgemm_kernel_L1_M4_40
	.align 5

.Lsgemm_kernel_L1_M4_22:
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB

	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB

	subs	counterL, counterL, #1
	bgt	.Lsgemm_kernel_L1_M4_22


.Lsgemm_kernel_L1_M4_40:

	ands	counterL , origK, #7		// counterL = counterL % 8
	ble	.Lsgemm_kernel_L1_M4_100

.Lsgemm_kernel_L1_M4_42:

	KERNEL4x1_SUB

	subs	counterL, counterL, #1
	bgt	.Lsgemm_kernel_L1_M4_42

.Lsgemm_kernel_L1_M4_100:

	SAVE4x1

.Lsgemm_kernel_L1_M4_END:

	subs	counterI, counterI, #1
	bgt	.Lsgemm_kernel_L1_M4_20


.Lsgemm_kernel_L1_M2_BEGIN:

	mov	counterI, origM
	tst	counterI , #3
	ble	.Lsgemm_kernel_L1_END

	tst	counterI, #2			// 2-row tile needed?
	ble	.Lsgemm_kernel_L1_M1_BEGIN

.Lsgemm_kernel_L1_M2_20:

	INIT2x1

	mov	pB, origPB
	asr 	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL , #0
	ble	.Lsgemm_kernel_L1_M2_40

.Lsgemm_kernel_L1_M2_22:

	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB

	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB

	subs	counterL, counterL, #1
	bgt	.Lsgemm_kernel_L1_M2_22


.Lsgemm_kernel_L1_M2_40:

	ands	counterL , origK, #7		// counterL = counterL % 8
	ble	.Lsgemm_kernel_L1_M2_100

.Lsgemm_kernel_L1_M2_42:

	KERNEL2x1_SUB

	subs	counterL, counterL, #1
	bgt	.Lsgemm_kernel_L1_M2_42

.Lsgemm_kernel_L1_M2_100:

	SAVE2x1

.Lsgemm_kernel_L1_M2_END:


.Lsgemm_kernel_L1_M1_BEGIN:

	tst	counterI, #1			// final single row?
	ble	.Lsgemm_kernel_L1_END

.Lsgemm_kernel_L1_M1_20:

	INIT1x1

	mov	pB, origPB
	asr 	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL , #0
	ble	.Lsgemm_kernel_L1_M1_40

.Lsgemm_kernel_L1_M1_22:
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB

	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB

	subs	counterL, counterL, #1
	bgt	.Lsgemm_kernel_L1_M1_22


.Lsgemm_kernel_L1_M1_40:

	ands	counterL , origK, #7		// counterL = counterL % 8
	ble	.Lsgemm_kernel_L1_M1_100

.Lsgemm_kernel_L1_M1_42:

	KERNEL1x1_SUB

	subs	counterL, counterL, #1
	bgt	.Lsgemm_kernel_L1_M1_42

.Lsgemm_kernel_L1_M1_100:

	SAVE1x1


.Lsgemm_kernel_L1_END:
1544
1545
// Common exit: restore the saved registers (mirror of the prologue)
// and return 0 in x0.
.Lsgemm_kernel_L999:
	mov	x0, #0				// set return value
	ldp	d8, d9, [sp, #(0 * 16)]
	ldp	d10, d11, [sp, #(1 * 16)]
	ldp	d12, d13, [sp, #(2 * 16)]
	ldp	d14, d15, [sp, #(3 * 16)]
	ldp	d16, d17, [sp, #(4 * 16)]
	ldp	x18, x19, [sp, #(5 * 16)]
	ldp	x20, x21, [sp, #(6 * 16)]
	ldp	x22, x23, [sp, #(7 * 16)]
	ldp	x24, x25, [sp, #(8 * 16)]
	ldp	x26, x27, [sp, #(9 * 16)]
	ldr	x28, [sp, #(10 * 16)]
	add	sp, sp, #(11*16)
	ret

	EPILOGUE
1563
1564