1/**************************************************************************************************
2*                                                                                                 *
3* This file is part of BLASFEO.                                                                   *
4*                                                                                                 *
5* BLASFEO -- BLAS For Embedded Optimization.                                                      *
6* Copyright (C) 2019 by Gianluca Frison.                                                          *
7* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
8* All rights reserved.                                                                            *
9*                                                                                                 *
10* The 2-Clause BSD License                                                                        *
11*                                                                                                 *
12* Redistribution and use in source and binary forms, with or without                              *
13* modification, are permitted provided that the following conditions are met:                     *
14*                                                                                                 *
15* 1. Redistributions of source code must retain the above copyright notice, this                  *
16*    list of conditions and the following disclaimer.                                             *
17* 2. Redistributions in binary form must reproduce the above copyright notice,                    *
18*    this list of conditions and the following disclaimer in the documentation                    *
19*    and/or other materials provided with the distribution.                                       *
20*                                                                                                 *
21* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND                 *
22* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED                   *
23* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE                          *
24* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR                 *
25* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES                  *
26* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;                    *
27* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND                     *
28* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT                      *
29* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS                   *
30* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                                    *
31*                                                                                                 *
32* Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de                             *
33*                                                                                                 *
34**************************************************************************************************/
35
36
37
38// subroutine
39//
40// input arguments:
41// w8   <- k
42// x9   <- A
43// x10  <- B
44// x11  <- ldb
45//
46// output arguments:
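//
// note: the 4x4 result block is accumulated in registers v0-v7 (one pair of 2-double
// vectors per column of the block); v8-v15 hold temporary accumulators that are reduced
// into v0-v7 after the unrolled loop. A rough C sketch of the assumed operation (the
// names ii, jj, kk, D are illustrative only, and ldb is taken as the column stride of B
// in elements):
//
//	for(kk=0; kk<k; kk++)
//		for(jj=0; jj<4; jj++)
//			for(ii=0; ii<4; ii++)
//				D[ii+4*jj] += A[ii+4*kk] * B[jj+kk*ldb]; // nt: D += A * B^T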
47
48#if MACRO_LEVEL>=2
49	.macro INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4C
50#else
51	.align	4
52	FUN_START(inner_kernel_gemm_add_nt_4x4_lib4c)
53#endif
54
55
56
57#if defined(TARGET_ARMV8A_ARM_CORTEX_A57)
58
59
60
61	// early return
62	cmp		w8, #0
63	ble		2f // return
64
65	add		x12, x11, x11
66	add		x13, x12, x11
67	add		x14, x12, x12
68	add		x15, x13, x12
69	add		x16, x13, x13
70	add		x17, x14, x13
71
72	// prefetch
73	prfm	PLDL1KEEP, [x10]
74	prfm	PLDL1KEEP, [x10, x11]
75	prfm	PLDL1KEEP, [x10, x12]
76	prfm	PLDL1KEEP, [x10, x13]
77	prfm	PLDL1KEEP, [x9, #0]
78	prfm	PLDL1KEEP, [x9, #64]
79
80	// preload
81	ldp		q24, q25, [x10, #(0*8)]
82	add		x10, x10, x11
83	ldp		q26, q27, [x10, #(0*8)]
84	add		x10, x10, x11
85	ldp		q28, q29, [x10, #(0*8)]
86	add		x10, x10, x11
87	ldp		q30, q31, [x10, #(0*8)]
88	add		x10, x10, x11
89	ldp		q16, q17, [x9, #(0*8+0*32)]
90
91	cmp		w8, #4
92	ble		0f // consider clean up loop
93
94	// prefetch
95	prfm	PLDL1KEEP, [x10]
96	prfm	PLDL1KEEP, [x10, x11]
97	prfm	PLDL1KEEP, [x10, x12]
98	prfm	PLDL1KEEP, [x10, x13]
99	prfm	PLDL1KEEP, [x9, #128]
100	prfm	PLDL1KEEP, [x9, #192]
101
102	// zero tmp acc
103	fmov	d8, xzr
104	fmov    d9, d8
105	fmov    d10, d8
106	fmov    d11, d8
107	fmov    d12, d8
108	fmov    d13, d8
109	fmov    d14, d8
110	fmov    d15, d8
111
112	// main loop
1131:
114
115	// unroll 0
116	ldp		q18, q19, [x9, #(0*8+1*32)]
117	fmla	v0.2d, v16.2d, v24.2d[0]
118	fmla	v1.2d, v17.2d, v24.2d[0]
119	prfm	PLDL1KEEP, [x9, #256]
120//	prfm	PLDL1KEEP, [x9, #128]
121	fmla	v2.2d, v16.2d, v24.2d[1]
122	fmla	v3.2d, v17.2d, v24.2d[1]
123	prfm	PLDL1KEEP, [x9, #320]
124//	prfm	PLDL1KEEP, [x9, #192]
125	fmla	v4.2d, v16.2d, v25.2d[0]
126	fmla	v5.2d, v17.2d, v25.2d[0]
127//	prfm	PLDL1KEEP, [x10]
128	prfm	PLDL1KEEP, [x10, x14]
129	fmla	v6.2d, v16.2d, v25.2d[1]
130	fmla	v7.2d, v17.2d, v25.2d[1]
131
132	// unroll 1
133	ldp		q16, q17, [x9, #(0*8+2*32)]
134	fmla	v8.2d, v18.2d, v26.2d[0]
135	fmla	v9.2d, v19.2d, v26.2d[0]
136//	prfm	PLDL1KEEP, [x10, x11]
137	prfm	PLDL1KEEP, [x10, x15]
138	fmla	v10.2d, v18.2d, v26.2d[1]
139	fmla	v11.2d, v19.2d, v26.2d[1]
140//	prfm	PLDL1KEEP, [x10, x12]
141	prfm	PLDL1KEEP, [x10, x16]
142	fmla	v12.2d, v18.2d, v27.2d[0]
143	fmla	v13.2d, v19.2d, v27.2d[0]
144//	prfm	PLDL1KEEP, [x10, x13]
145	prfm	PLDL1KEEP, [x10, x17]
146	fmla	v14.2d, v18.2d, v27.2d[1]
147	fmla	v15.2d, v19.2d, v27.2d[1]
148
149	// unroll 2
150	ldp		q18, q19, [x9, #(0*8+3*32)]
151	fmla	v0.2d, v16.2d, v28.2d[0]
152	fmla	v1.2d, v17.2d, v28.2d[0]
153	add		x9, x9, #128
154	fmla	v2.2d, v16.2d, v28.2d[1]
155	fmla	v3.2d, v17.2d, v28.2d[1]
156	sub		w8, w8, #4
157	fmla	v4.2d, v16.2d, v29.2d[0]
158	fmla	v5.2d, v17.2d, v29.2d[0]
159	cmp		w8, #4
160	fmla	v6.2d, v16.2d, v29.2d[1]
161	fmla	v7.2d, v17.2d, v29.2d[1]
162
163	// unroll 3
164	ldp		q16, q17, [x9, #(0*8+0*32)]
165	fmla	v8.2d, v18.2d, v30.2d[0]
166	fmla	v9.2d, v19.2d, v30.2d[0]
167	ldp		q24, q25, [x10, #(0*8)]
168	fmla	v10.2d, v18.2d, v30.2d[1]
169	add		x10, x10, x11
170	fmla	v11.2d, v19.2d, v30.2d[1]
171	ldp		q26, q27, [x10, #(0*8)]
172	fmla	v12.2d, v18.2d, v31.2d[0]
173	add		x10, x10, x11
174	fmla	v13.2d, v19.2d, v31.2d[0]
175	ldp		q28, q29, [x10, #(0*8)]
176	fmla	v14.2d, v18.2d, v31.2d[1]
177	add		x10, x10, x11
178	fmla	v15.2d, v19.2d, v31.2d[1]
179	ldp		q30, q31, [x10, #(0*8)]
180	add		x10, x10, x11
181
182	bgt		1b
183
184
185	// reduce
186	fadd	v0.2d, v0.2d, v8.2d
187	fadd	v1.2d, v1.2d, v9.2d
188	fadd	v2.2d, v2.2d, v10.2d
189	fadd	v3.2d, v3.2d, v11.2d
190	fadd	v4.2d, v4.2d, v12.2d
191	fadd	v5.2d, v5.2d, v13.2d
192	fadd	v6.2d, v6.2d, v14.2d
193	fadd	v7.2d, v7.2d, v15.2d
194
195//	sub		x9, x9, #32
196//	sub		x10, x10, #32
197
1980:
199
200	cmp		w8, #3
201	ble		4f
202
203	// unroll 0
204	ldp		q18, q19, [x9, #(0*8+1*32)]
205	fmla	v0.2d, v16.2d, v24.2d[0]
206	fmla	v1.2d, v17.2d, v24.2d[0]
207//	prfm	PLDL1KEEP, [x9, #128]
208	fmla	v2.2d, v16.2d, v24.2d[1]
209	fmla	v3.2d, v17.2d, v24.2d[1]
210//	prfm	PLDL1KEEP, [x9, #192]
211	fmla	v4.2d, v16.2d, v25.2d[0]
212	fmla	v5.2d, v17.2d, v25.2d[0]
213//	prfm	PLDL1KEEP, [x10, #128]
214	fmla	v6.2d, v16.2d, v25.2d[1]
215	fmla	v7.2d, v17.2d, v25.2d[1]
216
217	// unroll 1
218//	prfm	PLDL1KEEP, [x10, #192]
219	fmla	v0.2d, v18.2d, v26.2d[0]
220	fmla	v1.2d, v19.2d, v26.2d[0]
221	ldp		q16, q17, [x9, #(0*8+2*32)]
222	fmla	v2.2d, v18.2d, v26.2d[1]
223	fmla	v3.2d, v19.2d, v26.2d[1]
224	fmla	v4.2d, v18.2d, v27.2d[0]
225	fmla	v5.2d, v19.2d, v27.2d[0]
226	sub		w8, w8, #4
227	fmla	v6.2d, v18.2d, v27.2d[1]
228	fmla	v7.2d, v19.2d, v27.2d[1]
229
230	// unroll 2
231	ldp		q18, q19, [x9, #(0*8+3*32)]
232	fmla	v0.2d, v16.2d, v28.2d[0]
233	fmla	v1.2d, v17.2d, v28.2d[0]
234	add		x9, x9, #128
235	fmla	v2.2d, v16.2d, v28.2d[1]
236	fmla	v3.2d, v17.2d, v28.2d[1]
237	fmla	v4.2d, v16.2d, v29.2d[0]
238	fmla	v5.2d, v17.2d, v29.2d[0]
239//	cmp		w8, #4
240	fmla	v6.2d, v16.2d, v29.2d[1]
241	fmla	v7.2d, v17.2d, v29.2d[1]
242
243	// unroll 3
244//	ldp		q16, q17, [x9, #(0*8+0*32)]
245	fmla	v0.2d, v18.2d, v30.2d[0]
246	fmla	v1.2d, v19.2d, v30.2d[0]
247//	ldp		q24, q25, [x10, #(0*8+0*32)]
248	fmla	v2.2d, v18.2d, v30.2d[1]
249//	add		x10, x10, x11
250	fmla	v3.2d, v19.2d, v30.2d[1]
251//	ldp		q26, q27, [x10, #(0*8+1*32)]
252	fmla	v4.2d, v18.2d, v31.2d[0]
253//	add		x10, x10, x11
254	fmla	v5.2d, v19.2d, v31.2d[0]
255//	ldp		q28, q29, [x10, #(0*8+2*32)]
256	fmla	v6.2d, v18.2d, v31.2d[1]
257//	add		x10, x10, x11
258	fmla	v7.2d, v19.2d, v31.2d[1]
259//	ldp		q30, q31, [x10, #(0*8+3*32)]
260//	add		x10, x10, x11
261
262	b		2f // return
263
2644: // consider clean1-up loop
265
266	cmp		w8, #0
267	ble		2f // return
268
269	sub		x10, x10, x11
270	sub		x10, x10, x11
271	sub		x10, x10, x11
272	sub		x10, x10, x11
273
2743: // clean1-up loop
275
276	// unroll 0
277	ld1		{v24.2d, v25.2d}, [x9], #32
278	ld1		{v28.2d, v29.2d}, [x10]
279	fmla	v0.2d, v24.2d, v28.2d[0]
280	fmla	v1.2d, v25.2d, v28.2d[0]
281	add		x10, x10, x11
282	fmla	v2.2d, v24.2d, v28.2d[1]
283	fmla	v3.2d, v25.2d, v28.2d[1]
284	sub		w8, w8, #1
285	fmla	v4.2d, v24.2d, v29.2d[0]
286	fmla	v5.2d, v25.2d, v29.2d[0]
287	cmp		w8, #0
288	fmla	v6.2d, v24.2d, v29.2d[1]
289	fmla	v7.2d, v25.2d, v29.2d[1]
290
291	bgt		3b
292
2932: // return
294
295
296
297#elif defined(TARGET_ARMV8A_ARM_CORTEX_A53)
298
299
300
301	// early return
302	cmp		w8, #0
303	ble		2f // return
304
305	// prefetch
306
307	// preload
308
309	cmp		w8, #4
310	ble		0f // consider clean up loop
311
312	// prefetch
313
314	// zero tmp acc
315
316	// main loop
3171:
318
319	// load 0 & 1 & 2 & 3
320	ld1		{v16.2d, v17.2d}, [x9], #32
321	ld1		{v24.2d, v25.2d}, [x10]
322	add		x10, x10, x11
323	ld1		{v18.2d, v19.2d}, [x9], #32
324	ld1		{v26.2d, v27.2d}, [x10]
325	add		x10, x10, x11
326	ld1		{v20.2d, v21.2d}, [x9], #32
327	ld1		{v28.2d, v29.2d}, [x10]
328	add		x10, x10, x11
329	ld1		{v22.2d, v23.2d}, [x9], #32
330	ld1		{v30.2d, v31.2d}, [x10]
331	add		x10, x10, x11
332
333	// unroll 0
334	fmla	v0.2d, v16.2d, v24.2d[0]
335	fmla	v1.2d, v17.2d, v24.2d[0]
336	fmla	v2.2d, v16.2d, v24.2d[1]
337	fmla	v3.2d, v17.2d, v24.2d[1]
338	fmla	v4.2d, v16.2d, v25.2d[0]
339	fmla	v5.2d, v17.2d, v25.2d[0]
340	fmla	v6.2d, v16.2d, v25.2d[1]
341	fmla	v7.2d, v17.2d, v25.2d[1]
342
343	// unroll 1
344	fmla	v0.2d, v18.2d, v26.2d[0]
345	fmla	v1.2d, v19.2d, v26.2d[0]
346	fmla	v2.2d, v18.2d, v26.2d[1]
347	fmla	v3.2d, v19.2d, v26.2d[1]
348	fmla	v4.2d, v18.2d, v27.2d[0]
349	fmla	v5.2d, v19.2d, v27.2d[0]
350	fmla	v6.2d, v18.2d, v27.2d[1]
351	fmla	v7.2d, v19.2d, v27.2d[1]
352	sub		w8, w8, #4
353
354	// unroll 2
355	fmla	v0.2d, v20.2d, v28.2d[0]
356	fmla	v1.2d, v21.2d, v28.2d[0]
357	fmla	v2.2d, v20.2d, v28.2d[1]
358	fmla	v3.2d, v21.2d, v28.2d[1]
359	fmla	v4.2d, v20.2d, v29.2d[0]
360	fmla	v5.2d, v21.2d, v29.2d[0]
361	fmla	v6.2d, v20.2d, v29.2d[1]
362	fmla	v7.2d, v21.2d, v29.2d[1]
363	cmp		w8, #4
364
365	// unroll 3
366	fmla	v0.2d, v22.2d, v30.2d[0]
367	fmla	v1.2d, v23.2d, v30.2d[0]
368	fmla	v2.2d, v22.2d, v30.2d[1]
369	fmla	v3.2d, v23.2d, v30.2d[1]
370	fmla	v4.2d, v22.2d, v31.2d[0]
371	fmla	v5.2d, v23.2d, v31.2d[0]
372	fmla	v6.2d, v22.2d, v31.2d[1]
373	fmla	v7.2d, v23.2d, v31.2d[1]
374
375	bgt		1b
376
377
378	// reduce
379
3800:
381
382	cmp		w8, #3
383	ble		4f
384
385	// load 0 & 1 & 2 & 3
386	ld1		{v16.2d, v17.2d}, [x9], #32
387	ld1		{v24.2d, v25.2d}, [x10]
388	add		x10, x10, x11
389	ld1		{v18.2d, v19.2d}, [x9], #32
390	ld1		{v26.2d, v27.2d}, [x10]
391	add		x10, x10, x11
392	ld1		{v20.2d, v21.2d}, [x9], #32
393	ld1		{v28.2d, v29.2d}, [x10]
394	add		x10, x10, x11
395	ld1		{v22.2d, v23.2d}, [x9], #32
396	ld1		{v30.2d, v31.2d}, [x10]
397	add		x10, x10, x11
398
399	// unroll 0
400	fmla	v0.2d, v16.2d, v24.2d[0]
401	fmla	v1.2d, v17.2d, v24.2d[0]
402	fmla	v2.2d, v16.2d, v24.2d[1]
403	fmla	v3.2d, v17.2d, v24.2d[1]
404	fmla	v4.2d, v16.2d, v25.2d[0]
405	fmla	v5.2d, v17.2d, v25.2d[0]
406	fmla	v6.2d, v16.2d, v25.2d[1]
407	fmla	v7.2d, v17.2d, v25.2d[1]
408
409	// unroll 1
410	fmla	v0.2d, v18.2d, v26.2d[0]
411	fmla	v1.2d, v19.2d, v26.2d[0]
412	fmla	v2.2d, v18.2d, v26.2d[1]
413	fmla	v3.2d, v19.2d, v26.2d[1]
414	fmla	v4.2d, v18.2d, v27.2d[0]
415	fmla	v5.2d, v19.2d, v27.2d[0]
416	fmla	v6.2d, v18.2d, v27.2d[1]
417	fmla	v7.2d, v19.2d, v27.2d[1]
418	sub		w8, w8, #4
419
420	// unroll 2
421	fmla	v0.2d, v20.2d, v28.2d[0]
422	fmla	v1.2d, v21.2d, v28.2d[0]
423	fmla	v2.2d, v20.2d, v28.2d[1]
424	fmla	v3.2d, v21.2d, v28.2d[1]
425	fmla	v4.2d, v20.2d, v29.2d[0]
426	fmla	v5.2d, v21.2d, v29.2d[0]
427	fmla	v6.2d, v20.2d, v29.2d[1]
428	fmla	v7.2d, v21.2d, v29.2d[1]
429
430	// unroll 3
431	fmla	v0.2d, v22.2d, v30.2d[0]
432	fmla	v1.2d, v23.2d, v30.2d[0]
433	fmla	v2.2d, v22.2d, v30.2d[1]
434	fmla	v3.2d, v23.2d, v30.2d[1]
435	fmla	v4.2d, v22.2d, v31.2d[0]
436	fmla	v5.2d, v23.2d, v31.2d[0]
437	fmla	v6.2d, v22.2d, v31.2d[1]
438	fmla	v7.2d, v23.2d, v31.2d[1]
439
440	b		2f // return
441
4424: // consider clean1-up loop
443
444	cmp		w8, #0
445	ble		2f // return
446
4473: // clean1-up loop
448
449	// unroll 0
450	ld1		{v24.2d, v25.2d}, [x9], #32
451	ld1		{v28.2d, v29.2d}, [x10]
452	fmla	v0.2d, v24.2d, v28.2d[0]
453	fmla	v1.2d, v25.2d, v28.2d[0]
454	add		x10, x10, x11
455	fmla	v2.2d, v24.2d, v28.2d[1]
456	fmla	v3.2d, v25.2d, v28.2d[1]
457	sub		w8, w8, #1
458	fmla	v4.2d, v24.2d, v29.2d[0]
459	fmla	v5.2d, v25.2d, v29.2d[0]
460	cmp		w8, #0
461	fmla	v6.2d, v24.2d, v29.2d[1]
462	fmla	v7.2d, v25.2d, v29.2d[1]
463
464	bgt		3b
465
4662: // return
467
468
469
470#endif // cortex a53
471
472
473
474#if MACRO_LEVEL>=2
475	.endm
476#else
477	ret
478
479	FUN_END(inner_kernel_gemm_add_nt_4x4_lib4c)
480#endif
481
482
483
484
485
486// subroutine
487//
488// input arguments:
489// w8   <- k
490// x9   <- A
491// x10  <- B
492// x11  <- ldb
493//
494// output arguments:
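//
// note: same operation as inner_kernel_gemm_add_nt_4x4_lib4c above, restricted to the
// first 3 columns of the result block (accumulated in v0-v5); each column of B is read
// as a q register plus a d register (3 doubles).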
495
496#if MACRO_LEVEL>=2
497	.macro INNER_KERNEL_GEMM_ADD_NT_4X3_LIB4C
498#else
499	.align	4
500	FUN_START(inner_kernel_gemm_add_nt_4x3_lib4c)
501#endif
502
503
504
505#if defined(TARGET_ARMV8A_ARM_CORTEX_A57)
506
507
508
509	// early return
510	cmp		w8, #0
511	ble		2f // return
512
513	add		x12, x11, x11
514	add		x13, x12, x11
515	add		x14, x12, x12
516	add		x15, x13, x12
517	add		x16, x13, x13
518	add		x17, x14, x13
519
520	// prefetch
521	prfm	PLDL1KEEP, [x10]
522	prfm	PLDL1KEEP, [x10, x11]
523	prfm	PLDL1KEEP, [x10, x12]
524	prfm	PLDL1KEEP, [x10, x13]
525	prfm	PLDL1KEEP, [x9, #0]
526	prfm	PLDL1KEEP, [x9, #64]
527
528	// preload
529	ldr		q24, [x10, #(0*8)]
530	ldr		d25, [x10, #(2*8)]
531	add		x10, x10, x11
532	ldr		q26, [x10, #(0*8)]
533	ldr		d27, [x10, #(2*8)]
534	add		x10, x10, x11
535	ldr		q28, [x10, #(0*8)]
536	ldr		d29, [x10, #(2*8)]
537	add		x10, x10, x11
538	ldr		q30, [x10, #(0*8)]
539	ldr		d31, [x10, #(2*8)]
540	add		x10, x10, x11
541	ldp		q16, q17, [x9, #(0*8+0*32)]
542
543	cmp		w8, #4
544	ble		0f // consider clean up loop
545
546	// prefetch
547	prfm	PLDL1KEEP, [x10]
548	prfm	PLDL1KEEP, [x10, x11]
549	prfm	PLDL1KEEP, [x10, x12]
550	prfm	PLDL1KEEP, [x10, x13]
551	prfm	PLDL1KEEP, [x9, #128]
552	prfm	PLDL1KEEP, [x9, #192]
553
554	// zero tmp acc
555	fmov	d8, xzr
556	fmov    d9, d8
557	fmov    d10, d8
558	fmov    d11, d8
559	fmov    d12, d8
560	fmov    d13, d8
561
562	// main loop
5631:
564
565	// unroll 0
566	ldp		q18, q19, [x9, #(0*8+1*32)]
567	fmla	v0.2d, v16.2d, v24.2d[0]
568	fmla	v1.2d, v17.2d, v24.2d[0]
569	prfm	PLDL1KEEP, [x9, #256]
570//	prfm	PLDL1KEEP, [x9, #128]
571	fmla	v2.2d, v16.2d, v24.2d[1]
572	fmla	v3.2d, v17.2d, v24.2d[1]
573	prfm	PLDL1KEEP, [x9, #320]
574//	prfm	PLDL1KEEP, [x9, #192]
575	fmla	v4.2d, v16.2d, v25.2d[0]
576	fmla	v5.2d, v17.2d, v25.2d[0]
577//	prfm	PLDL1KEEP, [x10]
578	prfm	PLDL1KEEP, [x10, x14]
579
580	// unroll 1
581	ldp		q16, q17, [x9, #(0*8+2*32)]
582	fmla	v8.2d, v18.2d, v26.2d[0]
583	fmla	v9.2d, v19.2d, v26.2d[0]
584//	prfm	PLDL1KEEP, [x10, x11]
585	prfm	PLDL1KEEP, [x10, x15]
586	fmla	v10.2d, v18.2d, v26.2d[1]
587	fmla	v11.2d, v19.2d, v26.2d[1]
588//	prfm	PLDL1KEEP, [x10, x12]
589	prfm	PLDL1KEEP, [x10, x16]
590	fmla	v12.2d, v18.2d, v27.2d[0]
591	fmla	v13.2d, v19.2d, v27.2d[0]
592//	prfm	PLDL1KEEP, [x10, x13]
593	prfm	PLDL1KEEP, [x10, x17]
594
595	// unroll 2
596	ldp		q18, q19, [x9, #(0*8+3*32)]
597	fmla	v0.2d, v16.2d, v28.2d[0]
598	fmla	v1.2d, v17.2d, v28.2d[0]
599	add		x9, x9, #128
600	fmla	v2.2d, v16.2d, v28.2d[1]
601	fmla	v3.2d, v17.2d, v28.2d[1]
602	sub		w8, w8, #4
603	fmla	v4.2d, v16.2d, v29.2d[0]
604	fmla	v5.2d, v17.2d, v29.2d[0]
605	cmp		w8, #4
606
607	// unroll 3
608	ldp		q16, q17, [x9, #(0*8+0*32)]
609	fmla	v8.2d, v18.2d, v30.2d[0]
610	fmla	v9.2d, v19.2d, v30.2d[0]
611	ldr		q24, [x10, #(0*8)]
612	ldr		d25, [x10, #(2*8)]
613	fmla	v10.2d, v18.2d, v30.2d[1]
614	add		x10, x10, x11
615	fmla	v11.2d, v19.2d, v30.2d[1]
616	ldr		q26, [x10, #(0*8)]
617	ldr		d27, [x10, #(2*8)]
618	fmla	v12.2d, v18.2d, v31.2d[0]
619	add		x10, x10, x11
620	fmla	v13.2d, v19.2d, v31.2d[0]
621	ldr		q28, [x10, #(0*8)]
622	ldr		d29, [x10, #(2*8)]
623	add		x10, x10, x11
624	ldr		q30, [x10, #(0*8)]
625	ldr		d31, [x10, #(2*8)]
626	add		x10, x10, x11
627
628	bgt		1b
629
630
631	// reduce
632	fadd	v0.2d, v0.2d, v8.2d
633	fadd	v1.2d, v1.2d, v9.2d
634	fadd	v2.2d, v2.2d, v10.2d
635	fadd	v3.2d, v3.2d, v11.2d
636	fadd	v4.2d, v4.2d, v12.2d
637	fadd	v5.2d, v5.2d, v13.2d
638
6390:
640
641	cmp		w8, #3
642	ble		4f
643
644	// unroll 0
645	ldp		q18, q19, [x9, #(0*8+1*32)]
646	fmla	v0.2d, v16.2d, v24.2d[0]
647	fmla	v1.2d, v17.2d, v24.2d[0]
648//	prfm	PLDL1KEEP, [x9, #128]
649	fmla	v2.2d, v16.2d, v24.2d[1]
650	fmla	v3.2d, v17.2d, v24.2d[1]
651//	prfm	PLDL1KEEP, [x9, #192]
652	fmla	v4.2d, v16.2d, v25.2d[0]
653	fmla	v5.2d, v17.2d, v25.2d[0]
654//	prfm	PLDL1KEEP, [x10, #128]
655
656	// unroll 1
657//	prfm	PLDL1KEEP, [x10, #192]
658	fmla	v0.2d, v18.2d, v26.2d[0]
659	fmla	v1.2d, v19.2d, v26.2d[0]
660	ldp		q16, q17, [x9, #(0*8+2*32)]
661	fmla	v2.2d, v18.2d, v26.2d[1]
662	fmla	v3.2d, v19.2d, v26.2d[1]
663	fmla	v4.2d, v18.2d, v27.2d[0]
664	fmla	v5.2d, v19.2d, v27.2d[0]
665	sub		w8, w8, #4
666
667	// unroll 2
668	ldp		q18, q19, [x9, #(0*8+3*32)]
669	fmla	v0.2d, v16.2d, v28.2d[0]
670	fmla	v1.2d, v17.2d, v28.2d[0]
671	add		x9, x9, #128
672	fmla	v2.2d, v16.2d, v28.2d[1]
673	fmla	v3.2d, v17.2d, v28.2d[1]
674	fmla	v4.2d, v16.2d, v29.2d[0]
675	fmla	v5.2d, v17.2d, v29.2d[0]
676//	cmp		w8, #4
677
678	// unroll 3
679//	ldp		q16, q17, [x9, #(0*8+0*32)]
680	fmla	v0.2d, v18.2d, v30.2d[0]
681	fmla	v1.2d, v19.2d, v30.2d[0]
682//	ldp		q24, q25, [x10, #(0*8+0*32)]
683	fmla	v2.2d, v18.2d, v30.2d[1]
684//	add		x10, x10, x11
685	fmla	v3.2d, v19.2d, v30.2d[1]
686//	ldp		q26, q27, [x10, #(0*8+1*32)]
687	fmla	v4.2d, v18.2d, v31.2d[0]
688//	add		x10, x10, x11
689	fmla	v5.2d, v19.2d, v31.2d[0]
690//	ldp		q28, q29, [x10, #(0*8+2*32)]
691//	add		x10, x10, x11
692//	ldp		q30, q31, [x10, #(0*8+3*32)]
693//	add		x10, x10, x11
694
695	b		2f // return
696
6974: // consider clean1-up loop
698
699	cmp		w8, #0
700	ble		2f // return
701
702	sub		x10, x10, x11
703	sub		x10, x10, x11
704	sub		x10, x10, x11
705	sub		x10, x10, x11
706
7073: // clean1-up loop
708
709	// unroll 0
710	ld1		{v24.2d, v25.2d}, [x9], #32
711	ldr		q28, [x10, #(0*8)]
712	ldr		d29, [x10, #(2*8)]
713	fmla	v0.2d, v24.2d, v28.2d[0]
714	fmla	v1.2d, v25.2d, v28.2d[0]
715	add		x10, x10, x11
716	fmla	v2.2d, v24.2d, v28.2d[1]
717	fmla	v3.2d, v25.2d, v28.2d[1]
718	sub		w8, w8, #1
719	fmla	v4.2d, v24.2d, v29.2d[0]
720	fmla	v5.2d, v25.2d, v29.2d[0]
721	cmp		w8, #0
722
723	bgt		3b
724
7252: // return
726
727
728
729#elif defined(TARGET_ARMV8A_ARM_CORTEX_A53)
730
731
732
733	// early return
734	cmp		w8, #0
735	ble		2f // return
736
737	// prefetch
738
739	// preload
740
741	cmp		w8, #4
742	ble		0f // consider clean up loop
743
744	// prefetch
745
746	// zero tmp acc
747
748	// main loop
7491:
750
751	// load 0 & 1 & 2 & 3
752	ld1		{v16.2d, v17.2d}, [x9], #32
753	ldr		q24, [x10, #0]
754	ldr		d25, [x10, #16]
755	add		x10, x10, x11
756	ld1		{v18.2d, v19.2d}, [x9], #32
757	ldr		q26, [x10, #0]
758	ldr		d27, [x10, #16]
759	add		x10, x10, x11
760	ld1		{v20.2d, v21.2d}, [x9], #32
761	ldr		q28, [x10, #0]
762	ldr		d29, [x10, #16]
763	add		x10, x10, x11
764	ld1		{v22.2d, v23.2d}, [x9], #32
765	ldr		q30, [x10, #0]
766	ldr		d31, [x10, #16]
767	add		x10, x10, x11
768
769	// unroll 0
770	fmla	v0.2d, v16.2d, v24.2d[0]
771	fmla	v1.2d, v17.2d, v24.2d[0]
772	fmla	v2.2d, v16.2d, v24.2d[1]
773	fmla	v3.2d, v17.2d, v24.2d[1]
774	fmla	v4.2d, v16.2d, v25.2d[0]
775	fmla	v5.2d, v17.2d, v25.2d[0]
776
777	// unroll 1
778	fmla	v0.2d, v18.2d, v26.2d[0]
779	fmla	v1.2d, v19.2d, v26.2d[0]
780	fmla	v2.2d, v18.2d, v26.2d[1]
781	fmla	v3.2d, v19.2d, v26.2d[1]
782	fmla	v4.2d, v18.2d, v27.2d[0]
783	fmla	v5.2d, v19.2d, v27.2d[0]
784	sub		w8, w8, #4
785
786	// unroll 2
787	fmla	v0.2d, v20.2d, v28.2d[0]
788	fmla	v1.2d, v21.2d, v28.2d[0]
789	fmla	v2.2d, v20.2d, v28.2d[1]
790	fmla	v3.2d, v21.2d, v28.2d[1]
791	fmla	v4.2d, v20.2d, v29.2d[0]
792	fmla	v5.2d, v21.2d, v29.2d[0]
793	cmp		w8, #4
794
795	// unroll 3
796	fmla	v0.2d, v22.2d, v30.2d[0]
797	fmla	v1.2d, v23.2d, v30.2d[0]
798	fmla	v2.2d, v22.2d, v30.2d[1]
799	fmla	v3.2d, v23.2d, v30.2d[1]
800	fmla	v4.2d, v22.2d, v31.2d[0]
801	fmla	v5.2d, v23.2d, v31.2d[0]
802
803	bgt		1b
804
805
806	// reduce
807
8080:
809
810	cmp		w8, #3
811	ble		4f
812
813	// load 0 & 1 & 2 & 3
814	ld1		{v16.2d, v17.2d}, [x9], #32
815	ldr		q24, [x10, #0]
816	ldr		d25, [x10, #16]
817	add		x10, x10, x11
818	ld1		{v18.2d, v19.2d}, [x9], #32
819	ldr		q26, [x10, #0]
820	ldr		d27, [x10, #16]
821	add		x10, x10, x11
822	ld1		{v20.2d, v21.2d}, [x9], #32
823	ldr		q28, [x10, #0]
824	ldr		d29, [x10, #16]
825	add		x10, x10, x11
826	ld1		{v22.2d, v23.2d}, [x9], #32
827	ldr		q30, [x10, #0]
828	ldr		d31, [x10, #16]
829	add		x10, x10, x11
830
831	// unroll 0
832	fmla	v0.2d, v16.2d, v24.2d[0]
833	fmla	v1.2d, v17.2d, v24.2d[0]
834	fmla	v2.2d, v16.2d, v24.2d[1]
835	fmla	v3.2d, v17.2d, v24.2d[1]
836	fmla	v4.2d, v16.2d, v25.2d[0]
837	fmla	v5.2d, v17.2d, v25.2d[0]
838
839	// unroll 1
840	fmla	v0.2d, v18.2d, v26.2d[0]
841	fmla	v1.2d, v19.2d, v26.2d[0]
842	fmla	v2.2d, v18.2d, v26.2d[1]
843	fmla	v3.2d, v19.2d, v26.2d[1]
844	fmla	v4.2d, v18.2d, v27.2d[0]
845	fmla	v5.2d, v19.2d, v27.2d[0]
846	sub		w8, w8, #4
847
848	// unroll 2
849	fmla	v0.2d, v20.2d, v28.2d[0]
850	fmla	v1.2d, v21.2d, v28.2d[0]
851	fmla	v2.2d, v20.2d, v28.2d[1]
852	fmla	v3.2d, v21.2d, v28.2d[1]
853	fmla	v4.2d, v20.2d, v29.2d[0]
854	fmla	v5.2d, v21.2d, v29.2d[0]
855
856	// unroll 3
857	fmla	v0.2d, v22.2d, v30.2d[0]
858	fmla	v1.2d, v23.2d, v30.2d[0]
859	fmla	v2.2d, v22.2d, v30.2d[1]
860	fmla	v3.2d, v23.2d, v30.2d[1]
861	fmla	v4.2d, v22.2d, v31.2d[0]
862	fmla	v5.2d, v23.2d, v31.2d[0]
863
864	b		2f // return
865
8664: // consider clean1-up loop
867
868	cmp		w8, #0
869	ble		2f // return
870
8713: // clean1-up loop
872
873	// unroll 0
874	ld1		{v24.2d, v25.2d}, [x9], #32
875	ldr		q28, [x10, #0]
876	ldr		d29, [x10, #16]
877	fmla	v0.2d, v24.2d, v28.2d[0]
878	fmla	v1.2d, v25.2d, v28.2d[0]
879	add		x10, x10, x11
880	fmla	v2.2d, v24.2d, v28.2d[1]
881	fmla	v3.2d, v25.2d, v28.2d[1]
882	sub		w8, w8, #1
883	fmla	v4.2d, v24.2d, v29.2d[0]
884	fmla	v5.2d, v25.2d, v29.2d[0]
885	cmp		w8, #0
886
887	bgt		3b
888
8892: // return
890
891
892
893#endif // cortex a53
894
895
896
897#if MACRO_LEVEL>=2
898	.endm
899#else
900	ret
901
902	FUN_END(inner_kernel_gemm_add_nt_4x3_lib4c)
903#endif
904
905
906
907
908
909// subroutine
910//
911// input arguments:
912// w8   <- k
913// x9   <- A
914// x10  <- B
915// x11  <- ldb
916//
917// output arguments:
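//
// note: same operation as inner_kernel_gemm_add_nt_4x4_lib4c above, restricted to the
// first 2 columns of the result block (accumulated in v0-v3); a single q register
// (2 doubles) is read per column of B.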
918
919#if MACRO_LEVEL>=2
920	.macro INNER_KERNEL_GEMM_ADD_NT_4X2_LIB4C
921#else
922	.align	4
923	FUN_START(inner_kernel_gemm_add_nt_4x2_lib4c)
924#endif
925
926
927
928#if defined(TARGET_ARMV8A_ARM_CORTEX_A57)
929
930
931
932	// early return
933	cmp		w8, #0
934	ble		2f // return
935
936	add		x12, x11, x11
937	add		x13, x12, x11
938	add		x14, x12, x12
939	add		x15, x13, x12
940	add		x16, x13, x13
941	add		x17, x14, x13
942
943	// prefetch
944	prfm	PLDL1KEEP, [x10]
945	prfm	PLDL1KEEP, [x10, x11]
946	prfm	PLDL1KEEP, [x10, x12]
947	prfm	PLDL1KEEP, [x10, x13]
948	prfm	PLDL1KEEP, [x9, #0]
949	prfm	PLDL1KEEP, [x9, #64]
950
951	// preload
952	ldr		q24, [x10, #(0*8)]
953	add		x10, x10, x11
954	ldr		q26, [x10, #(0*8)]
955	add		x10, x10, x11
956	ldr		q28, [x10, #(0*8)]
957	add		x10, x10, x11
958	ldr		q30, [x10, #(0*8)]
959	add		x10, x10, x11
960	ldp		q16, q17, [x9, #(0*8+0*32)]
961
962	cmp		w8, #4
963	ble		0f // consider clean up loop
964
965	// prefetch
966	prfm	PLDL1KEEP, [x10]
967	prfm	PLDL1KEEP, [x10, x11]
968	prfm	PLDL1KEEP, [x10, x12]
969	prfm	PLDL1KEEP, [x10, x13]
970	prfm	PLDL1KEEP, [x9, #128]
971	prfm	PLDL1KEEP, [x9, #192]
972
973	// zero tmp acc
974	fmov	d8, xzr
975	fmov    d9, d8
976	fmov    d10, d8
977	fmov    d11, d8
978
979	// main loop
9801:
981
982	// unroll 0
983	ldp		q18, q19, [x9, #(0*8+1*32)]
984	fmla	v0.2d, v16.2d, v24.2d[0]
985	fmla	v1.2d, v17.2d, v24.2d[0]
986	prfm	PLDL1KEEP, [x9, #256]
987//	prfm	PLDL1KEEP, [x9, #128]
988	fmla	v2.2d, v16.2d, v24.2d[1]
989	fmla	v3.2d, v17.2d, v24.2d[1]
990	prfm	PLDL1KEEP, [x9, #320]
991//	prfm	PLDL1KEEP, [x9, #192]
992//	prfm	PLDL1KEEP, [x10]
993	prfm	PLDL1KEEP, [x10, x14]
994
995	// unroll 1
996	ldp		q16, q17, [x9, #(0*8+2*32)]
997	fmla	v8.2d, v18.2d, v26.2d[0]
998	fmla	v9.2d, v19.2d, v26.2d[0]
999//	prfm	PLDL1KEEP, [x10, x11]
1000	prfm	PLDL1KEEP, [x10, x15]
1001	fmla	v10.2d, v18.2d, v26.2d[1]
1002	fmla	v11.2d, v19.2d, v26.2d[1]
1003//	prfm	PLDL1KEEP, [x10, x12]
1004	prfm	PLDL1KEEP, [x10, x16]
1005//	prfm	PLDL1KEEP, [x10, x13]
1006	prfm	PLDL1KEEP, [x10, x17]
1007
1008	// unroll 2
1009	ldp		q18, q19, [x9, #(0*8+3*32)]
1010	fmla	v0.2d, v16.2d, v28.2d[0]
1011	fmla	v1.2d, v17.2d, v28.2d[0]
1012	add		x9, x9, #128
1013	fmla	v2.2d, v16.2d, v28.2d[1]
1014	fmla	v3.2d, v17.2d, v28.2d[1]
1015	sub		w8, w8, #4
1016	cmp		w8, #4
1017
1018	// unroll 3
1019	ldp		q16, q17, [x9, #(0*8+0*32)]
1020	fmla	v8.2d, v18.2d, v30.2d[0]
1021	fmla	v9.2d, v19.2d, v30.2d[0]
1022	ldr		q24, [x10, #(0*8)]
1023	fmla	v10.2d, v18.2d, v30.2d[1]
1024	add		x10, x10, x11
1025	fmla	v11.2d, v19.2d, v30.2d[1]
1026	ldr		q26, [x10, #(0*8)]
1027	add		x10, x10, x11
1028	ldr		q28, [x10, #(0*8)]
1029	add		x10, x10, x11
1030	ldr		q30, [x10, #(0*8)]
1031	add		x10, x10, x11
1032
1033	bgt		1b
1034
1035
1036	// reduce
1037	fadd	v0.2d, v0.2d, v8.2d
1038	fadd	v1.2d, v1.2d, v9.2d
1039	fadd	v2.2d, v2.2d, v10.2d
1040	fadd	v3.2d, v3.2d, v11.2d
1041
10420:
1043
1044	cmp		w8, #3
1045	ble		4f
1046
1047	// unroll 0
1048	ldp		q18, q19, [x9, #(0*8+1*32)]
1049	fmla	v0.2d, v16.2d, v24.2d[0]
1050	fmla	v1.2d, v17.2d, v24.2d[0]
1051//	prfm	PLDL1KEEP, [x9, #128]
1052	fmla	v2.2d, v16.2d, v24.2d[1]
1053	fmla	v3.2d, v17.2d, v24.2d[1]
1054//	prfm	PLDL1KEEP, [x9, #192]
1055//	prfm	PLDL1KEEP, [x10, #128]
1056
1057	// unroll 1
1058//	prfm	PLDL1KEEP, [x10, #192]
1059	fmla	v0.2d, v18.2d, v26.2d[0]
1060	fmla	v1.2d, v19.2d, v26.2d[0]
1061	ldp		q16, q17, [x9, #(0*8+2*32)]
1062	fmla	v2.2d, v18.2d, v26.2d[1]
1063	fmla	v3.2d, v19.2d, v26.2d[1]
1064	sub		w8, w8, #4
1065
1066	// unroll 2
1067	ldp		q18, q19, [x9, #(0*8+3*32)]
1068	fmla	v0.2d, v16.2d, v28.2d[0]
1069	fmla	v1.2d, v17.2d, v28.2d[0]
1070	add		x9, x9, #128
1071	fmla	v2.2d, v16.2d, v28.2d[1]
1072	fmla	v3.2d, v17.2d, v28.2d[1]
1073//	cmp		w8, #4
1074
1075	// unroll 3
1076//	ldp		q16, q17, [x9, #(0*8+0*32)]
1077	fmla	v0.2d, v18.2d, v30.2d[0]
1078	fmla	v1.2d, v19.2d, v30.2d[0]
1079//	ldp		q24, q25, [x10, #(0*8+0*32)]
1080	fmla	v2.2d, v18.2d, v30.2d[1]
1081//	add		x10, x10, x11
1082	fmla	v3.2d, v19.2d, v30.2d[1]
1083//	ldp		q26, q27, [x10, #(0*8+1*32)]
1084//	add		x10, x10, x11
1085//	ldp		q28, q29, [x10, #(0*8+2*32)]
1086//	add		x10, x10, x11
1087//	ldp		q30, q31, [x10, #(0*8+3*32)]
1088//	add		x10, x10, x11
1089
1090	b		2f // return
1091
10924: // consider clean1-up loop
1093
1094	cmp		w8, #0
1095	ble		2f // return
1096
1097	sub		x10, x10, x11
1098	sub		x10, x10, x11
1099	sub		x10, x10, x11
1100	sub		x10, x10, x11
1101
11023: // clean1-up loop
1103
1104	// unroll 0
1105	ld1		{v24.2d, v25.2d}, [x9], #32
1106	ldr		q28, [x10, #(0*8)]
1107	fmla	v0.2d, v24.2d, v28.2d[0]
1108	fmla	v1.2d, v25.2d, v28.2d[0]
1109	add		x10, x10, x11
1110	fmla	v2.2d, v24.2d, v28.2d[1]
1111	fmla	v3.2d, v25.2d, v28.2d[1]
1112	sub		w8, w8, #1
1113	cmp		w8, #0
1114
1115	bgt		3b
1116
11172: // return
1118
1119
1120
1121#elif defined(TARGET_ARMV8A_ARM_CORTEX_A53)
1122
1123
1124
1125	// early return
1126	cmp		w8, #0
1127	ble		2f // return
1128
1129	// prefetch
1130
1131	// preload
1132
1133	cmp		w8, #4
1134	ble		0f // consider clean up loop
1135
1136	// prefetch
1137
1138	// zero tmp acc
1139
1140	// main loop
11411:
1142
1143	// load 0 & 1 & 2 & 3
1144	ld1		{v16.2d, v17.2d}, [x9], #32
1145	ldr		q24, [x10, #0]
1146	add		x10, x10, x11
1147	ld1		{v18.2d, v19.2d}, [x9], #32
1148	ldr		q26, [x10, #0]
1149	add		x10, x10, x11
1150	ld1		{v20.2d, v21.2d}, [x9], #32
1151	ldr		q28, [x10, #0]
1152	add		x10, x10, x11
1153	ld1		{v22.2d, v23.2d}, [x9], #32
1154	ldr		q30, [x10, #0]
1155	add		x10, x10, x11
1156
1157	// unroll 0
1158	fmla	v0.2d, v16.2d, v24.2d[0]
1159	fmla	v1.2d, v17.2d, v24.2d[0]
1160	fmla	v2.2d, v16.2d, v24.2d[1]
1161	fmla	v3.2d, v17.2d, v24.2d[1]
1162
1163	// unroll 1
1164	fmla	v0.2d, v18.2d, v26.2d[0]
1165	fmla	v1.2d, v19.2d, v26.2d[0]
1166	fmla	v2.2d, v18.2d, v26.2d[1]
1167	fmla	v3.2d, v19.2d, v26.2d[1]
1168	sub		w8, w8, #4
1169
1170	// unroll 2
1171	fmla	v0.2d, v20.2d, v28.2d[0]
1172	fmla	v1.2d, v21.2d, v28.2d[0]
1173	fmla	v2.2d, v20.2d, v28.2d[1]
1174	fmla	v3.2d, v21.2d, v28.2d[1]
1175	cmp		w8, #4
1176
1177	// unroll 3
1178	fmla	v0.2d, v22.2d, v30.2d[0]
1179	fmla	v1.2d, v23.2d, v30.2d[0]
1180	fmla	v2.2d, v22.2d, v30.2d[1]
1181	fmla	v3.2d, v23.2d, v30.2d[1]
1182
1183	bgt		1b
1184
1185
1186	// reduce
1187
11880:
1189
1190	cmp		w8, #3
1191	ble		4f
1192
1193	// load 0 & 1 & 2 & 3
1194	ld1		{v16.2d, v17.2d}, [x9], #32
1195	ldr		q24, [x10, #0]
1196	add		x10, x10, x11
1197	ld1		{v18.2d, v19.2d}, [x9], #32
1198	ldr		q26, [x10, #0]
1199	add		x10, x10, x11
1200	ld1		{v20.2d, v21.2d}, [x9], #32
1201	ldr		q28, [x10, #0]
1202	add		x10, x10, x11
1203	ld1		{v22.2d, v23.2d}, [x9], #32
1204	ldr		q30, [x10, #0]
1205	add		x10, x10, x11
1206
1207	// unroll 0
1208	fmla	v0.2d, v16.2d, v24.2d[0]
1209	fmla	v1.2d, v17.2d, v24.2d[0]
1210	fmla	v2.2d, v16.2d, v24.2d[1]
1211	fmla	v3.2d, v17.2d, v24.2d[1]
1212
1213	// unroll 1
1214	fmla	v0.2d, v18.2d, v26.2d[0]
1215	fmla	v1.2d, v19.2d, v26.2d[0]
1216	fmla	v2.2d, v18.2d, v26.2d[1]
1217	fmla	v3.2d, v19.2d, v26.2d[1]
1218	sub		w8, w8, #4
1219
1220	// unroll 2
1221	fmla	v0.2d, v20.2d, v28.2d[0]
1222	fmla	v1.2d, v21.2d, v28.2d[0]
1223	fmla	v2.2d, v20.2d, v28.2d[1]
1224	fmla	v3.2d, v21.2d, v28.2d[1]
1225
1226	// unroll 3
1227	fmla	v0.2d, v22.2d, v30.2d[0]
1228	fmla	v1.2d, v23.2d, v30.2d[0]
1229	fmla	v2.2d, v22.2d, v30.2d[1]
1230	fmla	v3.2d, v23.2d, v30.2d[1]
1231
1232	b		2f // return
1233
12344: // consider clean1-up loop
1235
1236	cmp		w8, #0
1237	ble		2f // return
1238
12393: // clean1-up loop
1240
1241	// unroll 0
1242	ld1		{v24.2d, v25.2d}, [x9], #32
1243	ldr		q28, [x10, #0]
1244	fmla	v0.2d, v24.2d, v28.2d[0]
1245	fmla	v1.2d, v25.2d, v28.2d[0]
1246	add		x10, x10, x11
1247	fmla	v2.2d, v24.2d, v28.2d[1]
1248	fmla	v3.2d, v25.2d, v28.2d[1]
1249	sub		w8, w8, #1
1250	cmp		w8, #0
1251
1252	bgt		3b
1253
12542: // return
1255
1256
1257
1258#endif // cortex a53
1259
1260
1261
1262#if MACRO_LEVEL>=2
1263	.endm
1264#else
1265	ret
1266
1267	FUN_END(inner_kernel_gemm_add_nt_4x2_lib4c)
1268#endif
1269
1270
1271
1272
1273
1274// subroutine
1275//
1276// input arguments:
1277// w8   <- k
1278// x9   <- A
1279// x10  <- B
1280// x11  <- ldb
1281//
1282// output arguments:
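//
// note: same operation as inner_kernel_gemm_add_nt_4x4_lib4c above, restricted to the
// first column of the result block (accumulated in v0-v1); a single d register
// (1 double) is read per column of B.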
1283
1284#if MACRO_LEVEL>=2
1285	.macro INNER_KERNEL_GEMM_ADD_NT_4X1_LIB4C
1286#else
1287	.align	4
1288	FUN_START(inner_kernel_gemm_add_nt_4x1_lib4c)
1289#endif
1290
1291
1292
1293#if defined(TARGET_ARMV8A_ARM_CORTEX_A57)
1294
1295
1296
1297	// early return
1298	cmp		w8, #0
1299	ble		2f // return
1300
1301	add		x12, x11, x11
1302	add		x13, x12, x11
1303	add		x14, x12, x12
1304	add		x15, x13, x12
1305	add		x16, x13, x13
1306	add		x17, x14, x13
1307
1308	// prefetch
1309	prfm	PLDL1KEEP, [x10]
1310	prfm	PLDL1KEEP, [x10, x11]
1311	prfm	PLDL1KEEP, [x10, x12]
1312	prfm	PLDL1KEEP, [x10, x13]
1313	prfm	PLDL1KEEP, [x9, #0]
1314	prfm	PLDL1KEEP, [x9, #64]
1315
1316	// preload
1317	ldr		d24, [x10, #(0*8)]
1318	add		x10, x10, x11
1319	ldr		d26, [x10, #(0*8)]
1320	add		x10, x10, x11
1321	ldr		d28, [x10, #(0*8)]
1322	add		x10, x10, x11
1323	ldr		d30, [x10, #(0*8)]
1324	add		x10, x10, x11
1325	ldp		q16, q17, [x9, #(0*8+0*32)]
1326
1327	cmp		w8, #4
1328	ble		0f // consider clean up loop
1329
1330	// prefetch
1331	prfm	PLDL1KEEP, [x10]
1332	prfm	PLDL1KEEP, [x10, x11]
1333	prfm	PLDL1KEEP, [x10, x12]
1334	prfm	PLDL1KEEP, [x10, x13]
1335	prfm	PLDL1KEEP, [x9, #128]
1336	prfm	PLDL1KEEP, [x9, #192]
1337
1338	// zero tmp acc
1339	fmov	d8, xzr
1340	fmov    d9, d8
1341
1342	// main loop
13431:
1344
1345	// unroll 0
1346	ldp		q18, q19, [x9, #(0*8+1*32)]
1347	fmla	v0.2d, v16.2d, v24.2d[0]
1348	fmla	v1.2d, v17.2d, v24.2d[0]
1349	prfm	PLDL1KEEP, [x9, #256]
1350//	prfm	PLDL1KEEP, [x9, #128]
1351	prfm	PLDL1KEEP, [x9, #320]
1352//	prfm	PLDL1KEEP, [x9, #192]
1353//	prfm	PLDL1KEEP, [x10]
1354	prfm	PLDL1KEEP, [x10, x14]
1355
1356	// unroll 1
1357	ldp		q16, q17, [x9, #(0*8+2*32)]
1358	fmla	v8.2d, v18.2d, v26.2d[0]
1359	fmla	v9.2d, v19.2d, v26.2d[0]
1360//	prfm	PLDL1KEEP, [x10, x11]
1361	prfm	PLDL1KEEP, [x10, x15]
1362//	prfm	PLDL1KEEP, [x10, x12]
1363	prfm	PLDL1KEEP, [x10, x16]
1364//	prfm	PLDL1KEEP, [x10, x13]
1365	prfm	PLDL1KEEP, [x10, x17]
1366
1367	// unroll 2
1368	ldp		q18, q19, [x9, #(0*8+3*32)]
1369	fmla	v0.2d, v16.2d, v28.2d[0]
1370	fmla	v1.2d, v17.2d, v28.2d[0]
1371	add		x9, x9, #128
1372	sub		w8, w8, #4
1373	cmp		w8, #4
1374
1375	// unroll 3
1376	ldp		q16, q17, [x9, #(0*8+0*32)]
1377	fmla	v8.2d, v18.2d, v30.2d[0]
1378	fmla	v9.2d, v19.2d, v30.2d[0]
1379	ldr		d24, [x10, #(0*8)]
1380	add		x10, x10, x11
1381	ldr		d26, [x10, #(0*8)]
1382	add		x10, x10, x11
1383	ldr		d28, [x10, #(0*8)]
1384	add		x10, x10, x11
1385	ldr		d30, [x10, #(0*8)]
1386	add		x10, x10, x11
1387
1388	bgt		1b
1389
1390
1391	// reduce
1392	fadd	v0.2d, v0.2d, v8.2d
1393	fadd	v1.2d, v1.2d, v9.2d
1394
13950:
1396
1397	cmp		w8, #3
1398	ble		4f
1399
1400	// unroll 0
1401	ldp		q18, q19, [x9, #(0*8+1*32)]
1402	fmla	v0.2d, v16.2d, v24.2d[0]
1403	fmla	v1.2d, v17.2d, v24.2d[0]
1404//	prfm	PLDL1KEEP, [x9, #128]
1405//	prfm	PLDL1KEEP, [x9, #192]
1406//	prfm	PLDL1KEEP, [x10, #128]
1407
1408	// unroll 1
1409//	prfm	PLDL1KEEP, [x10, #192]
1410	fmla	v0.2d, v18.2d, v26.2d[0]
1411	fmla	v1.2d, v19.2d, v26.2d[0]
1412	ldp		q16, q17, [x9, #(0*8+2*32)]
1413	sub		w8, w8, #4
1414
1415	// unroll 2
1416	ldp		q18, q19, [x9, #(0*8+3*32)]
1417	fmla	v0.2d, v16.2d, v28.2d[0]
1418	fmla	v1.2d, v17.2d, v28.2d[0]
1419	add		x9, x9, #128
1420//	cmp		w8, #4
1421
1422	// unroll 3
1423//	ldp		q16, q17, [x9, #(0*8+0*32)]
1424	fmla	v0.2d, v18.2d, v30.2d[0]
1425	fmla	v1.2d, v19.2d, v30.2d[0]
1426//	ldp		q24, q25, [x10, #(0*8+0*32)]
1427//	add		x10, x10, x11
1428//	ldp		q26, q27, [x10, #(0*8+1*32)]
1429//	add		x10, x10, x11
1430//	ldp		q28, q29, [x10, #(0*8+2*32)]
1431//	add		x10, x10, x11
1432//	ldp		q30, q31, [x10, #(0*8+3*32)]
1433//	add		x10, x10, x11
1434
1435	b		2f // return
1436
14374: // consider clean1-up loop
1438
1439	cmp		w8, #0
1440	ble		2f // return
1441
1442	sub		x10, x10, x11
1443	sub		x10, x10, x11
1444	sub		x10, x10, x11
1445	sub		x10, x10, x11
1446
14473: // clean1-up loop
1448
1449	// unroll 0
1450	ld1		{v24.2d, v25.2d}, [x9], #32
1451	ldr		d28, [x10, #(0*8)]
1452	fmla	v0.2d, v24.2d, v28.2d[0]
1453	fmla	v1.2d, v25.2d, v28.2d[0]
1454	add		x10, x10, x11
1455	sub		w8, w8, #1
1456	cmp		w8, #0
1457
1458	bgt		3b
1459
14602: // return
1461
1462
1463
1464#elif defined(TARGET_ARMV8A_ARM_CORTEX_A53)
1465
1466
1467
1468	// early return
1469	cmp		w8, #0
1470	ble		2f // return
1471
1472	// prefetch
1473
1474	// preload
1475
1476	cmp		w8, #4
1477	ble		0f // consider clean up loop
1478
1479	// prefetch
1480
1481	// zero tmp acc
1482
1483	// main loop
14841:
1485
1486	// load 0 & 1 & 2 & 3
1487	ld1		{v16.2d, v17.2d}, [x9], #32
1488	ldr		d24, [x10, #0]
1489	add		x10, x10, x11
1490	ld1		{v18.2d, v19.2d}, [x9], #32
1491	ldr		d26, [x10, #0]
1492	add		x10, x10, x11
1493	ld1		{v20.2d, v21.2d}, [x9], #32
1494	ldr		d28, [x10, #0]
1495	add		x10, x10, x11
1496	ld1		{v22.2d, v23.2d}, [x9], #32
1497	ldr		d30, [x10, #0]
1498	add		x10, x10, x11
1499
1500	// unroll 0
1501	fmla	v0.2d, v16.2d, v24.2d[0]
1502	fmla	v1.2d, v17.2d, v24.2d[0]
1503
1504	// unroll 1
1505	fmla	v0.2d, v18.2d, v26.2d[0]
1506	fmla	v1.2d, v19.2d, v26.2d[0]
1507	sub		w8, w8, #4
1508
1509	// unroll 2
1510	fmla	v0.2d, v20.2d, v28.2d[0]
1511	fmla	v1.2d, v21.2d, v28.2d[0]
1512	cmp		w8, #4
1513
1514	// unroll 3
1515	fmla	v0.2d, v22.2d, v30.2d[0]
1516	fmla	v1.2d, v23.2d, v30.2d[0]
1517
1518	bgt		1b
1519
1520
1521	// reduce
1522
15230:
1524
1525	cmp		w8, #3
1526	ble		4f
1527
1528	// load 0 & 1 & 2 & 3
1529	ld1		{v16.2d, v17.2d}, [x9], #32
1530	ldr		d24, [x10, #0]
1531	add		x10, x10, x11
1532	ld1		{v18.2d, v19.2d}, [x9], #32
1533	ldr		d26, [x10, #0]
1534	add		x10, x10, x11
1535	ld1		{v20.2d, v21.2d}, [x9], #32
1536	ldr		d28, [x10, #0]
1537	add		x10, x10, x11
1538	ld1		{v22.2d, v23.2d}, [x9], #32
1539	ldr		d30, [x10, #0]
1540	add		x10, x10, x11
1541
1542	// unroll 0
1543	fmla	v0.2d, v16.2d, v24.2d[0]
1544	fmla	v1.2d, v17.2d, v24.2d[0]
1545
1546	// unroll 1
1547	fmla	v0.2d, v18.2d, v26.2d[0]
1548	fmla	v1.2d, v19.2d, v26.2d[0]
1549	sub		w8, w8, #4
1550
1551	// unroll 2
1552	fmla	v0.2d, v20.2d, v28.2d[0]
1553	fmla	v1.2d, v21.2d, v28.2d[0]
1554
1555	// unroll 3
1556	fmla	v0.2d, v22.2d, v30.2d[0]
1557	fmla	v1.2d, v23.2d, v30.2d[0]
1558
1559	b		2f // return
1560
15614: // consider clean1-up loop
1562
1563	cmp		w8, #0
1564	ble		2f // return
1565
15663: // clean1-up loop
1567
1568	// unroll 0
1569	ld1		{v24.2d, v25.2d}, [x9], #32
1570	ldr		d28, [x10, #0]
1571	fmla	v0.2d, v24.2d, v28.2d[0]
1572	fmla	v1.2d, v25.2d, v28.2d[0]
1573	add		x10, x10, x11
1574	sub		w8, w8, #1
1575	cmp		w8, #0
1576
1577	bgt		3b
1578
15792: // return
1580
1581
1582
1583#endif // cortex a53
1584
1585
1586
1587#if MACRO_LEVEL>=2
1588	.endm
1589#else
1590	ret
1591
1592	FUN_END(inner_kernel_gemm_add_nt_4x1_lib4c)
1593#endif
1594
1595
1596
1597
1598
1599// subroutine
1600//
1601// input arguments:
1602// w8   <- k
1603// x9   <- A
1604// x10  <- B
1605// x11  <- ldb
1606//
1607// output arguments:
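//
// note: "nn" variant of the 4x4 kernel: B is traversed column-wise through the pointers
// x10, x12, x13 and x14 (one per column of the result block), and the result is again
// accumulated in v0-v7. A rough C sketch of the assumed operation (names are illustrative
// only, ldb taken as the column stride of B in elements):
//
//	for(kk=0; kk<k; kk++)
//		for(jj=0; jj<4; jj++)
//			for(ii=0; ii<4; ii++)
//				D[ii+4*jj] += A[ii+4*kk] * B[kk+jj*ldb]; // nn: D += A * B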
1608
1609#if MACRO_LEVEL>=2
1610	.macro INNER_KERNEL_GEMM_ADD_NN_4X4_LIB4C
1611#else
1612	.align	4
1613	FUN_START(inner_kernel_gemm_add_nn_4x4_lib4c)
1614#endif
1615
1616
1617
1618#if defined(TARGET_ARMV8A_ARM_CORTEX_A57)
1619
1620
1621
1622	// early return
1623	cmp		w8, #0
1624	ble		2f // return
1625
1626	add		x12, x10, x11
1627	add		x13, x12, x11
1628	add		x14, x13, x11
1629
1630	// prefetch
1631	prfm	PLDL1KEEP, [x10, #0]
1632	prfm	PLDL1KEEP, [x12, #0]
1633	prfm	PLDL1KEEP, [x13, #0]
1634	prfm	PLDL1KEEP, [x14, #0]
1635	prfm	PLDL1KEEP, [x9, #0]
1636	prfm	PLDL1KEEP, [x9, #64]
1637
1638	// preload
1639	ldp		q24, q25, [x10], #32
1640	ldp		q26, q27, [x12], #32
1641	ldp		q28, q29, [x13], #32
1642	ldp		q30, q31, [x14], #32
1643	ldp		q16, q17, [x9, #(0*8+0*32)]
1644
1645	cmp		w8, #4
1646	ble		0f // consider clean up loop
1647
1648	// prefetch
1649//	prfm	PLDL1KEEP, [x10, #0]
1650//	prfm	PLDL1KEEP, [x12, #0]
1651//	prfm	PLDL1KEEP, [x13, #0]
1652//	prfm	PLDL1KEEP, [x14, #0]
1653	prfm	PLDL1KEEP, [x9, #128]
1654	prfm	PLDL1KEEP, [x9, #192]
1655
1656	// zero tmp acc
1657	fmov	d8, xzr
1658	fmov    d9, d8
1659	fmov    d10, d8
1660	fmov    d11, d8
1661	fmov    d12, d8
1662	fmov    d13, d8
1663	fmov    d14, d8
1664	fmov    d15, d8
1665
1666//	add		x12, x11, #64
1667//	add		x12, x11, x11
1668//	add		x13, x12, #64
1669
1670	// main loop
16711:
1672
1673	// unroll 0
1674	ldp		q18, q19, [x9, #(0*8+1*32)]
1675	fmla	v0.2d, v16.2d, v24.2d[0]
1676	fmla	v1.2d, v17.2d, v24.2d[0]
1677//	prfm	PLDL1KEEP, [x9, #128]
1678	prfm	PLDL1KEEP, [x9, #256]
1679	fmla	v2.2d, v16.2d, v26.2d[0]
1680	fmla	v3.2d, v17.2d, v26.2d[0]
1681//	prfm	PLDL1KEEP, [x9, #192]
1682	prfm	PLDL1KEEP, [x9, #320]
1683	fmla	v4.2d, v16.2d, v28.2d[0]
1684	fmla	v5.2d, v17.2d, v28.2d[0]
1685	prfm	PLDL1KEEP, [x10, #32]
1686	fmla	v6.2d, v16.2d, v30.2d[0]
1687	fmla	v7.2d, v17.2d, v30.2d[0]
1688
1689	// unroll 1
1690	prfm	PLDL1KEEP, [x12, #32]
1691	fmla	v8.2d, v18.2d, v24.2d[1]
1692	fmla	v9.2d, v19.2d, v24.2d[1]
1693	ldp		q16, q17, [x9, #(0*8+2*32)]
1694	fmla	v10.2d, v18.2d, v26.2d[1]
1695	fmla	v11.2d, v19.2d, v26.2d[1]
1696	prfm	PLDL1KEEP, [x13, #32]
1697	fmla	v12.2d, v18.2d, v28.2d[1]
1698	fmla	v13.2d, v19.2d, v28.2d[1]
1699	prfm	PLDL1KEEP, [x14, #32]
1700	fmla	v14.2d, v18.2d, v30.2d[1]
1701	fmla	v15.2d, v19.2d, v30.2d[1]
1702
1703	// unroll 2
1704	ldp		q18, q19, [x9, #(0*8+3*32)]
1705	fmla	v0.2d, v16.2d, v25.2d[0]
1706	fmla	v1.2d, v17.2d, v25.2d[0]
1707	add		x9, x9, #128
1708	fmla	v2.2d, v16.2d, v27.2d[0]
1709	fmla	v3.2d, v17.2d, v27.2d[0]
1710	sub		w8, w8, #4
1711	fmla	v4.2d, v16.2d, v29.2d[0]
1712	fmla	v5.2d, v17.2d, v29.2d[0]
1713	cmp		w8, #4
1714	fmla	v6.2d, v16.2d, v31.2d[0]
1715	fmla	v7.2d, v17.2d, v31.2d[0]
1716
1717	// unroll 3
1718	ldp		q16, q17, [x9, #(0*8+0*32)]
1719	fmla	v8.2d, v18.2d, v25.2d[1]
1720	fmla	v9.2d, v19.2d, v25.2d[1]
1721	ldp		q24, q25, [x10], #32
1722	fmla	v10.2d, v18.2d, v27.2d[1]
1723	fmla	v11.2d, v19.2d, v27.2d[1]
1724	ldp		q26, q27, [x12], #32
1725	fmla	v12.2d, v18.2d, v29.2d[1]
1726	fmla	v13.2d, v19.2d, v29.2d[1]
1727	ldp		q28, q29, [x13], #32
1728	fmla	v14.2d, v18.2d, v31.2d[1]
1729	fmla	v15.2d, v19.2d, v31.2d[1]
1730	ldp		q30, q31, [x14], #32
1731
1732	bgt		1b
1733
1734
1735	// reduce
1736	fadd	v0.2d, v0.2d, v8.2d
1737	fadd	v1.2d, v1.2d, v9.2d
1738	fadd	v2.2d, v2.2d, v10.2d
1739	fadd	v3.2d, v3.2d, v11.2d
1740	fadd	v4.2d, v4.2d, v12.2d
1741	fadd	v5.2d, v5.2d, v13.2d
1742	fadd	v6.2d, v6.2d, v14.2d
1743	fadd	v7.2d, v7.2d, v15.2d
1744
1745//	sub		x9, x9, #32
1746//	sub		x10, x10, #32
1747
17480:
1749
1750	cmp		w8, #3
1751	ble		4f
1752
1753	// unroll 0
1754	ldp		q18, q19, [x9, #(0*8+1*32)]
1755	fmla	v0.2d, v16.2d, v24.2d[0]
1756	fmla	v1.2d, v17.2d, v24.2d[0]
1757//	prfm	PLDL1KEEP, [x9, #256]
1758	fmla	v2.2d, v16.2d, v26.2d[0]
1759	fmla	v3.2d, v17.2d, v26.2d[0]
1760//	prfm	PLDL1KEEP, [x9, #320]
1761	fmla	v4.2d, v16.2d, v28.2d[0]
1762	fmla	v5.2d, v17.2d, v28.2d[0]
1763//	prfm	PLDL1KEEP, [x10, #256]
1764	fmla	v6.2d, v16.2d, v30.2d[0]
1765	fmla	v7.2d, v17.2d, v30.2d[0]
1766
1767	// unroll 1
1768//	prfm	PLDL1KEEP, [x10, #320]
1769	fmla	v0.2d, v18.2d, v24.2d[1]
1770	fmla	v1.2d, v19.2d, v24.2d[1]
1771	ldp		q16, q17, [x9, #(0*8+2*32)]
1772	fmla	v2.2d, v18.2d, v26.2d[1]
1773	fmla	v3.2d, v19.2d, v26.2d[1]
1774//	add		x10, x10, x11
1775	fmla	v4.2d, v18.2d, v28.2d[1]
1776	fmla	v5.2d, v19.2d, v28.2d[1]
1777	sub		w8, w8, #4
1778	fmla	v6.2d, v18.2d, v30.2d[1]
1779	fmla	v7.2d, v19.2d, v30.2d[1]
1780
1781	// unroll 2
1782	ldp		q18, q19, [x9, #(0*8+3*32)]
1783	fmla	v0.2d, v16.2d, v25.2d[0]
1784	fmla	v1.2d, v17.2d, v25.2d[0]
1785	add		x9, x9, #128
1786	fmla	v2.2d, v16.2d, v27.2d[0]
1787	fmla	v3.2d, v17.2d, v27.2d[0]
1788	fmla	v4.2d, v16.2d, v29.2d[0]
1789	fmla	v5.2d, v17.2d, v29.2d[0]
1790	cmp		w8, #4
1791	fmla	v6.2d, v16.2d, v31.2d[0]
1792	fmla	v7.2d, v17.2d, v31.2d[0]
1793
1794	// unroll 3
1795//	ldp		q16, q17, [x9, #(0*8+0*32)]
1796	fmla	v0.2d, v18.2d, v25.2d[1]
1797	fmla	v1.2d, v19.2d, v25.2d[1]
1798//	ldp		q24, q25, [x10, #(0*8+0*32)]
1799	fmla	v2.2d, v18.2d, v27.2d[1]
1800	fmla	v3.2d, v19.2d, v27.2d[1]
1801//	ldp		q26, q27, [x10, #(0*8+1*32)]
1802	fmla	v4.2d, v18.2d, v29.2d[1]
1803	fmla	v5.2d, v19.2d, v29.2d[1]
1804//	ldp		q28, q29, [x10, #(0*8+2*32)]
1805	fmla	v6.2d, v18.2d, v31.2d[1]
1806	fmla	v7.2d, v19.2d, v31.2d[1]
1807//	ldp		q30, q31, [x10, #(0*8+3*32)]
1808
1809	b		2f // return
1810
18114: // consider clean1-up loop
1812
1813	cmp		w8, #0
1814	ble		2f // return
1815
1816	sub		x10, x10, #32
1817	sub		x12, x12, #32
1818	sub		x13, x13, #32
1819	sub		x14, x14, #32
1820
18213: // clean1-up loop
1822
1823	// unroll 0
1824	ldp		q24, q25, [x9, #0]
1825	ldr		d28, [x10], #8
1826	ldr		d29, [x12], #8
1827	ldr		d30, [x13], #8
1828	ldr		d31, [x14], #8
1829	fmla	v0.2d, v24.2d, v28.2d[0]
1830	fmla	v1.2d, v25.2d, v28.2d[0]
1831	add		x9, x9, #32
1832	fmla	v2.2d, v24.2d, v29.2d[0]
1833	fmla	v3.2d, v25.2d, v29.2d[0]
1834	sub		w8, w8, #1
1835	fmla	v4.2d, v24.2d, v30.2d[0]
1836	fmla	v5.2d, v25.2d, v30.2d[0]
1837	cmp		w8, #0
1838	fmla	v6.2d, v24.2d, v31.2d[0]
1839	fmla	v7.2d, v25.2d, v31.2d[0]
1840
1841	bgt		3b
1842
18432: // return
1844
1845
1846
1847#else // cortex a53
1848
1849
1850
1851	// early return
1852	cmp		w8, #0
1853	ble		2f // return
1854
1855	add		x12, x10, x11
1856	add		x13, x12, x11
1857	add		x14, x13, x11
1858
1859	// prefetch
1860
1861	// preload
1862
1863	cmp		w8, #4
1864	ble		0f // consider clean up loop
1865
1866	// prefetch
1867
1868	// zero tmp acc
1869
1870	// main loop
18711:
1872
1873	// load 0 & 1 & 2 & 3
1874	ldp		q24, q25, [x10], #32
1875	ldp		q26, q27, [x12], #32
1876	ldp		q28, q29, [x13], #32
1877	ldp		q30, q31, [x14], #32
1878	ldp		q16, q17, [x9], #32
1879	ldp		q18, q19, [x9], #32
1880	ldp		q20, q21, [x9], #32
1881	ldp		q22, q23, [x9], #32
1882
1883	// unroll 0
1884	fmla	v0.2d, v16.2d, v24.2d[0]
1885	fmla	v1.2d, v17.2d, v24.2d[0]
1886	fmla	v2.2d, v16.2d, v26.2d[0]
1887	fmla	v3.2d, v17.2d, v26.2d[0]
1888	fmla	v4.2d, v16.2d, v28.2d[0]
1889	fmla	v5.2d, v17.2d, v28.2d[0]
1890	fmla	v6.2d, v16.2d, v30.2d[0]
1891	fmla	v7.2d, v17.2d, v30.2d[0]
1892
1893	// unroll 1
1894	fmla	v0.2d, v18.2d, v24.2d[1]
1895	fmla	v1.2d, v19.2d, v24.2d[1]
1896	fmla	v2.2d, v18.2d, v26.2d[1]
1897	fmla	v3.2d, v19.2d, v26.2d[1]
1898	fmla	v4.2d, v18.2d, v28.2d[1]
1899	fmla	v5.2d, v19.2d, v28.2d[1]
1900	fmla	v6.2d, v18.2d, v30.2d[1]
1901	fmla	v7.2d, v19.2d, v30.2d[1]
1902	sub		w8, w8, #4
1903
1904	// unroll 2
1905	fmla	v0.2d, v20.2d, v25.2d[0]
1906	fmla	v1.2d, v21.2d, v25.2d[0]
1907	fmla	v2.2d, v20.2d, v27.2d[0]
1908	fmla	v3.2d, v21.2d, v27.2d[0]
1909	fmla	v4.2d, v20.2d, v29.2d[0]
1910	fmla	v5.2d, v21.2d, v29.2d[0]
1911	fmla	v6.2d, v20.2d, v31.2d[0]
1912	fmla	v7.2d, v21.2d, v31.2d[0]
1913	cmp		w8, #4
1914
1915	// unroll 3
1916	fmla	v0.2d, v22.2d, v25.2d[1]
1917	fmla	v1.2d, v23.2d, v25.2d[1]
1918	fmla	v2.2d, v22.2d, v27.2d[1]
1919	fmla	v3.2d, v23.2d, v27.2d[1]
1920	fmla	v4.2d, v22.2d, v29.2d[1]
1921	fmla	v5.2d, v23.2d, v29.2d[1]
1922	fmla	v6.2d, v22.2d, v31.2d[1]
1923	fmla	v7.2d, v23.2d, v31.2d[1]
1924
1925	bgt		1b
1926
1927
1928	// reduce
1929
19300:
1931
1932	cmp		w8, #3
1933	ble		4f
1934
1935	// load 0 & 1 & 2 & 3
1936	ldp		q24, q25, [x10], #32
1937	ldp		q26, q27, [x12], #32
1938	ldp		q28, q29, [x13], #32
1939	ldp		q30, q31, [x14], #32
1940	ldp		q16, q17, [x9], #32
1941	ldp		q18, q19, [x9], #32
1942	ldp		q20, q21, [x9], #32
1943	ldp		q22, q23, [x9], #32
1944
1945	// unroll 0
1946	fmla	v0.2d, v16.2d, v24.2d[0]
1947	fmla	v1.2d, v17.2d, v24.2d[0]
1948	fmla	v2.2d, v16.2d, v26.2d[0]
1949	fmla	v3.2d, v17.2d, v26.2d[0]
1950	fmla	v4.2d, v16.2d, v28.2d[0]
1951	fmla	v5.2d, v17.2d, v28.2d[0]
1952	fmla	v6.2d, v16.2d, v30.2d[0]
1953	fmla	v7.2d, v17.2d, v30.2d[0]
1954
1955	// unroll 1
1956	fmla	v0.2d, v18.2d, v24.2d[1]
1957	fmla	v1.2d, v19.2d, v24.2d[1]
1958	fmla	v2.2d, v18.2d, v26.2d[1]
1959	fmla	v3.2d, v19.2d, v26.2d[1]
1960	fmla	v4.2d, v18.2d, v28.2d[1]
1961	fmla	v5.2d, v19.2d, v28.2d[1]
1962	fmla	v6.2d, v18.2d, v30.2d[1]
1963	fmla	v7.2d, v19.2d, v30.2d[1]
1964	sub		w8, w8, #4
1965
1966	// unroll 2
1967	fmla	v0.2d, v20.2d, v25.2d[0]
1968	fmla	v1.2d, v21.2d, v25.2d[0]
1969	fmla	v2.2d, v20.2d, v27.2d[0]
1970	fmla	v3.2d, v21.2d, v27.2d[0]
1971	fmla	v4.2d, v20.2d, v29.2d[0]
1972	fmla	v5.2d, v21.2d, v29.2d[0]
1973	fmla	v6.2d, v20.2d, v31.2d[0]
1974	fmla	v7.2d, v21.2d, v31.2d[0]
1975//	cmp		w8, #4
1976
1977	// unroll 3
1978	fmla	v0.2d, v22.2d, v25.2d[1]
1979	fmla	v1.2d, v23.2d, v25.2d[1]
1980	fmla	v2.2d, v22.2d, v27.2d[1]
1981	fmla	v3.2d, v23.2d, v27.2d[1]
1982	fmla	v4.2d, v22.2d, v29.2d[1]
1983	fmla	v5.2d, v23.2d, v29.2d[1]
1984	fmla	v6.2d, v22.2d, v31.2d[1]
1985	fmla	v7.2d, v23.2d, v31.2d[1]
1986
1987	b		2f // return
1988
19894: // consider clean1-up loop
1990
1991	cmp		w8, #0
1992	ble		2f // return
1993
19943: // clean1-up loop
1995
1996	// unroll 0
1997	ldp		q24, q25, [x9, #0]
1998	ldr		d28, [x10], #8
1999	ldr		d29, [x12], #8
2000	ldr		d30, [x13], #8
2001	ldr		d31, [x14], #8
2002	fmla	v0.2d, v24.2d, v28.2d[0]
2003	fmla	v1.2d, v25.2d, v28.2d[0]
2004	add		x9, x9, #32
2005	fmla	v2.2d, v24.2d, v29.2d[0]
2006	fmla	v3.2d, v25.2d, v29.2d[0]
2007	sub		w8, w8, #1
2008	fmla	v4.2d, v24.2d, v30.2d[0]
2009	fmla	v5.2d, v25.2d, v30.2d[0]
2010	cmp		w8, #0
2011	fmla	v6.2d, v24.2d, v31.2d[0]
2012	fmla	v7.2d, v25.2d, v31.2d[0]
2013
2014	bgt		3b
2015
20162: // return
2017
2018
2019
2020#endif
2021
2022
2023
2024#if MACRO_LEVEL>=2
2025	.endm
2026#else
2027	ret
2028
2029	FUN_END(inner_kernel_gemm_add_nn_4x4_lib4c)
2030#endif
2031
2032
2033
2034
2035
2036// subroutine
2037//
2038// input arguments:
2039// w8   <- k
2040// x9   <- A
2041// x10  <- B
2042// x11  <- ldb
2043//
2044// output arguments:
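//
// note: same operation as inner_kernel_gemm_add_nn_4x4_lib4c above, restricted to the
// first 3 columns of the result block (accumulated in v0-v5); only three B column
// pointers (x10, x12, x13) are used.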
2045
2046#if MACRO_LEVEL>=2
2047	.macro INNER_KERNEL_GEMM_ADD_NN_4X3_LIB4C
2048#else
2049	.align	4
2050	FUN_START(inner_kernel_gemm_add_nn_4x3_lib4c)
2051#endif
2052
2053
2054
2055#if defined(TARGET_ARMV8A_ARM_CORTEX_A57)
2056
2057
2058
2059	// early return
2060	cmp		w8, #0
2061	ble		2f // return
2062
2063	add		x12, x10, x11
2064	add		x13, x12, x11
2065
2066	// prefetch
2067	prfm	PLDL1KEEP, [x10, #0]
2068	prfm	PLDL1KEEP, [x12, #0]
2069	prfm	PLDL1KEEP, [x13, #0]
2070	prfm	PLDL1KEEP, [x9, #0]
2071	prfm	PLDL1KEEP, [x9, #64]
2072
2073	// preload
2074	ldp		q24, q25, [x10], #32
2075	ldp		q26, q27, [x12], #32
2076	ldp		q28, q29, [x13], #32
2077	ldp		q16, q17, [x9, #(0*8+0*32)]
2078
2079	cmp		w8, #4
2080	ble		0f // consider clean up loop
2081
2082	// prefetch
2083//	prfm	PLDL1KEEP, [x10, #0]
2084//	prfm	PLDL1KEEP, [x12, #0]
2085//	prfm	PLDL1KEEP, [x13, #0]
2086//	prfm	PLDL1KEEP, [x14, #0]
2087	prfm	PLDL1KEEP, [x9, #128]
2088	prfm	PLDL1KEEP, [x9, #192]
2089
2090	// zero tmp acc
2091	fmov	d8, xzr
2092	fmov    d9, d8
2093	fmov    d10, d8
2094	fmov    d11, d8
2095	fmov    d12, d8
2096	fmov    d13, d8
2097
2098	// main loop
20991:
2100
2101	// unroll 0
2102	ldp		q18, q19, [x9, #(0*8+1*32)]
2103	fmla	v0.2d, v16.2d, v24.2d[0]
2104	fmla	v1.2d, v17.2d, v24.2d[0]
2105//	prfm	PLDL1KEEP, [x9, #128]
2106	prfm	PLDL1KEEP, [x9, #256]
2107	fmla	v2.2d, v16.2d, v26.2d[0]
2108	fmla	v3.2d, v17.2d, v26.2d[0]
2109//	prfm	PLDL1KEEP, [x9, #192]
2110	prfm	PLDL1KEEP, [x9, #320]
2111	fmla	v4.2d, v16.2d, v28.2d[0]
2112	fmla	v5.2d, v17.2d, v28.2d[0]
2113	prfm	PLDL1KEEP, [x10, #32]
2114
2115	// unroll 1
2116	prfm	PLDL1KEEP, [x12, #32]
2117	fmla	v8.2d, v18.2d, v24.2d[1]
2118	fmla	v9.2d, v19.2d, v24.2d[1]
2119	ldp		q16, q17, [x9, #(0*8+2*32)]
2120	fmla	v10.2d, v18.2d, v26.2d[1]
2121	fmla	v11.2d, v19.2d, v26.2d[1]
2122	prfm	PLDL1KEEP, [x13, #32]
2123	fmla	v12.2d, v18.2d, v28.2d[1]
2124	fmla	v13.2d, v19.2d, v28.2d[1]
2125//	prfm	PLDL1KEEP, [x14, #32]
2126
2127	// unroll 2
2128	ldp		q18, q19, [x9, #(0*8+3*32)]
2129	fmla	v0.2d, v16.2d, v25.2d[0]
2130	fmla	v1.2d, v17.2d, v25.2d[0]
2131	add		x9, x9, #128
2132	fmla	v2.2d, v16.2d, v27.2d[0]
2133	fmla	v3.2d, v17.2d, v27.2d[0]
2134	sub		w8, w8, #4
2135	fmla	v4.2d, v16.2d, v29.2d[0]
2136	fmla	v5.2d, v17.2d, v29.2d[0]
2137	cmp		w8, #4
2138
2139	// unroll 3
2140	ldp		q16, q17, [x9, #(0*8+0*32)]
2141	fmla	v8.2d, v18.2d, v25.2d[1]
2142	fmla	v9.2d, v19.2d, v25.2d[1]
2143	ldp		q24, q25, [x10], #32
2144	fmla	v10.2d, v18.2d, v27.2d[1]
2145	fmla	v11.2d, v19.2d, v27.2d[1]
2146	ldp		q26, q27, [x12], #32
2147	fmla	v12.2d, v18.2d, v29.2d[1]
2148	fmla	v13.2d, v19.2d, v29.2d[1]
2149	ldp		q28, q29, [x13], #32
2150
2151	bgt		1b
2152
2153
2154	// reduce
2155	fadd	v0.2d, v0.2d, v8.2d
2156	fadd	v1.2d, v1.2d, v9.2d
2157	fadd	v2.2d, v2.2d, v10.2d
2158	fadd	v3.2d, v3.2d, v11.2d
2159	fadd	v4.2d, v4.2d, v12.2d
2160	fadd	v5.2d, v5.2d, v13.2d
2161
21620:
2163
2164	cmp		w8, #3
2165	ble		4f
2166
2167	// unroll 0
2168	ldp		q18, q19, [x9, #(0*8+1*32)]
2169	fmla	v0.2d, v16.2d, v24.2d[0]
2170	fmla	v1.2d, v17.2d, v24.2d[0]
2171//	prfm	PLDL1KEEP, [x9, #256]
2172	fmla	v2.2d, v16.2d, v26.2d[0]
2173	fmla	v3.2d, v17.2d, v26.2d[0]
2174//	prfm	PLDL1KEEP, [x9, #320]
2175	fmla	v4.2d, v16.2d, v28.2d[0]
2176	fmla	v5.2d, v17.2d, v28.2d[0]
2177//	prfm	PLDL1KEEP, [x10, #256]
2178
2179	// unroll 1
2180//	prfm	PLDL1KEEP, [x10, #320]
2181	fmla	v0.2d, v18.2d, v24.2d[1]
2182	fmla	v1.2d, v19.2d, v24.2d[1]
2183	ldp		q16, q17, [x9, #(0*8+2*32)]
2184	fmla	v2.2d, v18.2d, v26.2d[1]
2185	fmla	v3.2d, v19.2d, v26.2d[1]
2186//	add		x10, x10, x11
2187	fmla	v4.2d, v18.2d, v28.2d[1]
2188	fmla	v5.2d, v19.2d, v28.2d[1]
2189	sub		w8, w8, #4
2190
2191	// unroll 2
2192	ldp		q18, q19, [x9, #(0*8+3*32)]
2193	fmla	v0.2d, v16.2d, v25.2d[0]
2194	fmla	v1.2d, v17.2d, v25.2d[0]
2195	add		x9, x9, #128
2196	fmla	v2.2d, v16.2d, v27.2d[0]
2197	fmla	v3.2d, v17.2d, v27.2d[0]
2198	fmla	v4.2d, v16.2d, v29.2d[0]
2199	fmla	v5.2d, v17.2d, v29.2d[0]
2200	cmp		w8, #4
2201
2202	// unroll 3
2203//	ldp		q16, q17, [x9, #(0*8+0*32)]
2204	fmla	v0.2d, v18.2d, v25.2d[1]
2205	fmla	v1.2d, v19.2d, v25.2d[1]
2206//	ldp		q24, q25, [x10, #(0*8+0*32)]
2207	fmla	v2.2d, v18.2d, v27.2d[1]
2208	fmla	v3.2d, v19.2d, v27.2d[1]
2209//	ldp		q26, q27, [x10, #(0*8+1*32)]
2210	fmla	v4.2d, v18.2d, v29.2d[1]
2211	fmla	v5.2d, v19.2d, v29.2d[1]
2212//	ldp		q28, q29, [x10, #(0*8+2*32)]
2213
2214	b		2f // return
2215
22164: // consider clean1-up loop
2217
2218	cmp		w8, #0
2219	ble		2f // return
2220
2221	sub		x10, x10, #32
2222	sub		x12, x12, #32
2223	sub		x13, x13, #32
2224
22253: // clean1-up loop
2226
2227	// unroll 0
2228	ldp		q24, q25, [x9, #0]
2229	ldr		d28, [x10], #8
2230	ldr		d29, [x12], #8
2231	ldr		d30, [x13], #8
2232//	ldr		d31, [x14], #8
2233	fmla	v0.2d, v24.2d, v28.2d[0]
2234	fmla	v1.2d, v25.2d, v28.2d[0]
2235	add		x9, x9, #32
2236	fmla	v2.2d, v24.2d, v29.2d[0]
2237	fmla	v3.2d, v25.2d, v29.2d[0]
2238	sub		w8, w8, #1
2239	fmla	v4.2d, v24.2d, v30.2d[0]
2240	fmla	v5.2d, v25.2d, v30.2d[0]
2241	cmp		w8, #0
2242
2243	bgt		3b
2244
22452: // return
2246
2247
2248
2249#else // cortex a53
2250
2251
2252
2253	// early return
2254	cmp		w8, #0
2255	ble		2f // return
2256
2257	add		x12, x10, x11
2258	add		x13, x12, x11
2259
2260	// prefetch
2261
2262	// preload
2263
2264	cmp		w8, #4
2265	ble		0f // consider clean up loop
2266
2267	// prefetch
2268
2269	// zero tmp acc
2270
2271	// main loop
22721:
2273
2274	// load 0 & 1 & 2 & 3
2275	ldp		q24, q25, [x10], #32
2276	ldp		q26, q27, [x12], #32
2277	ldp		q28, q29, [x13], #32
2278	ldp		q16, q17, [x9], #32
2279	ldp		q18, q19, [x9], #32
2280	ldp		q20, q21, [x9], #32
2281	ldp		q22, q23, [x9], #32
2282
2283	// unroll 0
2284	fmla	v0.2d, v16.2d, v24.2d[0]
2285	fmla	v1.2d, v17.2d, v24.2d[0]
2286	fmla	v2.2d, v16.2d, v26.2d[0]
2287	fmla	v3.2d, v17.2d, v26.2d[0]
2288	fmla	v4.2d, v16.2d, v28.2d[0]
2289	fmla	v5.2d, v17.2d, v28.2d[0]
2290
2291	// unroll 1
2292	fmla	v0.2d, v18.2d, v24.2d[1]
2293	fmla	v1.2d, v19.2d, v24.2d[1]
2294	fmla	v2.2d, v18.2d, v26.2d[1]
2295	fmla	v3.2d, v19.2d, v26.2d[1]
2296	fmla	v4.2d, v18.2d, v28.2d[1]
2297	fmla	v5.2d, v19.2d, v28.2d[1]
2298	sub		w8, w8, #4
2299
2300	// unroll 2
2301	fmla	v0.2d, v20.2d, v25.2d[0]
2302	fmla	v1.2d, v21.2d, v25.2d[0]
2303	fmla	v2.2d, v20.2d, v27.2d[0]
2304	fmla	v3.2d, v21.2d, v27.2d[0]
2305	fmla	v4.2d, v20.2d, v29.2d[0]
2306	fmla	v5.2d, v21.2d, v29.2d[0]
2307	cmp		w8, #4
2308
2309	// unroll 3
2310	fmla	v0.2d, v22.2d, v25.2d[1]
2311	fmla	v1.2d, v23.2d, v25.2d[1]
2312	fmla	v2.2d, v22.2d, v27.2d[1]
2313	fmla	v3.2d, v23.2d, v27.2d[1]
2314	fmla	v4.2d, v22.2d, v29.2d[1]
2315	fmla	v5.2d, v23.2d, v29.2d[1]
2316
2317	bgt		1b
2318
2319
2320	// reduce
2321
23220:
2323
2324	cmp		w8, #3
2325	ble		4f
2326
2327	// load 0 & 1 & 2 & 3
2328	ldp		q24, q25, [x10], #32
2329	ldp		q26, q27, [x12], #32
2330	ldp		q28, q29, [x13], #32
2331	ldp		q16, q17, [x9], #32
2332	ldp		q18, q19, [x9], #32
2333	ldp		q20, q21, [x9], #32
2334	ldp		q22, q23, [x9], #32
2335
2336	// unroll 0
2337	fmla	v0.2d, v16.2d, v24.2d[0]
2338	fmla	v1.2d, v17.2d, v24.2d[0]
2339	fmla	v2.2d, v16.2d, v26.2d[0]
2340	fmla	v3.2d, v17.2d, v26.2d[0]
2341	fmla	v4.2d, v16.2d, v28.2d[0]
2342	fmla	v5.2d, v17.2d, v28.2d[0]
2343
2344	// unroll 1
2345	fmla	v0.2d, v18.2d, v24.2d[1]
2346	fmla	v1.2d, v19.2d, v24.2d[1]
2347	fmla	v2.2d, v18.2d, v26.2d[1]
2348	fmla	v3.2d, v19.2d, v26.2d[1]
2349	fmla	v4.2d, v18.2d, v28.2d[1]
2350	fmla	v5.2d, v19.2d, v28.2d[1]
2351	sub		w8, w8, #4
2352
2353	// unroll 2
2354	fmla	v0.2d, v20.2d, v25.2d[0]
2355	fmla	v1.2d, v21.2d, v25.2d[0]
2356	fmla	v2.2d, v20.2d, v27.2d[0]
2357	fmla	v3.2d, v21.2d, v27.2d[0]
2358	fmla	v4.2d, v20.2d, v29.2d[0]
2359	fmla	v5.2d, v21.2d, v29.2d[0]
2360//	cmp		w8, #4
2361
2362	// unroll 3
2363	fmla	v0.2d, v22.2d, v25.2d[1]
2364	fmla	v1.2d, v23.2d, v25.2d[1]
2365	fmla	v2.2d, v22.2d, v27.2d[1]
2366	fmla	v3.2d, v23.2d, v27.2d[1]
2367	fmla	v4.2d, v22.2d, v29.2d[1]
2368	fmla	v5.2d, v23.2d, v29.2d[1]
2369
2370	b		2f // return
2371
23724: // consider clean1-up loop
2373
2374	cmp		w8, #0
2375	ble		2f // return
2376
23773: // clean1-up loop
2378
2379	// unroll 0
2380	ldp		q24, q25, [x9, #0]
2381	ldr		d28, [x10], #8
2382	ldr		d29, [x12], #8
2383	ldr		d30, [x13], #8
2384	fmla	v0.2d, v24.2d, v28.2d[0]
2385	fmla	v1.2d, v25.2d, v28.2d[0]
2386	add		x9, x9, #32
2387	fmla	v2.2d, v24.2d, v29.2d[0]
2388	fmla	v3.2d, v25.2d, v29.2d[0]
2389	sub		w8, w8, #1
2390	fmla	v4.2d, v24.2d, v30.2d[0]
2391	fmla	v5.2d, v25.2d, v30.2d[0]
2392	cmp		w8, #0
2393
2394	bgt		3b
2395
23962: // return
2397
2398
2399
2400#endif
2401
2402
2403
2404#if MACRO_LEVEL>=2
2405	.endm
2406#else
2407	ret
2408
2409	FUN_END(inner_kernel_gemm_add_nn_4x3_lib4c)
2410#endif
2411
2412
2413
2414
2415
2416// subroutine
2417//
2418// input arguments:
2419// w8   <- k
2420// x9   <- A
2421// x10  <- B
2422// x11  <- ldb
2423//
2424// output arguments:
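// v0..v3 <- accumulator: column j of the 4x2 result in the pair (v(2*j), v(2*j+1)),
//           rows 0-1 in the even register, rows 2-3 in the odd one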
2425
2426#if MACRO_LEVEL>=2
2427	.macro INNER_KERNEL_GEMM_ADD_NN_4X2_LIB4C
2428#else
2429	.align	4
2430	FUN_START(inner_kernel_gemm_add_nn_4x2_lib4c)
2431#endif
2432
2433
2434
2435#if defined(TARGET_ARMV8A_ARM_CORTEX_A57)
2436
2437
2438
2439	// early return
2440	cmp		w8, #0
2441	ble		2f // return
2442
2443	add		x12, x10, x11
2444
2445	// prefetch
2446	prfm	PLDL1KEEP, [x10, #0]
2447	prfm	PLDL1KEEP, [x12, #0]
2448	prfm	PLDL1KEEP, [x9, #0]
2449	prfm	PLDL1KEEP, [x9, #64]
2450
2451	// preload
2452	ldp		q24, q25, [x10], #32
2453	ldp		q26, q27, [x12], #32
2454	ldp		q16, q17, [x9, #(0*8+0*32)]
2455
2456	cmp		w8, #4
2457	ble		0f // consider clean up loop
2458
2459	// prefetch
2460//	prfm	PLDL1KEEP, [x10, #0]
2461//	prfm	PLDL1KEEP, [x12, #0]
2462//	prfm	PLDL1KEEP, [x13, #0]
2463//	prfm	PLDL1KEEP, [x14, #0]
2464	prfm	PLDL1KEEP, [x9, #128]
2465	prfm	PLDL1KEEP, [x9, #192]
2466
2467	// zero tmp acc
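	// (v8..v11 hold a second set of accumulators for the odd unrolls; they are folded
	// back into v0..v3 in the reduce step, presumably to break the fmla dependency chain)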
2468	fmov	d8, xzr
2469	fmov    d9, d8
2470	fmov    d10, d8
2471	fmov    d11, d8
2472
2473	// main loop
24741:
2475
2476	// unroll 0
2477	ldp		q18, q19, [x9, #(0*8+1*32)]
2478	fmla	v0.2d, v16.2d, v24.2d[0]
2479	fmla	v1.2d, v17.2d, v24.2d[0]
2480//	prfm	PLDL1KEEP, [x9, #128]
2481	prfm	PLDL1KEEP, [x9, #256]
2482	fmla	v2.2d, v16.2d, v26.2d[0]
2483	fmla	v3.2d, v17.2d, v26.2d[0]
2484//	prfm	PLDL1KEEP, [x9, #192]
2485	prfm	PLDL1KEEP, [x9, #320]
2486	prfm	PLDL1KEEP, [x10, #32]
2487
2488	// unroll 1
2489	prfm	PLDL1KEEP, [x12, #32]
2490	fmla	v8.2d, v18.2d, v24.2d[1]
2491	fmla	v9.2d, v19.2d, v24.2d[1]
2492	ldp		q16, q17, [x9, #(0*8+2*32)]
2493	fmla	v10.2d, v18.2d, v26.2d[1]
2494	fmla	v11.2d, v19.2d, v26.2d[1]
2495
2496	// unroll 2
2497	ldp		q18, q19, [x9, #(0*8+3*32)]
2498	fmla	v0.2d, v16.2d, v25.2d[0]
2499	fmla	v1.2d, v17.2d, v25.2d[0]
2500	add		x9, x9, #128
2501	fmla	v2.2d, v16.2d, v27.2d[0]
2502	fmla	v3.2d, v17.2d, v27.2d[0]
2503	sub		w8, w8, #4
2504	cmp		w8, #4
2505
2506	// unroll 3
2507	ldp		q16, q17, [x9, #(0*8+0*32)]
2508	fmla	v8.2d, v18.2d, v25.2d[1]
2509	fmla	v9.2d, v19.2d, v25.2d[1]
2510	ldp		q24, q25, [x10], #32
2511	fmla	v10.2d, v18.2d, v27.2d[1]
2512	fmla	v11.2d, v19.2d, v27.2d[1]
2513	ldp		q26, q27, [x12], #32
2514
2515	bgt		1b
2516
2517
2518	// reduce
2519	fadd	v0.2d, v0.2d, v8.2d
2520	fadd	v1.2d, v1.2d, v9.2d
2521	fadd	v2.2d, v2.2d, v10.2d
2522	fadd	v3.2d, v3.2d, v11.2d
2523
25240:
2525
2526	cmp		w8, #3
2527	ble		4f
2528
2529	// unroll 0
2530	ldp		q18, q19, [x9, #(0*8+1*32)]
2531	fmla	v0.2d, v16.2d, v24.2d[0]
2532	fmla	v1.2d, v17.2d, v24.2d[0]
2533//	prfm	PLDL1KEEP, [x9, #256]
2534	fmla	v2.2d, v16.2d, v26.2d[0]
2535	fmla	v3.2d, v17.2d, v26.2d[0]
2536//	prfm	PLDL1KEEP, [x9, #320]
2537//	prfm	PLDL1KEEP, [x10, #256]
2538
2539	// unroll 1
2540//	prfm	PLDL1KEEP, [x10, #320]
2541	fmla	v0.2d, v18.2d, v24.2d[1]
2542	fmla	v1.2d, v19.2d, v24.2d[1]
2543	ldp		q16, q17, [x9, #(0*8+2*32)]
2544	fmla	v2.2d, v18.2d, v26.2d[1]
2545	fmla	v3.2d, v19.2d, v26.2d[1]
2546//	add		x10, x10, x11
2547	sub		w8, w8, #4
2548
2549	// unroll 2
2550	ldp		q18, q19, [x9, #(0*8+3*32)]
2551	fmla	v0.2d, v16.2d, v25.2d[0]
2552	fmla	v1.2d, v17.2d, v25.2d[0]
2553	add		x9, x9, #128
2554	fmla	v2.2d, v16.2d, v27.2d[0]
2555	fmla	v3.2d, v17.2d, v27.2d[0]
2556	cmp		w8, #4
2557
2558	// unroll 3
2559//	ldp		q16, q17, [x9, #(0*8+0*32)]
2560	fmla	v0.2d, v18.2d, v25.2d[1]
2561	fmla	v1.2d, v19.2d, v25.2d[1]
2562//	ldp		q24, q25, [x10, #(0*8+0*32)]
2563	fmla	v2.2d, v18.2d, v27.2d[1]
2564	fmla	v3.2d, v19.2d, v27.2d[1]
2565//	ldp		q26, q27, [x10, #(0*8+1*32)]
2566
2567	b		2f // return
2568
25694: // consider clean1-up loop
2570
2571	cmp		w8, #0
2572	ble		2f // return
2573
2574	sub		x10, x10, #32
2575	sub		x12, x12, #32
2576
25773: // clean1-up loop
2578
2579	// unroll 0
2580	ldp		q24, q25, [x9, #0]
2581	ldr		d28, [x10], #8
2582	ldr		d29, [x12], #8
2583	fmla	v0.2d, v24.2d, v28.2d[0]
2584	fmla	v1.2d, v25.2d, v28.2d[0]
2585	add		x9, x9, #32
2586	fmla	v2.2d, v24.2d, v29.2d[0]
2587	fmla	v3.2d, v25.2d, v29.2d[0]
2588	sub		w8, w8, #1
2589	cmp		w8, #0
2590
2591	bgt		3b
2592
25932: // return
2594
2595
2596
2597#else // cortex a53
2598
2599
2600
2601	// early return
2602	cmp		w8, #0
2603	ble		2f // return
2604
2605	add		x12, x10, x11
2606
2607	// prefetch
2608
2609	// preload
2610
2611	cmp		w8, #4
2612	ble		0f // consider clean up loop
2613
2614	// prefetch
2615
2616	// zero tmp acc
2617
2618	// main loop
26191:
2620
2621	// load 0 & 1 & 2 & 3
2622	ldp		q24, q25, [x10], #32
2623	ldp		q26, q27, [x12], #32
2624	ldp		q16, q17, [x9], #32
2625	ldp		q18, q19, [x9], #32
2626	ldp		q20, q21, [x9], #32
2627	ldp		q22, q23, [x9], #32
2628
2629	// unroll 0
2630	fmla	v0.2d, v16.2d, v24.2d[0]
2631	fmla	v1.2d, v17.2d, v24.2d[0]
2632	fmla	v2.2d, v16.2d, v26.2d[0]
2633	fmla	v3.2d, v17.2d, v26.2d[0]
2634
2635	// unroll 1
2636	fmla	v0.2d, v18.2d, v24.2d[1]
2637	fmla	v1.2d, v19.2d, v24.2d[1]
2638	fmla	v2.2d, v18.2d, v26.2d[1]
2639	fmla	v3.2d, v19.2d, v26.2d[1]
2640	sub		w8, w8, #4
2641
2642	// unroll 2
2643	fmla	v0.2d, v20.2d, v25.2d[0]
2644	fmla	v1.2d, v21.2d, v25.2d[0]
2645	fmla	v2.2d, v20.2d, v27.2d[0]
2646	fmla	v3.2d, v21.2d, v27.2d[0]
2647	cmp		w8, #4
2648
2649	// unroll 3
2650	fmla	v0.2d, v22.2d, v25.2d[1]
2651	fmla	v1.2d, v23.2d, v25.2d[1]
2652	fmla	v2.2d, v22.2d, v27.2d[1]
2653	fmla	v3.2d, v23.2d, v27.2d[1]
2654
2655	bgt		1b
2656
2657
2658	// reduce
2659
26600:
2661
2662	cmp		w8, #3
2663	ble		4f
2664
2665	// load 0 & 1 & 2 & 3
2666	ldp		q24, q25, [x10], #32
2667	ldp		q26, q27, [x12], #32
2668	ldp		q16, q17, [x9], #32
2669	ldp		q18, q19, [x9], #32
2670	ldp		q20, q21, [x9], #32
2671	ldp		q22, q23, [x9], #32
2672
2673	// unroll 0
2674	fmla	v0.2d, v16.2d, v24.2d[0]
2675	fmla	v1.2d, v17.2d, v24.2d[0]
2676	fmla	v2.2d, v16.2d, v26.2d[0]
2677	fmla	v3.2d, v17.2d, v26.2d[0]
2678
2679	// unroll 1
2680	fmla	v0.2d, v18.2d, v24.2d[1]
2681	fmla	v1.2d, v19.2d, v24.2d[1]
2682	fmla	v2.2d, v18.2d, v26.2d[1]
2683	fmla	v3.2d, v19.2d, v26.2d[1]
2684	sub		w8, w8, #4
2685
2686	// unroll 2
2687	fmla	v0.2d, v20.2d, v25.2d[0]
2688	fmla	v1.2d, v21.2d, v25.2d[0]
2689	fmla	v2.2d, v20.2d, v27.2d[0]
2690	fmla	v3.2d, v21.2d, v27.2d[0]
2691//	cmp		w8, #4
2692
2693	// unroll 3
2694	fmla	v0.2d, v22.2d, v25.2d[1]
2695	fmla	v1.2d, v23.2d, v25.2d[1]
2696	fmla	v2.2d, v22.2d, v27.2d[1]
2697	fmla	v3.2d, v23.2d, v27.2d[1]
2698
2699	b		2f // return
2700
27014: // consider clean1-up loop
2702
2703	cmp		w8, #0
2704	ble		2f // return
2705
27063: // clean1-up loop
2707
2708	// unroll 0
2709	ldp		q24, q25, [x9, #0]
2710	ldr		d28, [x10], #8
2711	ldr		d29, [x12], #8
2712	fmla	v0.2d, v24.2d, v28.2d[0]
2713	fmla	v1.2d, v25.2d, v28.2d[0]
2714	add		x9, x9, #32
2715	fmla	v2.2d, v24.2d, v29.2d[0]
2716	fmla	v3.2d, v25.2d, v29.2d[0]
2717	sub		w8, w8, #1
2718	cmp		w8, #0
2719
2720	bgt		3b
2721
27222: // return
2723
2724
2725
2726#endif
2727
2728
2729
2730#if MACRO_LEVEL>=2
2731	.endm
2732#else
2733	ret
2734
2735	FUN_END(inner_kernel_gemm_add_nn_4x2_lib4c)
2736#endif
2737
2738
2739
2740
2741
2742// subroutine
2743//
2744// input arguments:
2745// w8   <- k
2746// x9   <- A
2747// x10  <- B
2748// x11  <- ldb
2749//
2750// output arguments:
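// v0..v1 <- accumulator: rows 0-1 of the 4x1 result in v0, rows 2-3 in v1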
2751
2752#if MACRO_LEVEL>=2
2753	.macro INNER_KERNEL_GEMM_ADD_NN_4X1_LIB4C
2754#else
2755	.align	4
2756	FUN_START(inner_kernel_gemm_add_nn_4x1_lib4c)
2757#endif
2758
2759
2760
2761#if defined(TARGET_ARMV8A_ARM_CORTEX_A57)
2762
2763
2764
2765	// early return
2766	cmp		w8, #0
2767	ble		2f // return
2768
2769	// prefetch
2770	prfm	PLDL1KEEP, [x10, #0]
2771	prfm	PLDL1KEEP, [x9, #0]
2772	prfm	PLDL1KEEP, [x9, #64]
2773
2774	// preload
2775	ldp		q24, q25, [x10], #32
2776	ldp		q16, q17, [x9, #(0*8+0*32)]
2777
2778	cmp		w8, #4
2779	ble		0f // consider clean up loop
2780
2781	// prefetch
2782	prfm	PLDL1KEEP, [x9, #128]
2783	prfm	PLDL1KEEP, [x9, #192]
2784
2785	// zero tmp acc
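	// (v8..v9 accumulate the odd unrolls and are added back to v0..v1 in the reduce
	// step, presumably to break the fmla dependency chain)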
2786	fmov	d8, xzr
2787	fmov    d9, d8
2788
2789	// main loop
27901:
2791
2792	// unroll 0
2793	ldp		q18, q19, [x9, #(0*8+1*32)]
2794	fmla	v0.2d, v16.2d, v24.2d[0]
2795	fmla	v1.2d, v17.2d, v24.2d[0]
2796//	prfm	PLDL1KEEP, [x9, #128]
2797	prfm	PLDL1KEEP, [x9, #256]
2798//	prfm	PLDL1KEEP, [x9, #192]
2799	prfm	PLDL1KEEP, [x9, #320]
2800	prfm	PLDL1KEEP, [x10, #32]
2801
2802	// unroll 1
2803	fmla	v8.2d, v18.2d, v24.2d[1]
2804	fmla	v9.2d, v19.2d, v24.2d[1]
2805	ldp		q16, q17, [x9, #(0*8+2*32)]
2806
2807	// unroll 2
2808	ldp		q18, q19, [x9, #(0*8+3*32)]
2809	fmla	v0.2d, v16.2d, v25.2d[0]
2810	fmla	v1.2d, v17.2d, v25.2d[0]
2811	add		x9, x9, #128
2812	sub		w8, w8, #4
2813	cmp		w8, #4
2814
2815	// unroll 3
2816	ldp		q16, q17, [x9, #(0*8+0*32)]
2817	fmla	v8.2d, v18.2d, v25.2d[1]
2818	fmla	v9.2d, v19.2d, v25.2d[1]
2819	ldp		q24, q25, [x10], #32
2820
2821	bgt		1b
2822
2823
2824	// reduce
2825	fadd	v0.2d, v0.2d, v8.2d
2826	fadd	v1.2d, v1.2d, v9.2d
2827
28280:
2829
2830	cmp		w8, #3
2831	ble		4f
2832
2833	// unroll 0
2834	ldp		q18, q19, [x9, #(0*8+1*32)]
2835	fmla	v0.2d, v16.2d, v24.2d[0]
2836	fmla	v1.2d, v17.2d, v24.2d[0]
2837//	prfm	PLDL1KEEP, [x9, #256]
2838//	prfm	PLDL1KEEP, [x9, #320]
2839//	prfm	PLDL1KEEP, [x10, #256]
2840
2841	// unroll 1
2842	fmla	v0.2d, v18.2d, v24.2d[1]
2843	fmla	v1.2d, v19.2d, v24.2d[1]
2844	ldp		q16, q17, [x9, #(0*8+2*32)]
2845//	add		x10, x10, x11
2846	sub		w8, w8, #4
2847
2848	// unroll 2
2849	ldp		q18, q19, [x9, #(0*8+3*32)]
2850	fmla	v0.2d, v16.2d, v25.2d[0]
2851	fmla	v1.2d, v17.2d, v25.2d[0]
2852	add		x9, x9, #128
2853	cmp		w8, #4
2854
2855	// unroll 3
2856//	ldp		q16, q17, [x9, #(0*8+0*32)]
2857	fmla	v0.2d, v18.2d, v25.2d[1]
2858	fmla	v1.2d, v19.2d, v25.2d[1]
2859//	ldp		q24, q25, [x10, #(0*8+0*32)]
2860
2861	b		2f // return
2862
28634: // consider clean1-up loop
2864
2865	cmp		w8, #0
2866	ble		2f // return
2867
2868	sub		x10, x10, #32
2869
28703: // clean1-up loop
2871
2872	// unroll 0
2873	ldp		q24, q25, [x9, #0]
2874	ldr		d28, [x10], #8
2875	fmla	v0.2d, v24.2d, v28.2d[0]
2876	fmla	v1.2d, v25.2d, v28.2d[0]
2877	add		x9, x9, #32
2878	sub		w8, w8, #1
2879	cmp		w8, #0
2880
2881	bgt		3b
2882
28832: // return
2884
2885
2886
2887#else // cortex a53
2888
2889
2890
2891	// early return
2892	cmp		w8, #0
2893	ble		2f // return
2894
2895	// prefetch
2896
2897	// preload
2898
2899	cmp		w8, #4
2900	ble		0f // consider clean up loop
2901
2902	// prefetch
2903
2904	// zero tmp acc
2905
2906	// main loop
29071:
2908
2909	// load 0 & 1 & 2 & 3
2910	ldp		q24, q25, [x10], #32
2911	ldp		q16, q17, [x9], #32
2912	ldp		q18, q19, [x9], #32
2913	ldp		q20, q21, [x9], #32
2914	ldp		q22, q23, [x9], #32
2915
2916	// unroll 0
2917	fmla	v0.2d, v16.2d, v24.2d[0]
2918	fmla	v1.2d, v17.2d, v24.2d[0]
2919
2920	// unroll 1
2921	fmla	v0.2d, v18.2d, v24.2d[1]
2922	fmla	v1.2d, v19.2d, v24.2d[1]
2923	sub		w8, w8, #4
2924
2925	// unroll 2
2926	fmla	v0.2d, v20.2d, v25.2d[0]
2927	fmla	v1.2d, v21.2d, v25.2d[0]
2928	cmp		w8, #4
2929
2930	// unroll 3
2931	fmla	v0.2d, v22.2d, v25.2d[1]
2932	fmla	v1.2d, v23.2d, v25.2d[1]
2933
2934	bgt		1b
2935
2936
2937	// reduce
2938
29390:
2940
2941	cmp		w8, #3
2942	ble		4f
2943
2944	// load 0 & 1 & 2 & 3
2945	ldp		q24, q25, [x10], #32
2946	ldp		q16, q17, [x9], #32
2947	ldp		q18, q19, [x9], #32
2948	ldp		q20, q21, [x9], #32
2949	ldp		q22, q23, [x9], #32
2950
2951	// unroll 0
2952	fmla	v0.2d, v16.2d, v24.2d[0]
2953	fmla	v1.2d, v17.2d, v24.2d[0]
2954
2955	// unroll 1
2956	fmla	v0.2d, v18.2d, v24.2d[1]
2957	fmla	v1.2d, v19.2d, v24.2d[1]
2958	sub		w8, w8, #4
2959
2960	// unroll 2
2961	fmla	v0.2d, v20.2d, v25.2d[0]
2962	fmla	v1.2d, v21.2d, v25.2d[0]
2963//	cmp		w8, #4
2964
2965	// unroll 3
2966	fmla	v0.2d, v22.2d, v25.2d[1]
2967	fmla	v1.2d, v23.2d, v25.2d[1]
2968
2969	b		2f // return
2970
29714: // consider clean1-up loop
2972
2973	cmp		w8, #0
2974	ble		2f // return
2975
29763: // clean1-up loop
2977
2978	// unroll 0
2979	ldp		q24, q25, [x9, #0]
2980	ldr		d28, [x10], #8
2981	fmla	v0.2d, v24.2d, v28.2d[0]
2982	fmla	v1.2d, v25.2d, v28.2d[0]
2983	add		x9, x9, #32
2984	sub		w8, w8, #1
2985	cmp		w8, #0
2986
2987	bgt		3b
2988
29892: // return
2990
2991
2992
2993#endif
2994
2995
2996
2997#if MACRO_LEVEL>=2
2998	.endm
2999#else
3000	ret
3001
3002	FUN_END(inner_kernel_gemm_add_nn_4x1_lib4c)
3003#endif
3004
3005
3006
3007
3008
3009// subroutine
3010//
3011// triangular substitution:
3012// side = left
3013// uplo = lower
3014// tran = not-transposed
3015// unit diagonal
3016//
3017// input arguments:
3018// x8   <- E
3019// x9   <- lde
3020//
3021// output arguments:
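// note: operates in place on the 4x4 accumulator in v0..v7 (column j in the pair
// (v(2*j), v(2*j+1))): forward substitution with the unit lower triangular E, i.e.
// each accumulator column is overwritten with inv(E) times itself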
3022
3023#if MACRO_LEVEL>=1
3024	.macro INNER_EDGE_TRSM_LLN_ONE_4X4_LIB
3025#else
3026	.align 4
3027	FUN_START(inner_edge_trsm_lln_one_4x4_lib)
3028#endif
3029
3030	ldp		q24, q25, [x8, #0] // E[0+4*0]
3031	add		x8, x8, x9
3032	ins		v24.d[0], xzr
3033	fmls	v0.2d, v24.2d, v0.2d[0]
3034	fmls	v1.2d, v25.2d, v0.2d[0]
3035	fmls	v2.2d, v24.2d, v2.2d[0]
3036	fmls	v3.2d, v25.2d, v2.2d[0]
3037	fmls	v4.2d, v24.2d, v4.2d[0]
3038	fmls	v5.2d, v25.2d, v4.2d[0]
3039	fmls	v6.2d, v24.2d, v6.2d[0]
3040	fmls	v7.2d, v25.2d, v6.2d[0]
3041
3042	ldr		q25, [x8, #16] // E[2+4*1]
3043	add		x8, x8, x9
3044	fmls	v1.2d, v25.2d, v0.2d[1]
3045	fmls	v3.2d, v25.2d, v2.2d[1]
3046	fmls	v5.2d, v25.2d, v4.2d[1]
3047	fmls	v7.2d, v25.2d, v6.2d[1]
3048
3049	ldr		q25, [x8, #16] // E[2+4*2]
3050//	add		x8, x8, x9
3051	ins		v25.d[0], xzr
3052	fmls	v1.2d, v25.2d, v1.2d[0]
3053	fmls	v3.2d, v25.2d, v3.2d[0]
3054	fmls	v5.2d, v25.2d, v5.2d[0]
3055	fmls	v7.2d, v25.2d, v7.2d[0]
3056
3057#if MACRO_LEVEL>=1
3058	.endm
3059#else
3060	ret
3061
3062	FUN_END(inner_edge_trsm_lln_one_4x4_lib)
3063#endif
3064
3065
3066
3067
3068
3069// subroutine
3070//
3071// triangular substitution:
3072// side = right
3073// uplo = lower
3074// tran = transposed
3075// requires explicit inverse of diagonal
3076//
3077// input arguments:
3078// x8   <- E
3079// x9   <- lde
3080// x10  <- inv_diag_E
3081//
3082// output arguments:
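// note: overwrites the 4x4 accumulator in v0..v7 with acc * inv(E^T), using the
// reciprocals stored in inv_diag_E instead of divisions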
3083
3084#if MACRO_LEVEL>=1
3085	.macro INNER_EDGE_TRSM_RLT_INV_4X4_LIB
3086#else
3087	.align 4
3088	FUN_START(inner_edge_trsm_rlt_inv_4x4_lib)
3089#endif
3090
3091	ldr			d16, [x10, #0] // E_inv[0]
3092	fmul		v0.2d, v0.2d, v16.2d[0]
3093	fmul		v1.2d, v1.2d, v16.2d[0]
3094	ldr			d16, [x8, #8] // E[1+4*0]
3095	fmls		v2.2d, v0.2d, v16.2d[0]
3096	fmls		v3.2d, v1.2d, v16.2d[0]
3097	ldr			d16, [x8, #16] // E[2+4*0]
3098	fmls		v4.2d, v0.2d, v16.2d[0]
3099	fmls		v5.2d, v1.2d, v16.2d[0]
3100	ldr			d16, [x8, #24] // E[3+4*0]
3101	fmls		v6.2d, v0.2d, v16.2d[0]
3102	fmls		v7.2d, v1.2d, v16.2d[0]
3103	add			x8, x8, x9
3104
3105	ldr			d16, [x10, #8] // E_inv[1]
3106	fmul		v2.2d, v2.2d, v16.2d[0]
3107	fmul		v3.2d, v3.2d, v16.2d[0]
3108	ldr			d16, [x8, #16] // E[2+4*1]
3109	fmls		v4.2d, v2.2d, v16.2d[0]
3110	fmls		v5.2d, v3.2d, v16.2d[0]
3111	ldr			d16, [x8, #24] // E[3+4*1]
3112	fmls		v6.2d, v2.2d, v16.2d[0]
3113	fmls		v7.2d, v3.2d, v16.2d[0]
3114	add			x8, x8, x9
3115
3116	ldr			d16, [x10, #16] // E_inv[2]
3117	fmul		v4.2d, v4.2d, v16.2d[0]
3118	fmul		v5.2d, v5.2d, v16.2d[0]
3119	ldr			d16, [x8, #24] // E[3+4*2]
3120	fmls		v6.2d, v4.2d, v16.2d[0]
3121	fmls		v7.2d, v5.2d, v16.2d[0]
3122//	add			x8, x8, x9
3123
3124	ldr			d16, [x10, #24] // E_inv[3]
3125	fmul		v6.2d, v6.2d, v16.2d[0]
3126	fmul		v7.2d, v7.2d, v16.2d[0]
3127//	add			x8, x8, x9
3128
3129#if MACRO_LEVEL>=1
3130	.endm
3131#else
3132	ret
3133
3134	FUN_END(inner_edge_trsm_rlt_inv_4x4_lib)
3135#endif
3136
3137
3138
3139
3140
3141// subroutine
3142//
3143// triangular substitution:
3144// side = right
3145// uplo = lower
3146// tran = transposed
3147// requires explicit inverse of diagonal
3148//
3149// input arguments:
3150// x8   <- E
3151// x9   <- lde
3152// x10  <- inv_diag_E
3153// w11  <- n1
3154//
3155// output arguments:
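// note: same substitution as the non-vs variant, but only the first n1 columns are
// computed; the routine returns as soon as the requested columns are done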
3156
3157#if MACRO_LEVEL>=1
3158	.macro INNER_EDGE_TRSM_RLT_INV_4X4_VS_LIB
3159#else
3160	.align 4
3161	FUN_START(inner_edge_trsm_rlt_inv_4x4_vs_lib)
3162#endif
3163
3164	// first column
3165	ldr			d16, [x10, #0] // E_inv[0]
3166	fmul		v0.2d, v0.2d, v16.2d[0]
3167	fmul		v1.2d, v1.2d, v16.2d[0]
3168	cmp			w11, #2
3169	blt			0f // return
3170
3171	// second column
3172	ldr			d16, [x8, #8] // E[1+4*0]
3173	fmls		v2.2d, v0.2d, v16.2d[0]
3174	fmls		v3.2d, v1.2d, v16.2d[0]
3175	ldr			d16, [x10, #8] // E_inv[1]
3176	fmul		v2.2d, v2.2d, v16.2d[0]
3177	fmul		v3.2d, v3.2d, v16.2d[0]
3178	cmp			w11, #3
3179	blt			0f // return
3180
3181	// third column
3182	add			x12, x8, x9
3183	ldr			d16, [x8, #16] // E[2+4*0]
3184	fmls		v4.2d, v0.2d, v16.2d[0]
3185	fmls		v5.2d, v1.2d, v16.2d[0]
3186	ldr			d16, [x12, #16] // E[2+4*1]
3187	fmls		v4.2d, v2.2d, v16.2d[0]
3188	fmls		v5.2d, v3.2d, v16.2d[0]
3189	ldr			d16, [x10, #16] // E_inv[2]
3190	fmul		v4.2d, v4.2d, v16.2d[0]
3191	fmul		v5.2d, v5.2d, v16.2d[0]
3192	cmp			w11, #4
3193	blt			0f // return
3194
3195	// fourth column
3196	add			x13, x12, x9
3197	ldr			d16, [x8, #24] // E[3+4*0]
3198	fmls		v6.2d, v0.2d, v16.2d[0]
3199	fmls		v7.2d, v1.2d, v16.2d[0]
3200	ldr			d16, [x12, #24] // E[3+4*1]
3201	fmls		v6.2d, v2.2d, v16.2d[0]
3202	fmls		v7.2d, v3.2d, v16.2d[0]
3203	ldr			d16, [x13, #24] // E[3+4*2]
3204	fmls		v6.2d, v4.2d, v16.2d[0]
3205	fmls		v7.2d, v5.2d, v16.2d[0]
3206	ldr			d16, [x10, #24] // E_inv[3]
3207	fmul		v6.2d, v6.2d, v16.2d[0]
3208	fmul		v7.2d, v7.2d, v16.2d[0]
3209
32100:
3211#if MACRO_LEVEL>=1
3212	.endm
3213#else
3214	ret
3215
3216	FUN_END(inner_edge_trsm_rlt_inv_4x4_vs_lib)
3217#endif
3218
3219
3220
3221
3222
3223// subroutine
3224//
3225// input arguments:
3226//
3227// output arguments:
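// note: transposes the 4x4 accumulator in place: on exit v0..v7 hold the transpose
// of the matrix that was in v0..v7 on entry (v24..v27 are used as scratch)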
3228
3229#if MACRO_LEVEL>=1
3230	.macro INNER_TRAN_4X4_LIB
3231#else
3232	.align	4
3233	FUN_START(inner_tran_4x4_lib)
3234#endif
3235
3236	trn1	v24.2d, v0.2d, v2.2d
3237	trn2	v2.2d, v0.2d, v2.2d
3238	trn1	v25.2d, v5.2d, v7.2d
3239	trn2	v7.2d, v5.2d, v7.2d
3240	trn1	v26.2d, v1.2d, v3.2d
3241	trn2	v27.2d, v1.2d, v3.2d
3242	trn1	v1.2d, v4.2d, v6.2d
3243	trn2	v3.2d, v4.2d, v6.2d
3244	mov		v0.16b, v24.16b
3245	mov		v5.16b, v25.16b
3246	mov		v4.16b, v26.16b
3247	mov		v6.16b, v27.16b
3248
3249#if MACRO_LEVEL>=1
3250	.endm
3251#else
3252	ret
3253
3254	FUN_END(inner_tran_4x4_lib)
3255#endif
3256
3257
3258
3259
3260
3261// subroutine
3262//
3263// input arguments:
3264// x8   <- alpha
3265// x9   <- beta
3266// x10  <- C
3267// x11  <- ldc*sizeof(double)
3268//
3269// output arguments:
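// note: computes acc <- alpha*acc + beta*C on the accumulator in v0..v7; the loads
// from C are skipped entirely when beta compares equal to zero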
3270
3271#if MACRO_LEVEL>=1
3272	.macro INNER_SCALE_AB_4X4_LIB
3273#else
3274	.align	4
3275	FUN_START(inner_scale_ab_4x4_lib)
3276#endif
3277
3278	ld1		{v28.2d}, [x8]
3279
3280	ld1		{v29.2d}, [x9]
3281
3282	fmul	v0.2d, v0.2d, v28.2d[0]
3283	fmul	v1.2d, v1.2d, v28.2d[0]
3284	fmul	v2.2d, v2.2d, v28.2d[0]
3285	fmul	v3.2d, v3.2d, v28.2d[0]
3286	fmul	v4.2d, v4.2d, v28.2d[0]
3287	fmul	v5.2d, v5.2d, v28.2d[0]
3288	fmul	v6.2d, v6.2d, v28.2d[0]
3289	fmul	v7.2d, v7.2d, v28.2d[0]
3290
3291	fcmpe	d29, #0
3292	beq		0f
3293
3294	ldp		q24, q25, [x10, #0]
3295	add		x10, x10, x11
3296	ldp		q26, q27, [x10, #0]
3297	add		x10, x10, x11
3298	fmla	v0.2d, v24.2d, v29.2d[0]
3299	fmla	v1.2d, v25.2d, v29.2d[0]
3300	fmla	v2.2d, v26.2d, v29.2d[0]
3301	fmla	v3.2d, v27.2d, v29.2d[0]
3302
3303	ldp		q24, q25, [x10, #0]
3304	add		x10, x10, x11
3305	ldp		q26, q27, [x10, #0]
3306	add		x10, x10, x11
3307	fmla	v4.2d, v24.2d, v29.2d[0]
3308	fmla	v5.2d, v25.2d, v29.2d[0]
3309	fmla	v6.2d, v26.2d, v29.2d[0]
3310	fmla	v7.2d, v27.2d, v29.2d[0]
3311
33120:
3313
3314#if MACRO_LEVEL>=1
3315	.endm
3316#else
3317	ret
3318
3319	FUN_END(inner_scale_ab_4x4_lib)
3320#endif
3321
3322
3323
3324
3325
3326// subroutine
3327//
3328// input arguments:
3329// x8   <- alpha
3330// x9   <- beta
3331// x10  <- C
3332// x11  <- ldc*sizeof(double)
3333// x12  <- km
3334// x13  <- kn
3335//
3336// output arguments:
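// note: same as the non-vs variant, but only km rows and kn columns of C are read,
// so memory past the edge of a partial C block is never accessed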
3337
3338#if MACRO_LEVEL>=1
3339	.macro INNER_SCALE_AB_4X4_VS_LIB
3340#else
3341	.align	4
3342	FUN_START(inner_scale_ab_4x4_vs_lib)
3343#endif
3344
3345	ld1		{v28.2d}, [x8]
3346
3347	ld1		{v29.2d}, [x9]
3348
3349	fmul	v0.2d, v0.2d, v28.2d[0]
3350	fmul	v1.2d, v1.2d, v28.2d[0]
3351	fmul	v2.2d, v2.2d, v28.2d[0]
3352	fmul	v3.2d, v3.2d, v28.2d[0]
3353	fmul	v4.2d, v4.2d, v28.2d[0]
3354	fmul	v5.2d, v5.2d, v28.2d[0]
3355	fmul	v6.2d, v6.2d, v28.2d[0]
3356	fmul	v7.2d, v7.2d, v28.2d[0]
3357
3358	fcmpe	d29, #0
3359	beq		0f
3360
3361	cmp		w12, #4
3362	blt		1f
3363
3364	ldp		q24, q25, [x10, #0]
3365	add		x10, x10, x11
3366	fmla	v0.2d, v24.2d, v29.2d[0]
3367	fmla	v1.2d, v25.2d, v29.2d[0]
3368
3369	cmp		w13, #1
3370	ble		0f
3371
3372	ldp		q24, q25, [x10, #0]
3373	add		x10, x10, x11
3374	fmla	v2.2d, v24.2d, v29.2d[0]
3375	fmla	v3.2d, v25.2d, v29.2d[0]
3376
3377	cmp		w13, #2
3378	ble		0f
3379
3380	ldp		q24, q25, [x10, #0]
3381	add		x10, x10, x11
3382	fmla	v4.2d, v24.2d, v29.2d[0]
3383	fmla	v5.2d, v25.2d, v29.2d[0]
3384
3385	cmp		w13, #3
3386	ble		0f
3387
3388	ldp		q24, q25, [x10, #0]
3389	add		x10, x10, x11
3390	fmla	v6.2d, v24.2d, v29.2d[0]
3391	fmla	v7.2d, v25.2d, v29.2d[0]
3392
3393	b 0f
3394
33951:
3396	cmp		w12, #3
3397	blt		2f
3398
3399	ldr		q24, [x10, #0]
3400	ldr		d25, [x10, #16]
3401	add		x10, x10, x11
3402	fmla	v0.2d, v24.2d, v29.2d[0]
3403	fmla	v1.2d, v25.2d, v29.2d[0]
3404
3405	cmp		w13, #1
3406	ble		0f
3407
3408	ldr		q24, [x10, #0]
3409	ldr		d25, [x10, #16]
3410	add		x10, x10, x11
3411	fmla	v2.2d, v24.2d, v29.2d[0]
3412	fmla	v3.2d, v25.2d, v29.2d[0]
3413
3414	cmp		w13, #2
3415	ble		0f
3416
3417	ldr		q24, [x10, #0]
3418	ldr		d25, [x10, #16]
3419	add		x10, x10, x11
3420	fmla	v4.2d, v24.2d, v29.2d[0]
3421	fmla	v5.2d, v25.2d, v29.2d[0]
3422
3423	cmp		w13, #3
3424	ble		0f
3425
3426	ldr		q24, [x10, #0]
3427	ldr		d25, [x10, #16]
3428	add		x10, x10, x11
3429	fmla	v6.2d, v24.2d, v29.2d[0]
3430	fmla	v7.2d, v25.2d, v29.2d[0]
3431
3432	b 0f
3433
34342:
3435	cmp		w12, #2
3436	blt		3f
3437
3438	ldr		q24, [x10, #0]
3439	add		x10, x10, x11
3440	fmla	v0.2d, v24.2d, v29.2d[0]
3441
3442	cmp		w13, #1
3443	ble		0f
3444
3445	ldr		q24, [x10, #0]
3446	add		x10, x10, x11
3447	fmla	v2.2d, v24.2d, v29.2d[0]
3448
3449	cmp		w13, #2
3450	ble		0f
3451
3452	ldr		q24, [x10, #0]
3453	add		x10, x10, x11
3454	fmla	v4.2d, v24.2d, v29.2d[0]
3455
3456	cmp		w13, #3
3457	ble		0f
3458
3459	ldr		q24, [x10, #0]
3460	add		x10, x10, x11
3461	fmla	v6.2d, v24.2d, v29.2d[0]
3462
3463	b 0f
3464
34653:
3466	cmp		w12, #1
3467	blt		0f
3468
3469	ldr		d24, [x10, #0]
3470	add		x10, x10, x11
3471	fmla	v0.2d, v24.2d, v29.2d[0]
3472
3473	cmp		w13, #1
3474	ble		0f
3475
3476	ldr		d24, [x10, #0]
3477	add		x10, x10, x11
3478	fmla	v2.2d, v24.2d, v29.2d[0]
3479
3480	cmp		w13, #2
3481	ble		0f
3482
3483	ldr		d24, [x10, #0]
3484	add		x10, x10, x11
3485	fmla	v4.2d, v24.2d, v29.2d[0]
3486
3487	cmp		w13, #3
3488	ble		0f
3489
3490	ldr		d24, [x10, #0]
3491	add		x10, x10, x11
3492	fmla	v6.2d, v24.2d, v29.2d[0]
3493
34940:
3495
3496#if MACRO_LEVEL>=1
3497	.endm
3498#else
3499	ret
3500
3501	FUN_END(inner_scale_ab_4x4_vs_lib)
3502#endif
3503
3504
3505
3506
3507
3508// subroutine
3509//
3510// input arguments:
3511// x8   <- beta
3512// x9   <- C
3513// x10  <- ldc*sizeof(double)
3514//
3515// output arguments:
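// note: computes acc <- beta*C - acc on the accumulator in v0..v7; the loads from C
// are skipped when beta compares equal to zero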
3516
3517#if MACRO_LEVEL>=1
3518	.macro INNER_SCALE_M1B_4X4_LIB
3519#else
3520	.align	4
3521	FUN_START(inner_scale_m1b_4x4_lib)
3522#endif
3523
3524	ld1		{v29.2d}, [x8]
3525
3526	fneg	v0.2d, v0.2d
3527	fneg	v1.2d, v1.2d
3528	fneg	v2.2d, v2.2d
3529	fneg	v3.2d, v3.2d
3530
3531	fneg	v4.2d, v4.2d
3532	fneg	v5.2d, v5.2d
3533	fneg	v6.2d, v6.2d
3534	fneg	v7.2d, v7.2d
3535
3536	fcmpe	d29, #0
3537	beq		0f
3538
3539	ldp		q24, q25, [x9, #0]
3540	add		x9, x9, x10
3541	ldp		q26, q27, [x9, #0]
3542	add		x9, x9, x10
3543	fmla	v0.2d, v24.2d, v29.2d[0]
3544	fmla	v1.2d, v25.2d, v29.2d[0]
3545	fmla	v2.2d, v26.2d, v29.2d[0]
3546	fmla	v3.2d, v27.2d, v29.2d[0]
3547
3548	ldp		q24, q25, [x9, #0]
3549	add		x9, x9, x10
3550	ldp		q26, q27, [x9, #0]
3551	add		x9, x9, x10
3552	fmla	v4.2d, v24.2d, v29.2d[0]
3553	fmla	v5.2d, v25.2d, v29.2d[0]
3554	fmla	v6.2d, v26.2d, v29.2d[0]
3555	fmla	v7.2d, v27.2d, v29.2d[0]
3556
35570:
3558
3559#if MACRO_LEVEL>=1
3560	.endm
3561#else
3562	ret
3563
3564	FUN_END(inner_scale_m1b_4x4_lib)
3565#endif
3566
3567
3568
3569
3570
3571// subroutine
3572//
3573// input arguments:
3574// x8   <- beta
3575// x9   <- C
3576// x10  <- ldc*sizeof(double)
3577// x11  <- km
3578// x12  <- kn
3579//
3580// output arguments:
3581
3582#if MACRO_LEVEL>=1
3583	.macro INNER_SCALE_M1B_4X4_VS_LIB
3584#else
3585	.align	4
3586	FUN_START(inner_scale_m1b_4x4_vs_lib)
3587#endif
3588
3589	ld1		{v29.2d}, [x8]
3590
3591	fneg	v0.2d, v0.2d
3592	fneg	v1.2d, v1.2d
3593	fneg	v2.2d, v2.2d
3594	fneg	v3.2d, v3.2d
3595
3596	fneg	v4.2d, v4.2d
3597	fneg	v5.2d, v5.2d
3598	fneg	v6.2d, v6.2d
3599	fneg	v7.2d, v7.2d
3600
3601	fcmpe	d29, #0
3602	beq		0f
3603
3604	cmp		w11, #4
3605	blt		1f
3606
3607	ldp		q24, q25, [x9, #0]
3608	add		x9, x9, x10
3609	fmla	v0.2d, v24.2d, v29.2d[0]
3610	fmla	v1.2d, v25.2d, v29.2d[0]
3611
3612	cmp		w12, #1
3613	ble		0f
3614
3615	ldp		q24, q25, [x9, #0]
3616	add		x9, x9, x10
3617	fmla	v2.2d, v24.2d, v29.2d[0]
3618	fmla	v3.2d, v25.2d, v29.2d[0]
3619
3620	cmp		w12, #2
3621	ble		0f
3622
3623	ldp		q24, q25, [x9, #0]
3624	add		x9, x9, x10
3625	fmla	v4.2d, v24.2d, v29.2d[0]
3626	fmla	v5.2d, v25.2d, v29.2d[0]
3627
3628	cmp		w12, #3
3629	ble		0f
3630
3631	ldp		q24, q25, [x9, #0]
3632	add		x9, x9, x10
3633	fmla	v6.2d, v24.2d, v29.2d[0]
3634	fmla	v7.2d, v25.2d, v29.2d[0]
3635
3636	b 0f
3637
36381:
3639	cmp		w11, #3
3640	blt		2f
3641
3642	ldr		q24, [x9, #0]
3643	ldr		d25, [x9, #16]
3644	add		x9, x9, x10
3645	fmla	v0.2d, v24.2d, v29.2d[0]
3646	fmla	v1.2d, v25.2d, v29.2d[0]
3647
3648	cmp		w12, #1
3649	ble		0f
3650
3651	ldr		q24, [x9, #0]
3652	ldr		d25, [x9, #16]
3653	add		x9, x9, x10
3654	fmla	v2.2d, v24.2d, v29.2d[0]
3655	fmla	v3.2d, v25.2d, v29.2d[0]
3656
3657	cmp		w12, #2
3658	ble		0f
3659
3660	ldr		q24, [x9, #0]
3661	ldr		d25, [x9, #16]
3662	add		x9, x9, x10
3663	fmla	v4.2d, v24.2d, v29.2d[0]
3664	fmla	v5.2d, v25.2d, v29.2d[0]
3665
3666	cmp		w12, #3
3667	ble		0f
3668
3669	ldr		q24, [x9, #0]
3670	ldr		d25, [x9, #16]
3671	add		x9, x9, x10
3672	fmla	v6.2d, v24.2d, v29.2d[0]
3673	fmla	v7.2d, v25.2d, v29.2d[0]
3674
3675	b 0f
3676
36772:
3678	cmp		w11, #2
3679	blt		3f
3680
3681	ldr		q24, [x9, #0]
3682	add		x9, x9, x10
3683	fmla	v0.2d, v24.2d, v29.2d[0]
3684
3685	cmp		w12, #1
3686	ble		0f
3687
3688	ldr		q24, [x9, #0]
3689	add		x9, x9, x10
3690	fmla	v2.2d, v24.2d, v29.2d[0]
3691
3692	cmp		w12, #2
3693	ble		0f
3694
3695	ldr		q24, [x9, #0]
3696	add		x9, x9, x10
3697	fmla	v4.2d, v24.2d, v29.2d[0]
3698
3699	cmp		w12, #3
3700	ble		0f
3701
3702	ldr		q24, [x9, #0]
3703	add		x9, x9, x10
3704	fmla	v6.2d, v24.2d, v29.2d[0]
3705
3706	b 0f
3707
37083:
3709	cmp		w11, #1
3710	blt		0f
3711
3712	ldr		d24, [x9, #0]
3713	add		x9, x9, x10
3714	fmla	v0.2d, v24.2d, v29.2d[0]
3715
3716	cmp		w12, #1
3717	ble		0f
3718
3719	ldr		d24, [x9, #0]
3720	add		x9, x9, x10
3721	fmla	v2.2d, v24.2d, v29.2d[0]
3722
3723	cmp		w12, #2
3724	ble		0f
3725
3726	ldr		d24, [x9, #0]
3727	add		x9, x9, x10
3728	fmla	v4.2d, v24.2d, v29.2d[0]
3729
3730	cmp		w12, #3
3731	ble		0f
3732
3733	ldr		d24, [x9, #0]
3734	add		x9, x9, x10
3735	fmla	v6.2d, v24.2d, v29.2d[0]
3736
37370:
3738
3739#if MACRO_LEVEL>=1
3740	.endm
3741#else
3742	ret
3743
3744	FUN_END(inner_scale_m1b_4x4_vs_lib)
3745#endif
3746
3747
3748
3749
3750
3751// subroutine
3752//
3753// input arguments:
3754// x8  <- C
3755// x9  <- ldc*sizeof(double)
3756//
3757// output arguments:
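// note: computes acc <- C - acc on the accumulator in v0..v7 (i.e. alpha=-1.0, beta=1.0)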
3758
3759#if MACRO_LEVEL>=1
3760	.macro INNER_SCALE_M11_4X4_LIB
3761#else
3762	.align	4
3763	FUN_START(inner_scale_m11_4x4_lib)
3764#endif
3765
3766	ldp		q24, q25, [x8, #0]
3767	add		x8, x8, x9
3768	ldp		q26, q27, [x8, #0]
3769	add		x8, x8, x9
3770	fsub	v0.2d, v24.2d, v0.2d
3771	fsub	v1.2d, v25.2d, v1.2d
3772	fsub	v2.2d, v26.2d, v2.2d
3773	fsub	v3.2d, v27.2d, v3.2d
3774
3775	ldp		q24, q25, [x8, #0]
3776	add		x8, x8, x9
3777	ldp		q26, q27, [x8, #0]
3778	add		x8, x8, x9
3779	fsub	v4.2d, v24.2d, v4.2d
3780	fsub	v5.2d, v25.2d, v5.2d
3781	fsub	v6.2d, v26.2d, v6.2d
3782	fsub	v7.2d, v27.2d, v7.2d
3783
3784#if MACRO_LEVEL>=1
3785	.endm
3786#else
3787	ret
3788
3789	FUN_END(inner_scale_m11_4x4_lib)
3790#endif
3791
3792
3793
3794
3795
3796// subroutine
3797//
3798// input arguments:
3799// x8  <- C
3800// x9  <- ldc*sizeof(double)
3801// x10  <- km
3802// x11  <- kn
3803//
3804// output arguments:
3805
3806#if MACRO_LEVEL>=1
3807	.macro INNER_SCALE_M11_4X4_VS_LIB
3808#else
3809	.align	4
3810	FUN_START(inner_scale_m11_4x4_vs_lib)
3811#endif
3812
3813	cmp		w10, #4
3814	blt		1f
3815
3816	ldp		q24, q25, [x8, #0]
3817	add		x8, x8, x9
3818	fsub	v0.2d, v24.2d, v0.2d
3819	fsub	v1.2d, v25.2d, v1.2d
3820
3821	cmp		w11, #1
3822	ble		0f
3823
3824	ldp		q24, q25, [x8, #0]
3825	add		x8, x8, x9
3826	fsub	v2.2d, v24.2d, v2.2d
3827	fsub	v3.2d, v25.2d, v3.2d
3828
3829	cmp		w11, #2
3830	ble		0f
3831
3832	ldp		q24, q25, [x8, #0]
3833	add		x8, x8, x9
3834	fsub	v4.2d, v24.2d, v4.2d
3835	fsub	v5.2d, v25.2d, v5.2d
3836
3837	cmp		w11, #3
3838	ble		0f
3839
3840	ldp		q24, q25, [x8, #0]
3841	add		x8, x8, x9
3842	fsub	v6.2d, v24.2d, v6.2d
3843	fsub	v7.2d, v25.2d, v7.2d
3844
3845	b 0f
3846
38471:
3848	cmp		w10, #3
3849	blt		2f
3850
3851	ldr		q24, [x8, #0]
3852	ldr		d25, [x8, #16]
3853	add		x8, x8, x9
3854	fsub	v0.2d, v24.2d, v0.2d
3855	fsub	v1.2d, v25.2d, v1.2d
3856
3857	cmp		w11, #1
3858	ble		0f
3859
3860	ldr		q24, [x8, #0]
3861	ldr		d25, [x8, #16]
3862	add		x8, x8, x9
3863	fsub	v2.2d, v24.2d, v2.2d
3864	fsub	v3.2d, v25.2d, v3.2d
3865
3866	cmp		w11, #2
3867	ble		0f
3868
3869	ldr		q24, [x8, #0]
3870	ldr		d25, [x8, #16]
3871	add		x8, x8, x9
3872	fsub	v4.2d, v24.2d, v4.2d
3873	fsub	v5.2d, v25.2d, v5.2d
3874
3875	cmp		w11, #3
3876	ble		0f
3877
3878	ldr		q24, [x8, #0]
3879	ldr		d25, [x8, #16]
3880	add		x8, x8, x9
3881	fsub	v6.2d, v24.2d, v6.2d
3882	fsub	v7.2d, v25.2d, v7.2d
3883
3884	b 0f
3885
38862:
3887	cmp		w10, #2
3888	blt		3f
3889
3890	ldr		q24, [x8, #0]
3891	add		x8, x8, x9
3892	fsub	v0.2d, v24.2d, v0.2d
3893
3894	cmp		w11, #1
3895	ble		0f
3896
3897	ldr		q24, [x8, #0]
3898	add		x8, x8, x9
3899	fsub	v2.2d, v24.2d, v2.2d
3900
3901	cmp		w11, #2
3902	ble		0f
3903
3904	ldr		q24, [x8, #0]
3905	add		x8, x8, x9
3906	fsub	v4.2d, v24.2d, v4.2d
3907
3908	cmp		w11, #3
3909	ble		0f
3910
3911	ldr		q24, [x8, #0]
3912	add		x8, x8, x9
3913	fsub	v6.2d, v24.2d, v6.2d
3914
3915	b 0f
3916
39173:
3918	cmp		w10, #1
3919	blt		0f
3920
3921	ldr		d24, [x8, #0]
3922	add		x8, x8, x9
3923	fsub	v0.2d, v24.2d, v0.2d
3924
3925	cmp		w11, #1
3926	ble		0f
3927
3928	ldr		d24, [x8, #0]
3929	add		x8, x8, x9
3930	fsub	v2.2d, v24.2d, v2.2d
3931
3932	cmp		w11, #2
3933	ble		0f
3934
3935	ldr		d24, [x8, #0]
3936	add		x8, x8, x9
3937	fsub	v4.2d, v24.2d, v4.2d
3938
3939	cmp		w11, #3
3940	ble		0f
3941
3942	ldr		d24, [x8, #0]
3943	add		x8, x8, x9
3944	fsub	v6.2d, v24.2d, v6.2d
3945
39460:
3947
3948#if MACRO_LEVEL>=1
3949	.endm
3950#else
3951	ret
3952
3953	FUN_END(inner_scale_m11_4x4_vs_lib)
3954#endif
3955
3956
3957
3958
3959
3960// subroutine
3961//
3962// input arguments:
3963// x8   <- D
3964// x9   <- ldd*sizeof(double)
3965//
3966// output arguments:
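// note: stores the 4x4 accumulator v0..v7 to D, one column per stp of two q registers,
// advancing by x9 (ldd in bytes) between columns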
3967
3968#if MACRO_LEVEL>=1
3969	.macro INNER_STORE_4X4_LIB
3970#else
3971	.align 4
3972	FUN_START(inner_store_4x4_lib)
3973#endif
3974
3975	stp		q0, q1, [x8, #0]
3976	add		x8, x8, x9
3977	stp		q2, q3, [x8, #0]
3978	add		x8, x8, x9
3979	stp		q4, q5, [x8, #0]
3980	add		x8, x8, x9
3981	stp		q6, q7, [x8, #0]
3982
3983#if MACRO_LEVEL>=1
3984	.endm
3985#else
3986	ret
3987
3988	FUN_END(inner_store_4x4_lib)
3989#endif
3990
3991
3992
3993
3994
3995// subroutine
3996//
3997// input arguments:
3998// x8   <- D
3999// x9   <- ldd*sizeof(double)
4000// x10  <- km
4001// x11  <- kn
4002//
4003// output arguments:
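// note: when km<4 the rows of D beyond km are first loaded and merged back into the
// accumulator, so the full-width column stores below do not clobber them; columns
// beyond kn are simply not written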
4004
4005#if MACRO_LEVEL>=1
4006	.macro INNER_STORE_4X4_VS_LIB
4007#else
4008	.align 4
4009	FUN_START(inner_store_4x4_vs_lib)
4010#endif
4011
4012	cmp		w10, #4
4013	bge		1f
4014
4015	mov		x12, x8
4016
4017	ldp		q24, q25, [x12, #0]
4018	add		x12, x12, x9
4019	ldp		q26, q27, [x12, #0]
4020	add		x12, x12, x9
4021	ldp		q28, q29, [x12, #0]
4022	add		x12, x12, x9
4023	ldp		q30, q31, [x12, #0]
4024
4025	// 4th row
4026	ins		v1.d[1], v25.d[1]
4027	ins		v3.d[1], v27.d[1]
4028	ins		v5.d[1], v29.d[1]
4029	ins		v7.d[1], v31.d[1]
4030	cmp		w10, #3
4031	bge		1f
4032	// 3rd row
4033	ins		v1.d[0], v25.d[0]
4034	ins		v3.d[0], v27.d[0]
4035	ins		v5.d[0], v29.d[0]
4036	ins		v7.d[0], v31.d[0]
4037	cmp		w10, #2
4038	bge		1f
4039	// 2nd row
4040	ins		v0.d[1], v24.d[1]
4041	ins		v2.d[1], v26.d[1]
4042	ins		v4.d[1], v28.d[1]
4043	ins		v6.d[1], v30.d[1]
4044	cmp		w10, #1
4045	bge		1f
4046	// 1st row
4047	ins		v0.d[0], v24.d[0]
4048	ins		v2.d[0], v26.d[0]
4049	ins		v4.d[0], v28.d[0]
4050	ins		v6.d[0], v30.d[0]
4051
40521:
4053	// 1st col
4054	stp		q0, q1, [x8, #0]
4055	add		x8, x8, x9
4056	cmp		w11, #2
4057	blt		0f
4058	// 2nd col
4059	stp		q2, q3, [x8, #0]
4060	add		x8, x8, x9
4061	cmp		w11, #3
4062	blt		0f
4063	// 3rd col
4064	stp		q4, q5, [x8, #0]
4065	add		x8, x8, x9
4066	cmp		w11, #3
4067	beq		0f
4068	// 4th col
4069	stp		q6, q7, [x8, #0]
4070
40710:
4072
4073#if MACRO_LEVEL>=1
4074	.endm
4075#else
4076	ret
4077
4078	FUN_END(inner_store_4x4_vs_lib)
4079#endif
4080
4081
4082
4083
4084
4085// subroutine
4086//
4087// input arguments:
4088// x8   <- D
4089// x9   <- ldd*sizeof(double)
4090//
4091// output arguments:
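// note: stores only the lower triangle: the strictly upper entries D[0+1*ldd] and
// D[2+3*ldd] are reloaded from D and re-inserted before the stores, and columns 2
// and 3 are written starting from their diagonal element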
4092
4093#if MACRO_LEVEL>=1
4094	.macro INNER_STORE_L_4X4_LIB
4095#else
4096	.align 4
4097	FUN_START(inner_store_l_4x4_lib)
4098#endif
4099
4100	mov		x12, x8
4101
4102	add		x12, x12, x9
4103	ldr		q16, [x12, #0]
4104	add		x12, x12, x9
4105	add		x12, x12, x9
4106	ldr		q17, [x12, #16]
4107
4108	ins		v2.d[0], v16.d[0]
4109	ins		v7.d[0], v17.d[0]
4110
4111	stp		q0, q1, [x8, #0]
4112	add		x8, x8, x9
4113	stp		q2, q3, [x8, #0]
4114	add		x8, x8, x9
4115	str		q5, [x8, #16]
4116	add		x8, x8, x9
4117	str		q7, [x8, #16]
4118
4119#if MACRO_LEVEL>=1
4120	.endm
4121#else
4122	ret
4123
4124	FUN_END(inner_store_l_4x4_lib)
4125#endif
4126
4127
4128
4129
4130
4131// subroutine
4132//
4133// input arguments:
4134// x8   <- D
4135// x9   <- ldd*sizeof(double)
4136// x10  <- km
4137// x11  <- kn
4138//
4139// output arguments:
4140
4141#if MACRO_LEVEL>=1
4142	.macro INNER_STORE_L_4X4_VS_LIB
4143#else
4144	.align 4
4145	FUN_START(inner_store_l_4x4_vs_lib)
4146#endif
4147
4148	cmp		w10, #4
4149	bge		1f
4150
4151	mov		x12, x8
4152
4153	ldp		q24, q25, [x12, #0]
4154	add		x12, x12, x9
4155	ldp		q26, q27, [x12, #0]
4156	add		x12, x12, x9
4157	ldp		q28, q29, [x12, #0]
4158	add		x12, x12, x9
4159	ldp		q30, q31, [x12, #0]
4160
4161	// 4th row
4162	ins		v1.d[1], v25.d[1]
4163	ins		v3.d[1], v27.d[1]
4164	ins		v5.d[1], v29.d[1]
4165	ins		v7.d[1], v31.d[1]
4166	cmp		w10, #3
4167	bge		1f
4168	// 3rd row
4169	ins		v1.d[0], v25.d[0]
4170	ins		v3.d[0], v27.d[0]
4171	ins		v5.d[0], v29.d[0]
4172	ins		v7.d[0], v31.d[0]
4173	cmp		w10, #2
4174	bge		1f
4175	// 2nd row
4176	ins		v0.d[1], v24.d[1]
4177	ins		v2.d[1], v26.d[1]
4178	ins		v4.d[1], v28.d[1]
4179	ins		v6.d[1], v30.d[1]
4180	cmp		w10, #1
4181	bge		1f
4182	// 1st row
4183	ins		v0.d[0], v24.d[0]
4184	ins		v2.d[0], v26.d[0]
4185	ins		v4.d[0], v28.d[0]
4186	ins		v6.d[0], v30.d[0]
4187
41881:
4189	mov		x12, x8
4190
4191	add		x12, x12, x9
4192	ldr		q16, [x12, #0]
4193	add		x12, x12, x9
4194	add		x12, x12, x9
4195	ldr		q17, [x12, #16]
4196
4197	ins		v2.d[0], v16.d[0]
4198	ins		v7.d[0], v17.d[0]
4199
4200	// 1st col
4201	stp		q0, q1, [x8, #0]
4202	add		x8, x8, x9
4203	cmp		w11, #2
4204	blt		0f
4205	// 2nd col
4206	stp		q2, q3, [x8, #0]
4207	add		x8, x8, x9
4208	cmp		w11, #3
4209	blt		0f
4210	// 3rd col
4211	str		q5, [x8, #16]
4212	add		x8, x8, x9
4213	beq		0f
4214	// 4th col
4215	str		q7, [x8, #16]
4216
42170:
4218
4219#if MACRO_LEVEL>=1
4220	.endm
4221#else
4222	ret
4223
4224	FUN_END(inner_store_l_4x4_vs_lib)
4225#endif
4226
4227
4228
4229
4230
4231// subroutine
4232//
4233// input arguments:
4234// x8   <- D
4235// x9   <- ldd*sizeof(double)
4236//
4237// output arguments:
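// note: stores only the upper triangle: column 0 gets a single double (d0), columns
// 1 and 2 are written down to their diagonal, and column 3 is stored in full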
4238
4239#if MACRO_LEVEL>=1
4240	.macro INNER_STORE_U_4X4_LIB
4241#else
4242	.align 4
4243	FUN_START(inner_store_u_4x4_lib)
4244#endif
4245
4246	str		d0, [x8, #0]
4247	add		x8, x8, x9
4248	str		q2, [x8, #0]
4249	add		x8, x8, x9
4250	str		q4, [x8, #0]
4251	str		d5, [x8, #16]
4252	add		x8, x8, x9
4253	stp		q6, q7, [x8, #0]
4254
4255#if MACRO_LEVEL>=1
4256	.endm
4257#else
4258	ret
4259
4260	FUN_END(inner_store_u_4x4_lib)
4261#endif
4262
4263
4264
4265
4266
4267// subroutine
4268//
4269// input arguments:
4270// x8   <- D
4271// x9   <- ldd*sizeof(double)
4272// x10  <- km
4273// x11  <- kn
4274//
4275// output arguments:
4276
4277#if MACRO_LEVEL>=1
4278	.macro INNER_STORE_U_4X4_VS_LIB
4279#else
4280	.align 4
4281	FUN_START(inner_store_u_4x4_vs_lib)
4282#endif
4283
4284	cmp		w10, #4
4285	bge		1f
4286
4287	mov		x12, x8
4288
4289	ldp		q24, q25, [x12, #0]
4290	add		x12, x12, x9
4291	ldp		q26, q27, [x12, #0]
4292	add		x12, x12, x9
4293	ldp		q28, q29, [x12, #0]
4294	add		x12, x12, x9
4295	ldp		q30, q31, [x12, #0]
4296
4297	// 4th row
4298	ins		v1.d[1], v25.d[1]
4299	ins		v3.d[1], v27.d[1]
4300	ins		v5.d[1], v29.d[1]
4301	ins		v7.d[1], v31.d[1]
4302	cmp		w10, #3
4303	bge		1f
4304	// 3rd row
4305	ins		v1.d[0], v25.d[0]
4306	ins		v3.d[0], v27.d[0]
4307	ins		v5.d[0], v29.d[0]
4308	ins		v7.d[0], v31.d[0]
4309	cmp		w10, #2
4310	bge		1f
4311	// 2nd row
4312	ins		v0.d[1], v24.d[1]
4313	ins		v2.d[1], v26.d[1]
4314	ins		v4.d[1], v28.d[1]
4315	ins		v6.d[1], v30.d[1]
4316	cmp		w10, #1
4317	bge		1f
4318	// 1st row
4319	ins		v0.d[0], v24.d[0]
4320	ins		v2.d[0], v26.d[0]
4321	ins		v4.d[0], v28.d[0]
4322	ins		v6.d[0], v30.d[0]
4323
43241:
4325	// 1st col
4326	str		d0, [x8, #0]
4327	add		x8, x8, x9
4328	cmp		w11, #2
4329	blt		0f
4330	// 2nd col
4331	str		q2, [x8, #0]
4332	add		x8, x8, x9
4333	cmp		w11, #3
4334	blt		0f
4335	// 3rd col
4336	str		q4, [x8, #0]
4337	str		d5, [x8, #16]
4338	add		x8, x8, x9
4339	beq		0f
4340	// 4th col
4341	stp		q6, q7, [x8, #0]
4342
43430:
4344
4345#if MACRO_LEVEL>=1
4346	.endm
4347#else
4348	ret
4349
4350	FUN_END(inner_store_u_4x4_vs_lib)
4351#endif
4352
4353
4354
4355
4356
4357// subroutine
4358//
4359// input arguments:
4360// x8   <- D
4361// x9   <- ldd*sizeof(double)
4362//
4363// output arguments:
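// note: issues one PLDL1KEEP prefetch at the start of each of the 4 columns of D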
4364
4365#if MACRO_LEVEL>=1
4366	.macro INNER_PREFETCH_4X4_LIB
4367#else
4368	.align 4
4369	FUN_START(inner_prefetch_4x4_lib)
4370#endif
4371
4372	prfm	PLDL1KEEP, [x8, #0]
4373	add		x8, x8, x9
4374	prfm	PLDL1KEEP, [x8, #0]
4375	add		x8, x8, x9
4376	prfm	PLDL1KEEP, [x8, #0]
4377	add		x8, x8, x9
4378	prfm	PLDL1KEEP, [x8, #0]
4379
4380#if MACRO_LEVEL>=1
4381	.endm
4382#else
4383	ret
4384
4385	FUN_END(inner_prefetch_4x4_lib)
4386#endif
4387
4388
4389
4390
4391
4392//                                 w0        x1             x2         x3         x4            x5         w6       x7         sp+0
4393// void kernel_dgemm_nt_4x4_lib44cc(int kmax, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd)
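// presumably computes D <- beta*C + alpha*A*B^T for a 4x4 block, with A and B in
// panel-major (lib4) storage and C, D in column-major storage with leading
// dimensions ldc and ldd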
4394
4395	.align	4
4396	GLOB_FUN_START(kernel_dgemm_nt_4x4_lib44cc)
4397
4398
4399
4400	PROLOGUE
4401
4402
4403
4404	ZERO_ACC
4405
4406
4407
4408	// call inner kernel gemm nt
4409	mov		w8, w0 // kmax
4410	mov		x9, x2 // A
4411	mov		x10, x3 // B
4412
4413#if MACRO_LEVEL>=2
4414	INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
4415#else
4416	bl	inner_kernel_gemm_add_nt_4x4_lib4
4417#endif
4418
4419
4420
4421	// prefetch
4422	mov		x8, x7 // D
4423	ldr		w9, [sp, #(STACKSIZE + 0)] // ldd
4424	lsl		w9, w9, #3 // 8*ldd
4425
4426#if MACRO_LEVEL>=1
4427	INNER_PREFETCH_4X4_LIB
4428#else
4429	bl inner_prefetch_4x4_lib
4430#endif
4431
4432
4433
4434	// call inner blend for generic alpha and beta
4435	mov		x8, x1 // alpha
4436	mov		x9, x4 // beta
4437	mov		x10, x5 // C
4438	mov		w11, w6 // ldc
4439	lsl		w11, w11, #3 // 8*ldc
4440
4441#if MACRO_LEVEL>=1
4442	INNER_SCALE_AB_4X4_LIB
4443#else
4444	bl inner_scale_ab_4x4_lib
4445#endif
4446
4447
4448
4449	// store n
4450	mov		x8, x7 // D
4451	ldr		w9, [sp, #(STACKSIZE + 0)] // ldd
4452	lsl		w9, w9, #3 // 8*ldd
4453
4454#if MACRO_LEVEL>=1
4455	INNER_STORE_4X4_LIB
4456#else
4457	bl inner_store_4x4_lib
4458#endif
4459
4460
4461
4462	EPILOGUE
4463
4464	mov	x0, #0
4465
4466	ret
4467
4468	FUN_END(kernel_dgemm_nt_4x4_lib44cc)
4469
4470
4471
4472
4473
4474//                                    w0        x1             x2         x3         x4            x5         w6       x7         sp+0     sp+8    sp+16
4475// void kernel_dgemm_nt_4x4_vs_lib44cc(int kmax, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1)
4476
4477	.align	4
4478	GLOB_FUN_START(kernel_dgemm_nt_4x4_vs_lib44cc)
4479
4480
4481
4482	PROLOGUE
4483
4484
4485
4486	ZERO_ACC
4487
4488
4489
4490	// call inner kernel gemm nt
4491	mov		w8, w0 // kmax
4492	mov		x9, x2 // A
4493	mov		x10, x3 // B
4494
4495#if MACRO_LEVEL>=2
4496	INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
4497#else
4498	bl	inner_kernel_gemm_add_nt_4x4_lib4
4499#endif
4500
4501
4502
4503	// call inner blend for generic alpha and beta
4504	mov		x8, x1 // alpha
4505	mov		x9, x4 // beta
4506	mov		x10, x5 // C
4507	mov		w11, w6 // ldc
4508	lsl		w11, w11, #3 // 8*ldc
4509	ldr		w12, [sp, #(STACKSIZE + 8)] // m1
4510	ldr		w13, [sp, #(STACKSIZE + 16)] // n1
4511
4512#if MACRO_LEVEL>=1
4513	INNER_SCALE_AB_4X4_VS_LIB
4514#else
4515	bl inner_scale_ab_4x4_vs_lib
4516#endif
4517
4518
4519
4520	// store n
4521	mov		x8, x7 // D
4522	ldr		w9, [sp, #(STACKSIZE + 0)] // ldd
4523	lsl		w9, w9, #3 // 8*ldd
4524	ldr		w10, [sp, #(STACKSIZE + 8)] // m1
4525	ldr		w11, [sp, #(STACKSIZE + 16)] // n1
4526
4527#if MACRO_LEVEL>=1
4528	INNER_STORE_4X4_VS_LIB
4529#else
4530	bl inner_store_4x4_vs_lib
4531#endif
4532
4533
4534
4535	EPILOGUE
4536
4537	mov	x0, #0
4538
4539	ret
4540
4541	FUN_END(kernel_dgemm_nt_4x4_vs_lib44cc)
4542
4543
4544
4545
4546
4547//                                 w0        x1             x2         x3         w4       x5            x6         w7       sp+0       sp+8
4548// void kernel_dgemm_nt_4x4_lib4ccc(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd)
4549
4550	.align	4
4551	GLOB_FUN_START(kernel_dgemm_nt_4x4_lib4ccc)
4552
4553
4554
4555	PROLOGUE
4556
4557
4558
4559	ZERO_ACC
4560
4561
4562
4563	// call inner kernel gemm nt
4564	mov		w8, w0 // kmax
4565	mov		x9, x2 // A
4566	mov		x10, x3 // B
4567	mov		w11, w4 // ldb
4568	lsl		w11, w11, #3 // 8*ldb
4569
4570#if MACRO_LEVEL>=2
4571	INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4C
4572#else
4573	bl	inner_kernel_gemm_add_nt_4x4_lib4c
4574#endif
4575
4576
4577
4578	// prefetch
4579	ldr		x8, [sp, #(STACKSIZE + 0)] // D
4580	ldr		w9, [sp, #(STACKSIZE + 8)] // ldd
4581	lsl		w9, w9, #3 // 8*ldd
4582
4583#if MACRO_LEVEL>=1
4584	INNER_PREFETCH_4X4_LIB
4585#else
4586	bl inner_prefetch_4x4_lib
4587#endif
4588
4589
4590
4591	// call inner blend for generic alpha and beta
4592	mov		x8, x1 // alpha
4593	mov		x9, x5 // beta
4594	mov		x10, x6 // C
4595	mov		w11, w7 // ldc
4596	lsl		w11, w11, #3 // 8*ldc
4597
4598#if MACRO_LEVEL>=1
4599	INNER_SCALE_AB_4X4_LIB
4600#else
4601	bl inner_scale_ab_4x4_lib
4602#endif
4603
4604
4605
4606	// store n
4607	ldr		x8, [sp, #(STACKSIZE + 0)] // D
4608	ldr		w9, [sp, #(STACKSIZE + 8)] // ldd
4609	lsl		w9, w9, #3 // 8*ldd
4610
4611#if MACRO_LEVEL>=1
4612	INNER_STORE_4X4_LIB
4613#else
4614	bl inner_store_4x4_lib
4615#endif
4616
4617
4618
4619	EPILOGUE
4620
4621	mov	x0, #0
4622
4623	ret
4624
4625	FUN_END(kernel_dgemm_nt_4x4_lib4ccc)
4626
4627
4628
4629
4630
4631
4632//                                    w0        x1             x2         x3         w4       x5            x6         w7       sp+0       sp+8     sp+16   sp+24
4633// void kernel_dgemm_nt_4x4_vs_lib4ccc(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1)
4634
4635	.align	4
4636	GLOB_FUN_START(kernel_dgemm_nt_4x4_vs_lib4ccc)
4637
4638
4639
4640	PROLOGUE
4641
4642
4643
4644	ZERO_ACC
4645
4646
4647
4648	// call inner kernel gemm nt
4649	mov		w8, w0 // kmax
4650	mov		x9, x2 // A
4651	mov		x10, x3 // B
4652	mov		w11, w4 // ldb
4653	lsl		w11, w11, #3 // 8*ldb
4654
4655	ldr		w12, [sp, #(STACKSIZE + 24)] // n1
4656	cmp		w12, #1
4657	bgt		100f
4658
4659#if MACRO_LEVEL>=2
4660	INNER_KERNEL_GEMM_ADD_NT_4X1_LIB4C
4661#else
4662	bl	inner_kernel_gemm_add_nt_4x1_lib4c
4663#endif
4664
4665	b		103f
4666
4667100:
4668
4669	ldr		w12, [sp, #(STACKSIZE + 24)] // n1
4670	cmp		w12, #2
4671	bgt		101f
4672
4673#if MACRO_LEVEL>=2
4674	INNER_KERNEL_GEMM_ADD_NT_4X2_LIB4C
4675#else
4676	bl	inner_kernel_gemm_add_nt_4x2_lib4c
4677#endif
4678
4679	b		103f
4680
4681101:
4682
4683	ldr		w12, [sp, #(STACKSIZE + 24)] // n1
4684	cmp		w12, #3
4685	bgt		102f
4686
4687#if MACRO_LEVEL>=2
4688	INNER_KERNEL_GEMM_ADD_NT_4X3_LIB4C
4689#else
4690	bl	inner_kernel_gemm_add_nt_4x3_lib4c
4691#endif
4692
4693	b		103f
4694
4695102:
4696
4697#if MACRO_LEVEL>=2
4698	INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4C
4699#else
4700	bl	inner_kernel_gemm_add_nt_4x4_lib4c
4701#endif
4702
4703103:
4704
4705
4706
4707	// prefetch
4708	// TODO prefetch vs
4709//	ldr		x8, [sp, #(STACKSIZE + 0)] // D
4710//	ldr		w9, [sp, #(STACKSIZE + 8)] // ldd
4711//	lsl		w9, w9, #3 // 8*ldd
4712
4713#if MACRO_LEVEL>=1
4714//	INNER_PREFETCH_4X4_LIB
4715#else
4716//	bl inner_prefetch_4x4_lib
4717#endif
4718
4719
4720
4721	// call inner blend for generic alpha and beta
4722	mov		x8, x1 // alpha
4723	mov		x9, x5 // beta
4724	mov		x10, x6 // C
4725	mov		w11, w7 // ldc
4726	lsl		w11, w11, #3 // 8*ldc
4727	ldr		w12, [sp, #(STACKSIZE + 16)] // m1
4728	ldr		w13, [sp, #(STACKSIZE + 24)] // n1
4729
4730#if MACRO_LEVEL>=1
4731	INNER_SCALE_AB_4X4_VS_LIB
4732#else
4733	bl inner_scale_ab_4x4_vs_lib
4734#endif
4735
4736
4737
4738	// store n
4739	ldr		x8, [sp, #(STACKSIZE + 0)] // D
4740	ldr		w9, [sp, #(STACKSIZE + 8)] // ldd
4741	lsl		w9, w9, #3 // 8*ldd
4742	ldr		w10, [sp, #(STACKSIZE + 16)] // m1
4743	ldr		w11, [sp, #(STACKSIZE + 24)] // n1
4744
4745#if MACRO_LEVEL>=1
4746	INNER_STORE_4X4_VS_LIB
4747#else
4748	bl inner_store_4x4_vs_lib
4749#endif
4750
4751
4752
4753	EPILOGUE
4754
4755	mov	x0, #0
4756
4757	ret
4758
4759	FUN_END(kernel_dgemm_nt_4x4_vs_lib4ccc)
4760
4761
4762
4763
4764
4765
4766//                                 w0        x1             x2         x3         w4       x5            x6         w7       sp+0       sp+8
4767// void kernel_dgemm_nn_4x4_lib4ccc(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd)
4768
4769	.align	4
4770	GLOB_FUN_START(kernel_dgemm_nn_4x4_lib4ccc)
4771
4772
4773
4774	PROLOGUE
4775
4776
4777
4778	ZERO_ACC
4779
4780
4781
4782	// call inner kernel gemm nt
4783	mov		w8, w0 // kmax
4784	mov		x9, x2 // A
4785	mov		x10, x3 // B
4786	mov		w11, w4 // ldb
4787	lsl		w11, w11, #3 // 8*ldb
4788
4789#if MACRO_LEVEL>=2
4790	INNER_KERNEL_GEMM_ADD_NN_4X4_LIB4C
4791#else
4792	bl	inner_kernel_gemm_add_nn_4x4_lib4c
4793#endif
4794
4795
4796
4797	// prefetch
4798	ldr		x8, [sp, #(STACKSIZE + 0)] // D
4799	ldr		w9, [sp, #(STACKSIZE + 8)] // ldd
4800	lsl		w9, w9, #3 // 8*ldd
4801
4802#if MACRO_LEVEL>=1
4803	INNER_PREFETCH_4X4_LIB
4804#else
4805	bl inner_prefetch_4x4_lib
4806#endif
4807
4808
4809
4810	// call inner blend for generic alpha and beta
4811	mov		x8, x1 // alpha
4812	mov		x9, x5 // beta
4813	mov		x10, x6 // C
4814	mov		w11, w7 // ldc
4815	lsl		w11, w11, #3 // 8*ldc
4816
4817#if MACRO_LEVEL>=1
4818	INNER_SCALE_AB_4X4_LIB
4819#else
4820	bl inner_scale_ab_4x4_lib
4821#endif
4822
4823
4824
4825	// store n
4826	ldr		x8, [sp, #(STACKSIZE + 0)] // D
4827	ldr		w9, [sp, #(STACKSIZE + 8)] // ldd
4828	lsl		w9, w9, #3 // 8*ldd
4829
4830#if MACRO_LEVEL>=1
4831	INNER_STORE_4X4_LIB
4832#else
4833	bl inner_store_4x4_lib
4834#endif
4835
4836
4837
4838	EPILOGUE
4839
4840	mov	x0, #0
4841
4842	ret
4843
4844	FUN_END(kernel_dgemm_nn_4x4_lib4ccc)
4845
4846
4847
4848
4849
4850//                                    w0        x1             x2         x3         w4       x5            x6         w7       sp+0       sp+8     sp+16   sp+24
4851// void kernel_dgemm_nn_4x4_vs_lib4ccc(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1)
4852
4853	.align	4
4854	GLOB_FUN_START(kernel_dgemm_nn_4x4_vs_lib4ccc)
4855
4856
4857
4858	PROLOGUE
4859
4860
4861
4862	ZERO_ACC
4863
4864
4865
4866	// call inner kernel gemm nt
4867	mov		w8, w0 // kmax
4868	mov		x9, x2 // A
4869	mov		x10, x3 // B
4870	mov		w11, w4 // ldb
4871	lsl		w11, w11, #3 // 8*ldb
4872
4873	ldr		w12, [sp, #(STACKSIZE + 24)] // n1
4874	cmp		w12, #1
4875	bgt		100f
4876
4877#if MACRO_LEVEL>=2
4878	INNER_KERNEL_GEMM_ADD_NN_4X1_LIB4C
4879#else
4880	bl	inner_kernel_gemm_add_nn_4x1_lib4c
4881#endif
4882
4883	b		103f
4884
4885100:
4886
4887	ldr		w12, [sp, #(STACKSIZE + 24)] // n1
4888	cmp		w12, #2
4889	bgt		101f
4890
4891#if MACRO_LEVEL>=2
4892	INNER_KERNEL_GEMM_ADD_NN_4X2_LIB4C
4893#else
4894	bl	inner_kernel_gemm_add_nn_4x2_lib4c
4895#endif
4896
4897	b		103f
4898
4899101:
4900
4901	ldr		w12, [sp, #(STACKSIZE + 24)] // n1
4902	cmp		w12, #3
4903	bgt		102f
4904
4905#if MACRO_LEVEL>=2
4906	INNER_KERNEL_GEMM_ADD_NN_4X3_LIB4C
4907#else
4908	bl	inner_kernel_gemm_add_nn_4x3_lib4c
4909#endif
4910
4911	b		103f
4912
4913102:
4914
4915#if MACRO_LEVEL>=2
4916	INNER_KERNEL_GEMM_ADD_NN_4X4_LIB4C
4917#else
4918	bl	inner_kernel_gemm_add_nn_4x4_lib4c
4919#endif
4920
4921103:
4922
4923
4924
4925	// prefetch
4926	// TODO prefetch vs
4927//	ldr		x8, [sp, #(STACKSIZE + 0)] // D
4928//	ldr		w9, [sp, #(STACKSIZE + 8)] // ldd
4929//	lsl		w9, w9, #3 // 8*ldd
4930
4931#if MACRO_LEVEL>=1
4932//	INNER_PREFETCH_4X4_LIB
4933#else
4934//	bl inner_prefetch_4x4_lib
4935#endif
4936
4937
4938
4939	// call inner blend for generic alpha and beta
4940	mov		x8, x1 // alpha
4941	mov		x9, x5 // beta
4942	mov		x10, x6 // C
4943	mov		w11, w7 // ldc
4944	lsl		w11, w11, #3 // 8*ldc
4945	ldr		w12, [sp, #(STACKSIZE + 16)] // m1
4946	ldr		w13, [sp, #(STACKSIZE + 24)] // n1
4947
4948#if MACRO_LEVEL>=1
4949	INNER_SCALE_AB_4X4_VS_LIB
4950#else
4951	bl inner_scale_ab_4x4_vs_lib
4952#endif
4953
4954
4955
4956	// store n
4957	ldr		x8, [sp, #(STACKSIZE + 0)] // D
4958	ldr		w9, [sp, #(STACKSIZE + 8)] // ldd
4959	lsl		w9, w9, #3 // 8*ldd
4960	ldr		w10, [sp, #(STACKSIZE + 16)] // m1
4961	ldr		w11, [sp, #(STACKSIZE + 24)] // n1
4962
4963#if MACRO_LEVEL>=1
4964	INNER_STORE_4X4_VS_LIB
4965#else
4966	bl inner_store_4x4_vs_lib
4967#endif
4968
4969
4970
4971	EPILOGUE
4972
4973	mov	x0, #0
4974
4975	ret
4976
4977	FUN_END(kernel_dgemm_nn_4x4_vs_lib4ccc)
4978
4979
4980
4981
4982
4983//                                   w0        x1             x2         x3         x4            x5         w6       x7         sp+0
4984// void kernel_dsyrk_nt_l_4x4_lib44cc(int kmax, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd)
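// presumably computes the lower triangle of D <- beta*C + alpha*A*B^T for a 4x4
// block (A, B panel-major; C, D column-major), leaving the strictly upper part of
// D untouched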
4985
4986	.align	4
4987	GLOB_FUN_START(kernel_dsyrk_nt_l_4x4_lib44cc)
4988
4989
4990
4991	PROLOGUE
4992
4993
4994
4995	ZERO_ACC
4996
4997
4998
4999	// call inner kernel gemm nt
5000	mov		w8, w0 // kmax
5001	mov		x9, x2 // A
5002	mov		x10, x3 // B
5003
5004#if MACRO_LEVEL>=2
5005	INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
5006#else
5007	bl	inner_kernel_gemm_add_nt_4x4_lib4
5008#endif
5009
5010
5011
5012	// call inner blend for generic alpha and beta
5013	mov		x8, x1 // alpha
5014	mov		x9, x4 // beta
5015	mov		x10, x5 // C
5016	mov		w11, w6 // ldc
5017	lsl		w11, w11, #3 // 8*ldc
5018
5019#if MACRO_LEVEL>=1
5020	INNER_SCALE_AB_4X4_LIB
5021#else
5022	bl inner_scale_ab_4x4_lib
5023#endif
5024
5025
5026
5027	// store n
5028	mov		x8, x7 // D
5029	ldr		w9, [sp, #(STACKSIZE + 0)] // ldd
5030	lsl		w9, w9, #3 // 8*ldd
5031
5032#if MACRO_LEVEL>=1
5033	INNER_STORE_L_4X4_LIB
5034#else
5035	bl inner_store_l_4x4_lib
5036#endif
5037
5038
5039
5040	EPILOGUE
5041
5042	mov	x0, #0
5043
5044	ret
5045
5046	FUN_END(kernel_dsyrk_nt_l_4x4_lib44cc)
5047
5048
5049
5050
5051
5052//                                      w0        x1             x2         x3         x4            x5         w6       x7         sp+0     sp+8    sp+16
5053// void kernel_dsyrk_nt_l_4x4_vs_lib44cc(int kmax, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1)
5054
5055	.align	4
5056	GLOB_FUN_START(kernel_dsyrk_nt_l_4x4_vs_lib44cc)
5057
5058
5059
5060	PROLOGUE
5061
5062
5063
5064	ZERO_ACC
5065
5066
5067
5068	// call inner kernel gemm nt
5069	mov		w8, w0 // kmax
5070	mov		x9, x2 // A
5071	mov		x10, x3 // B
5072
5073#if MACRO_LEVEL>=2
5074	INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
5075#else
5076	bl	inner_kernel_gemm_add_nt_4x4_lib4
5077#endif
5078
5079
5080
5081	// call inner blend for generic alpha and beta
5082	mov		x8, x1 // alpha
5083	mov		x9, x4 // beta
5084	mov		x10, x5 // C
5085	mov		w11, w6 // ldc
5086	lsl		w11, w11, #3 // 8*ldc
5087	ldr		w12, [sp, #(STACKSIZE + 8)] // m1
5088	ldr		w13, [sp, #(STACKSIZE + 16)] // n1
5089
5090#if MACRO_LEVEL>=1
5091	INNER_SCALE_AB_4X4_VS_LIB
5092#else
5093	bl inner_scale_ab_4x4_vs_lib
5094#endif
5095
5096
5097
5098	// store n
5099	mov		x8, x7 // D
5100	ldr		w9, [sp, #(STACKSIZE + 0)] // ldd
5101	lsl		w9, w9, #3 // 8*ldd
5102	ldr		w10, [sp, #(STACKSIZE + 8)] // m1
5103	ldr		w11, [sp, #(STACKSIZE + 16)] // n1
5104
5105#if MACRO_LEVEL>=1
5106	INNER_STORE_L_4X4_VS_LIB
5107#else
5108	bl inner_store_l_4x4_vs_lib
5109#endif
5110
5111
5112
5113	EPILOGUE
5114
5115	mov	x0, #0
5116
5117	ret
5118
5119	FUN_END(kernel_dsyrk_nt_l_4x4_vs_lib44cc)
5120
5121
5122
5123
5124
5125
5126//                                   w0        x1             x2         x3         x4            x5         w6       x7         sp+0
5127// void kernel_dsyrk_nt_u_4x4_lib44cc(int kmax, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd)
5128
5129	.align	4
5130	GLOB_FUN_START(kernel_dsyrk_nt_u_4x4_lib44cc)
5131
5132
5133
5134	PROLOGUE
5135
5136
5137
5138	ZERO_ACC
5139
5140
5141
5142	// call inner kernel gemm nt
5143	mov		w8, w0 // kmax
5144	mov		x9, x2 // A
5145	mov		x10, x3 // B
5146
5147#if MACRO_LEVEL>=2
5148	INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
5149#else
5150	bl	inner_kernel_gemm_add_nt_4x4_lib4
5151#endif
5152
5153
5154
5155	// call inner blend for generic alpha and beta
5156	mov		x8, x1 // alpha
5157	mov		x9, x4 // beta
5158	mov		x10, x5 // C
5159	mov		w11, w6 // ldc
5160	lsl		w11, w11, #3 // 8*ldc
5161
5162#if MACRO_LEVEL>=1
5163	INNER_SCALE_AB_4X4_LIB
5164#else
5165	bl inner_scale_ab_4x4_lib
5166#endif
5167
5168
5169
5170	// store n
5171	mov		x8, x7 // D
5172	ldr		w9, [sp, #(STACKSIZE + 0)] // ldd
5173	lsl		w9, w9, #3 // 8*ldd
5174
5175#if MACRO_LEVEL>=1
5176	INNER_STORE_U_4X4_LIB
5177#else
5178	bl inner_store_u_4x4_lib
5179#endif
5180
5181
5182
5183	EPILOGUE
5184
5185	mov	x0, #0
5186
5187	ret
5188
5189	FUN_END(kernel_dsyrk_nt_u_4x4_lib44cc)
5190
5191
5192
5193
5194
5195//                                      w0        x1             x2         x3         x4            x5         w6       x7         sp+0     sp+8    sp+16
5196// void kernel_dsyrk_nt_u_4x4_vs_lib44cc(int kmax, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1)

	.align	4
	GLOB_FUN_START(kernel_dsyrk_nt_u_4x4_vs_lib44cc)



	PROLOGUE



	ZERO_ACC



	// call inner kernel gemm nt
	mov		w8, w0 // kmax
	mov		x9, x2 // A
	mov		x10, x3 // B

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
#else
	bl	inner_kernel_gemm_add_nt_4x4_lib4
#endif



	// call inner blend for generic alpha and beta
	mov		x8, x1 // alpha
	mov		x9, x4 // beta
	mov		x10, x5 // C
	mov		w11, w6 // ldc
	lsl		w11, w11, #3 // 8*ldc
	ldr		w12, [sp, #(STACKSIZE + 8)] // m1
	ldr		w13, [sp, #(STACKSIZE + 16)] // n1

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_4X4_VS_LIB
#else
	bl inner_scale_ab_4x4_vs_lib
#endif



	// store u
	mov		x8, x7 // D
	ldr		w9, [sp, #(STACKSIZE + 0)] // ldd
	lsl		w9, w9, #3 // 8*ldd
	ldr		w10, [sp, #(STACKSIZE + 8)] // m1
	ldr		w11, [sp, #(STACKSIZE + 16)] // n1

#if MACRO_LEVEL>=1
	INNER_STORE_U_4X4_VS_LIB
#else
	bl inner_store_u_4x4_vs_lib
#endif



	EPILOGUE

	mov	x0, #0

	ret

	FUN_END(kernel_dsyrk_nt_u_4x4_vs_lib44cc)





//                                         w0        x1         x2         x3         w4       x5         w6       x7         sp+0     sp+8
// void kernel_dtrsm_nt_rl_inv_4x4_lib44ccc(int kmax, double *A, double *B, double *C, int ldc, double *D, int ldd, double *E, int lde, double *inv_diag_E)
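// Reference semantics (informal sketch): triangular solve with the transposed
// lower factor on the right; assuming E is lower triangular, stored
// column-major with leading dimension lde, and inv_diag_E holds the
// reciprocals of its diagonal, the kernel is expected to compute the 4x4 block
//   D = (C - A * B^T) * E^{-T}
// with A, B panel-major and C, D column-major (ldc, ldd).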

	.align	4
	GLOB_FUN_START(kernel_dtrsm_nt_rl_inv_4x4_lib44ccc)



	PROLOGUE



	ZERO_ACC



	// call inner kernel gemm nt
	mov		w8, w0 // kmax
	mov		x9, x1 // A
	mov		x10, x2 // B

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
#else
	bl	inner_kernel_gemm_add_nt_4x4_lib4
#endif



	// call inner blend for alpha=-1.0 and beta=1.0
	mov		x8, x3 // C
	mov		w9, w4 // ldc
	lsl		w9, w9, #3 // 8*ldc

#if MACRO_LEVEL>=1
	INNER_SCALE_M11_4X4_LIB
#else
	bl inner_scale_m11_4x4_lib
#endif



	// solution
	mov		x8, x7 // E
	ldr		w9, [sp, #(STACKSIZE + 0)] // lde
	lsl		w9, w9, #3 // 8*lde
	ldr		x10, [sp, #(STACKSIZE + 8)] // inv_diag_E

#if MACRO_LEVEL>=1
	INNER_EDGE_TRSM_RLT_INV_4X4_LIB
#else
	bl inner_edge_trsm_rlt_inv_4x4_lib
#endif



	// store n
	mov		x8, x5 // D
	mov		w9, w6 // ldd
	lsl		w9, w9, #3 // 8*ldd

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_LIB
#else
	bl inner_store_4x4_lib
#endif



	EPILOGUE

	mov	x0, #0

	ret

	FUN_END(kernel_dtrsm_nt_rl_inv_4x4_lib44ccc)





//                                            w0        x1         x2         x3         w4       x5         w6       x7         sp+0     sp+8                sp+16   sp+24
// void kernel_dtrsm_nt_rl_inv_4x4_vs_lib44ccc(int kmax, double *A, double *B, double *C, int ldc, double *D, int ldd, double *E, int lde, double *inv_diag_E, int m1, int n1)
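// Reference semantics (informal sketch): same solve as
// kernel_dtrsm_nt_rl_inv_4x4_lib44ccc above, with m1/n1 clipping the stored
// block and restricting the solve to the first n1 columns.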

	.align	4
	GLOB_FUN_START(kernel_dtrsm_nt_rl_inv_4x4_vs_lib44ccc)



	PROLOGUE



	ZERO_ACC



	// call inner kernel gemm nt
	mov		w8, w0 // kmax
	mov		x9, x1 // A
	mov		x10, x2 // B

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
#else
	bl	inner_kernel_gemm_add_nt_4x4_lib4
#endif



	// call inner blend for alpha=-1.0 and beta=1.0
	mov		x8, x3 // C
	mov		w9, w4 // ldc
	lsl		w9, w9, #3 // 8*ldc
	ldr		w10, [sp, #(STACKSIZE + 16)] // m1
	ldr		w11, [sp, #(STACKSIZE + 24)] // n1

#if MACRO_LEVEL>=1
	INNER_SCALE_M11_4X4_VS_LIB
#else
	bl inner_scale_m11_4x4_vs_lib
#endif



	// solution
	mov		x8, x7 // E
	ldr		w9, [sp, #(STACKSIZE + 0)] // lde
	lsl		w9, w9, #3 // 8*lde
	ldr		x10, [sp, #(STACKSIZE + 8)] // inv_diag_E
	ldr		w11, [sp, #(STACKSIZE + 24)] // n1

#if MACRO_LEVEL>=1
	INNER_EDGE_TRSM_RLT_INV_4X4_VS_LIB
#else
	bl inner_edge_trsm_rlt_inv_4x4_vs_lib
#endif



	// store n
	mov		x8, x5 // D
	mov		w9, w6 // ldd
	lsl		w9, w9, #3 // 8*ldd
	ldr		w10, [sp, #(STACKSIZE + 16)] // m1
	ldr		w11, [sp, #(STACKSIZE + 24)] // n1

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_VS_LIB
#else
	bl inner_store_4x4_vs_lib
#endif



	EPILOGUE

	mov	x0, #0

	ret

	FUN_END(kernel_dtrsm_nt_rl_inv_4x4_vs_lib44ccc)





//                                         w0        x1         x2         x3            x4         w5       x6         w7       sp+0
// void kernel_dtrsm_nt_rl_one_4x4_lib44cc4(int kmax, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, double *E)
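// Reference semantics (informal sketch): as the _inv variant above, but E is
// assumed unit lower triangular (no inv_diag vector) and stored panel-major,
// and a generic beta scales C:
//   D = (beta * C - A * B^T) * E^{-T}.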

	.align	4
	GLOB_FUN_START(kernel_dtrsm_nt_rl_one_4x4_lib44cc4)



	PROLOGUE



	ZERO_ACC



	// call inner kernel gemm nt
	mov		w8, w0 // kmax
	mov		x9, x1 // A
	mov		x10, x2 // B

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
#else
	bl	inner_kernel_gemm_add_nt_4x4_lib4
#endif



	// call inner blend for alpha=-1.0 and generic beta
	mov		x8, x3 // beta
	mov		x9, x4 // C
	mov		w10, w5 // ldc
	lsl		w10, w10, #3 // 8*ldc

#if MACRO_LEVEL>=1
	INNER_SCALE_M1B_4X4_LIB
#else
	bl inner_scale_m1b_4x4_lib
#endif



	// solution
	ldr		x8, [sp, #(STACKSIZE + 0)] // E

#if MACRO_LEVEL>=1
	INNER_EDGE_TRSM_RLT_ONE_4X4_LIB4
#else
	bl inner_edge_trsm_rlt_one_4x4_lib4
#endif



	// store n
	mov		x8, x6 // D
	mov		w9, w7 // ldd
	lsl		w9, w9, #3 // 8*ldd

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_LIB
#else
	bl inner_store_4x4_lib
#endif



	EPILOGUE

	mov	x0, #0

	ret

	FUN_END(kernel_dtrsm_nt_rl_one_4x4_lib44cc4)





//                                            w0        x1         x2         x3            x4         w5       x6         w7       sp+0       sp+8    sp+16
// void kernel_dtrsm_nt_rl_one_4x4_vs_lib44cc4(int kmax, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, double *E, int m1, int n1)
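// Reference semantics (informal sketch): same operation as
// kernel_dtrsm_nt_rl_one_4x4_lib44cc4 above, clipped to the leading m1 x n1
// block via the trailing m1/n1 arguments.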

	.align	4
	GLOB_FUN_START(kernel_dtrsm_nt_rl_one_4x4_vs_lib44cc4)



	PROLOGUE



	ZERO_ACC



	// call inner kernel gemm nt
	mov		w8, w0 // kmax
	mov		x9, x1 // A
	mov		x10, x2 // B

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
#else
	bl	inner_kernel_gemm_add_nt_4x4_lib4
#endif



	// call inner blend for alpha=-1.0 and generic beta
	mov		x8, x3 // beta
	mov		x9, x4 // C
	mov		w10, w5 // ldc
	lsl		w10, w10, #3 // 8*ldc
	ldr		w11, [sp, #(STACKSIZE + 8)] // m1
	ldr		w12, [sp, #(STACKSIZE + 16)] // n1

#if MACRO_LEVEL>=1
	INNER_SCALE_M1B_4X4_VS_LIB
#else
	bl inner_scale_m1b_4x4_vs_lib
#endif



	// solution
	ldr		x8, [sp, #(STACKSIZE + 0)] // E
	ldr		w9, [sp, #(STACKSIZE + 16)] // n1

#if MACRO_LEVEL>=1
	INNER_EDGE_TRSM_RLT_ONE_4X4_VS_LIB4
#else
	bl inner_edge_trsm_rlt_one_4x4_vs_lib4
#endif



	// store n
	mov		x8, x6 // D
	mov		w9, w7 // ldd
	lsl		w9, w9, #3 // 8*ldd
	ldr		w10, [sp, #(STACKSIZE + 8)] // m1
	ldr		w11, [sp, #(STACKSIZE + 16)] // n1

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_VS_LIB
#else
	bl inner_store_4x4_vs_lib
#endif



	EPILOGUE

	mov	x0, #0

	ret

	FUN_END(kernel_dtrsm_nt_rl_one_4x4_vs_lib44cc4)





//                                    w0        x1         x2         x3         w4       x5         w6       x7
// void kernel_dpotrf_nt_l_4x4_lib44cc(int kmax, double *A, double *B, double *C, int ldc, double *D, int ldd, double *inv_diag_D)
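// Reference semantics (informal sketch): the kernel is expected to form the
// 4x4 block T = C - A * B^T, factorize it as T = L * L^T (lower Cholesky),
// store L into D (lower part only, column-major with leading dimension ldd)
// and write the reciprocals of the diagonal of L into inv_diag_D.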

	.align	4
	GLOB_FUN_START(kernel_dpotrf_nt_l_4x4_lib44cc)



	PROLOGUE



	ZERO_ACC



	// call inner kernel gemm nt
	mov		w8, w0 // kmax
	mov		x9, x1 // A
	mov		x10, x2 // B

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
#else
	bl	inner_kernel_gemm_add_nt_4x4_lib4
#endif



	// call inner blend for alpha=-1.0 and beta=1.0
	mov		x8, x3 // C
	mov		w9, w4 // ldc
	lsl		w9, w9, #3 // 8*ldc

#if MACRO_LEVEL>=1
	INNER_SCALE_M11_4X4_LIB
#else
	bl inner_scale_m11_4x4_lib
#endif



	// factorization
	mov		x8, x7 // inv_diag_D

#if MACRO_LEVEL>=1
	INNER_EDGE_POTRF_4X4_LIB4
#else
	bl inner_edge_potrf_4x4_lib4
#endif



	// store l
	mov		x8, x5 // D
	mov		w9, w6 // ldd
	lsl		w9, w9, #3 // 8*ldd

#if MACRO_LEVEL>=1
	INNER_STORE_L_4X4_LIB
#else
	bl inner_store_l_4x4_lib
#endif



	EPILOGUE

	mov	x0, #0

	ret

	FUN_END(kernel_dpotrf_nt_l_4x4_lib44cc)





//                                       w0        x1         x2         x3         w4       x5         w6       x7                  sp+0    sp+8
// void kernel_dpotrf_nt_l_4x4_vs_lib44cc(int kmax, double *A, double *B, double *C, int ldc, double *D, int ldd, double *inv_diag_D, int m1, int n1)
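// Reference semantics (informal sketch): same factorization as
// kernel_dpotrf_nt_l_4x4_lib44cc above, with m1/n1 clipping the scaled and
// stored block for edge cases.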

	.align	4
	GLOB_FUN_START(kernel_dpotrf_nt_l_4x4_vs_lib44cc)



	PROLOGUE



	ZERO_ACC



	// call inner kernel gemm nt
	mov		w8, w0 // kmax
	mov		x9, x1 // A
	mov		x10, x2 // B

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
#else
	bl	inner_kernel_gemm_add_nt_4x4_lib4
#endif



	// call inner blend for alpha=-1.0 and beta=1.0
	mov		x8, x3 // C
	mov		w9, w4 // ldc
	lsl		w9, w9, #3 // 8*ldc
	ldr		w10, [sp, #(STACKSIZE + 0)] // m1
	ldr		w11, [sp, #(STACKSIZE + 8)] // n1

#if MACRO_LEVEL>=1
	INNER_SCALE_M11_4X4_VS_LIB
#else
	bl inner_scale_m11_4x4_vs_lib
#endif



	// factorization
	mov		x8, x7 // inv_diag_D
	ldr		w9, [sp, #(STACKSIZE + 8)] // n1

#if MACRO_LEVEL>=1
	INNER_EDGE_POTRF_4X4_VS_LIB4
#else
	bl inner_edge_potrf_4x4_vs_lib4
#endif



	// store l
	mov		x8, x5 // D
	mov		w9, w6 // ldd
	lsl		w9, w9, #3 // 8*ldd
	ldr		w10, [sp, #(STACKSIZE + 0)] // m1
	ldr		w11, [sp, #(STACKSIZE + 8)] // n1

#if MACRO_LEVEL>=1
	INNER_STORE_L_4X4_VS_LIB
#else
	bl inner_store_l_4x4_vs_lib
#endif



	EPILOGUE

	mov	x0, #0

	ret

	FUN_END(kernel_dpotrf_nt_l_4x4_vs_lib44cc)





//                                         w0        x1         x2         w3       x4            x5         w6       x7         sp+0     sp+8       sp+16
// void kernel_dtrsm_nn_ll_one_4x4_lib4cccc(int kmax, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde)
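// Reference semantics (informal sketch): triangular solve with a unit
// lower-triangular factor on the left; assuming A is panel-major and B, C, D,
// E column-major (ldb, ldc, ldd, lde) with E treated as unit lower triangular,
// the kernel is expected to compute
//   D = E^{-1} * (beta * C - A * B).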

	.align	4
	GLOB_FUN_START(kernel_dtrsm_nn_ll_one_4x4_lib4cccc)



	PROLOGUE



	ZERO_ACC



	// call inner kernel gemm nn
	mov		w8, w0 // kmax
	mov		x9, x1 // A
	mov		x10, x2 // B
	mov		w11, w3 // ldb
	lsl		w11, w11, #3 // 8*ldb

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NN_4X4_LIB4C
#else
	bl	inner_kernel_gemm_add_nn_4x4_lib4c
#endif



	// call inner blend for alpha=-1.0 and generic beta
	mov		x8, x4 // beta
	mov		x9, x5 // C
	mov		w10, w6 // ldc
	lsl		w10, w10, #3 // 8*ldc

#if MACRO_LEVEL>=1
	INNER_SCALE_M1B_4X4_LIB
#else
	bl inner_scale_m1b_4x4_lib
#endif



	// solution
	ldr		x8, [sp, #(STACKSIZE + 8)] // E
	ldr		w9, [sp, #(STACKSIZE + 16)] // lde
	lsl		w9, w9, #3 // 8*lde

#if MACRO_LEVEL>=1
	INNER_EDGE_TRSM_LLN_ONE_4X4_LIB
#else
	bl inner_edge_trsm_lln_one_4x4_lib
#endif



	// store n
	mov		x8, x7 // D
	ldr		w9, [sp, #(STACKSIZE + 0)] // ldd
	lsl		w9, w9, #3 // 8*ldd

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_LIB
#else
	bl inner_store_4x4_lib
#endif



	EPILOGUE

	mov	x0, #0

	ret

	FUN_END(kernel_dtrsm_nn_ll_one_4x4_lib4cccc)






//                                            w0        x1         x2         w3       x4            x5         w6       x7         sp+0     sp+8       sp+16    sp+24   sp+32
// void kernel_dtrsm_nn_ll_one_4x4_vs_lib4cccc(int kmax, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, int m1, int n1)
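// Reference semantics (informal sketch): same solve as
// kernel_dtrsm_nn_ll_one_4x4_lib4cccc above; n1 additionally selects a
// narrower 4x1/4x2/4x3 inner gemm kernel, and m1/n1 clip the scaled and
// stored block.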

	.align	4
	GLOB_FUN_START(kernel_dtrsm_nn_ll_one_4x4_vs_lib4cccc)



	PROLOGUE



	ZERO_ACC



	// call inner kernel gemm nn
	mov		w8, w0 // kmax
	mov		x9, x1 // A
	mov		x10, x2 // B
	mov		w11, w3 // ldb
	lsl		w11, w11, #3 // 8*ldb

	ldr		w12, [sp, #(STACKSIZE + 32)] // n1
	cmp		w12, #1
	bgt		100f

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NN_4X1_LIB4C
#else
	bl	inner_kernel_gemm_add_nn_4x1_lib4c
#endif

	b		103f

100:

	ldr		w12, [sp, #(STACKSIZE + 32)] // n1
	cmp		w12, #2
	bgt		101f

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NN_4X2_LIB4C
#else
	bl	inner_kernel_gemm_add_nn_4x2_lib4c
#endif

	b		103f

101:

	ldr		w12, [sp, #(STACKSIZE + 32)] // n1
	cmp		w12, #3
	bgt		102f

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NN_4X3_LIB4C
#else
	bl	inner_kernel_gemm_add_nn_4x3_lib4c
#endif

	b		103f

102:

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NN_4X4_LIB4C
#else
	bl	inner_kernel_gemm_add_nn_4x4_lib4c
#endif

103:



	// call inner blend for alpha=-1.0 and generic beta
	mov		x8, x4 // beta
	mov		x9, x5 // C
	mov		w10, w6 // ldc
	lsl		w10, w10, #3 // 8*ldc
	ldr		w11, [sp, #(STACKSIZE + 24)] // m1
	ldr		w12, [sp, #(STACKSIZE + 32)] // n1

#if MACRO_LEVEL>=1
	INNER_SCALE_M1B_4X4_VS_LIB
#else
	bl inner_scale_m1b_4x4_vs_lib
#endif



	// solution
	ldr		x8, [sp, #(STACKSIZE + 8)] // E
	ldr		w9, [sp, #(STACKSIZE + 16)] // lde
	lsl		w9, w9, #3 // 8*lde

#if MACRO_LEVEL>=1
	INNER_EDGE_TRSM_LLN_ONE_4X4_LIB
#else
	bl inner_edge_trsm_lln_one_4x4_lib
#endif



	// store n
	mov		x8, x7 // D
	ldr		w9, [sp, #(STACKSIZE + 0)] // ldd
	lsl		w9, w9, #3 // 8*ldd
	ldr		w10, [sp, #(STACKSIZE + 24)] // m1
	ldr		w11, [sp, #(STACKSIZE + 32)] // n1

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_VS_LIB
#else
	bl inner_store_4x4_vs_lib
#endif



	EPILOGUE

	mov	x0, #0

	ret

	FUN_END(kernel_dtrsm_nn_ll_one_4x4_vs_lib4cccc)





//                                 w0        x1             x2         w3         x4       x5            x6         w7       sp+0       sp+8
// void kernel_dgemm_nt_4x4_libc4cc(int kmax, double *alpha, double *A, int lda, double *B, double *beta, double *C, int ldc, double *D, int ldd)
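// Reference semantics (informal sketch): general matrix multiply with A
// column-major (lda) and B panel-major; the inner nt kernel accumulates
// B * A^T, the accumulator is then transposed in registers (inner_tran), so
// the stored column-major block is expected to be
//   D = alpha * A * B^T + beta * C.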

	.align	4
	GLOB_FUN_START(kernel_dgemm_nt_4x4_libc4cc)



	PROLOGUE



	ZERO_ACC



	// call inner kernel gemm nt
	mov		w8, w0 // kmax
	mov		x9, x4 // B
	mov		x10, x2 // A
	mov		w11, w3 // lda
	lsl		w11, w11, #3 // 8*lda

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4C
#else
	bl	inner_kernel_gemm_add_nt_4x4_lib4c
#endif



	// call inner blend for generic alpha and beta
	mov		x8, x1 // alpha
	mov		x9, x5 // beta
	mov		x10, x6 // C
	mov		w11, w7 // ldc
	lsl		w11, w11, #3 // 8*ldc

#if MACRO_LEVEL>=1
	INNER_TRAN_4X4_LIB
#else
	bl inner_tran_4x4_lib
#endif


#if MACRO_LEVEL>=1
	INNER_SCALE_AB_4X4_LIB
#else
	bl inner_scale_ab_4x4_lib
#endif



	// store n
	ldr		x8, [sp, #(STACKSIZE + 0)] // D
	ldr		w9, [sp, #(STACKSIZE + 8)] // ldd
	lsl		w9, w9, #3 // 8*ldd

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_LIB
#else
	bl inner_store_4x4_lib
#endif



	EPILOGUE

	mov	x0, #0

	ret

	FUN_END(kernel_dgemm_nt_4x4_libc4cc)





//                                    w0        x1             x2         w3         x4       x5            x6         w7       sp+0       sp+8     sp+16   sp+24
// void kernel_dgemm_nt_4x4_vs_libc4cc(int kmax, double *alpha, double *A, int lda, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1)
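// Reference semantics (informal sketch): same operation as
// kernel_dgemm_nt_4x4_libc4cc above, with n1 selecting a narrower 4x1/4x2/4x3
// inner kernel and m1/n1 clipping the stored block.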

	.align	4
	GLOB_FUN_START(kernel_dgemm_nt_4x4_vs_libc4cc)



	PROLOGUE



	ZERO_ACC



	// call inner kernel gemm nt
	mov		w8, w0 // kmax
	mov		x9, x4 // B
	mov		x10, x2 // A
	mov		w11, w3 // lda
	lsl		w11, w11, #3 // 8*lda

	ldr		w12, [sp, #(STACKSIZE + 24)] // n1
	cmp		w12, #1
	bgt		100f

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NT_4X1_LIB4C
#else
	bl	inner_kernel_gemm_add_nt_4x1_lib4c
#endif

	b		103f

100:

	ldr		w12, [sp, #(STACKSIZE + 24)] // n1
	cmp		w12, #2
	bgt		101f

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NT_4X2_LIB4C
#else
	bl	inner_kernel_gemm_add_nt_4x2_lib4c
#endif

	b		103f

101:

	ldr		w12, [sp, #(STACKSIZE + 24)] // n1
	cmp		w12, #3
	bgt		102f

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NT_4X3_LIB4C
#else
	bl	inner_kernel_gemm_add_nt_4x3_lib4c
#endif

	b		103f

102:

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4C
#else
	bl	inner_kernel_gemm_add_nt_4x4_lib4c
#endif

103:



	// call inner blend for generic alpha and beta
	mov		x8, x1 // alpha
	mov		x9, x5 // beta
	mov		x10, x6 // C
	mov		w11, w7 // ldc
	lsl		w11, w11, #3 // 8*ldc
	ldr		w12, [sp, #(STACKSIZE + 16)] // m1
	ldr		w13, [sp, #(STACKSIZE + 24)] // n1

#if MACRO_LEVEL>=1
	INNER_TRAN_4X4_LIB
#else
	bl inner_tran_4x4_lib
#endif


#if MACRO_LEVEL>=1
	INNER_SCALE_AB_4X4_VS_LIB
#else
	bl inner_scale_ab_4x4_vs_lib
#endif



	// store n
	ldr		x8, [sp, #(STACKSIZE + 0)] // D
	ldr		w9, [sp, #(STACKSIZE + 8)] // ldd
	lsl		w9, w9, #3 // 8*ldd
	ldr		w10, [sp, #(STACKSIZE + 16)] // m1
	ldr		w11, [sp, #(STACKSIZE + 24)] // n1

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_VS_LIB
#else
	bl inner_store_4x4_vs_lib
#endif



	EPILOGUE

	mov	x0, #0

	ret

	FUN_END(kernel_dgemm_nt_4x4_vs_libc4cc)





//                                 w0        x1             x2         w3         x4       x5            x6         w7       sp+0       sp+8
// void kernel_dgemm_tt_4x4_libc4cc(int kmax, double *alpha, double *A, int lda, double *B, double *beta, double *C, int ldc, double *D, int ldd)
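// Reference semantics (informal sketch): both operands transposed; with A
// column-major (lda) and B panel-major, the inner nn kernel accumulates B * A
// and the in-register transpose yields the stored column-major block
//   D = alpha * A^T * B^T + beta * C.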

	.align	4
	GLOB_FUN_START(kernel_dgemm_tt_4x4_libc4cc)



	PROLOGUE



	ZERO_ACC



	// call inner kernel gemm nn
	mov		w8, w0 // kmax
	mov		x9, x4 // B
	mov		x10, x2 // A
	mov		w11, w3 // lda
	lsl		w11, w11, #3 // 8*lda

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NN_4X4_LIB4C
#else
	bl	inner_kernel_gemm_add_nn_4x4_lib4c
#endif



	// call inner blend for generic alpha and beta
	mov		x8, x1 // alpha
	mov		x9, x5 // beta
	mov		x10, x6 // C
	mov		w11, w7 // ldc
	lsl		w11, w11, #3 // 8*ldc

#if MACRO_LEVEL>=1
	INNER_TRAN_4X4_LIB
#else
	bl inner_tran_4x4_lib
#endif


#if MACRO_LEVEL>=1
	INNER_SCALE_AB_4X4_LIB
#else
	bl inner_scale_ab_4x4_lib
#endif



	// store n
	ldr		x8, [sp, #(STACKSIZE + 0)] // D
	ldr		w9, [sp, #(STACKSIZE + 8)] // ldd
	lsl		w9, w9, #3 // 8*ldd

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_LIB
#else
	bl inner_store_4x4_lib
#endif



	EPILOGUE

	mov	x0, #0

	ret

	FUN_END(kernel_dgemm_tt_4x4_libc4cc)





//                                    w0        x1             x2         w3         x4       x5            x6         w7       sp+0       sp+8     sp+16   sp+24
// void kernel_dgemm_tt_4x4_vs_libc4cc(int kmax, double *alpha, double *A, int lda, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1)
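// Reference semantics (informal sketch): same operation as
// kernel_dgemm_tt_4x4_libc4cc above, with n1 selecting a narrower inner kernel
// and m1/n1 clipping the stored block.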

	.align	4
	GLOB_FUN_START(kernel_dgemm_tt_4x4_vs_libc4cc)



	PROLOGUE



	ZERO_ACC



	// call inner kernel gemm nn
	mov		w8, w0 // kmax
	mov		x9, x4 // B
	mov		x10, x2 // A
	mov		w11, w3 // lda
	lsl		w11, w11, #3 // 8*lda

	ldr		w12, [sp, #(STACKSIZE + 24)] // n1
	cmp		w12, #1
	bgt		100f

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NN_4X1_LIB4C
#else
	bl	inner_kernel_gemm_add_nn_4x1_lib4c
#endif

	b		103f

100:

	ldr		w12, [sp, #(STACKSIZE + 24)] // n1
	cmp		w12, #2
	bgt		101f

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NN_4X2_LIB4C
#else
	bl	inner_kernel_gemm_add_nn_4x2_lib4c
#endif

	b		103f

101:

	ldr		w12, [sp, #(STACKSIZE + 24)] // n1
	cmp		w12, #3
	bgt		102f

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NN_4X3_LIB4C
#else
	bl	inner_kernel_gemm_add_nn_4x3_lib4c
#endif

	b		103f

102:

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NN_4X4_LIB4C
#else
	bl	inner_kernel_gemm_add_nn_4x4_lib4c
#endif

103:



	// call inner blend for generic alpha and beta
	mov		x8, x1 // alpha
	mov		x9, x5 // beta
	mov		x10, x6 // C
	mov		w11, w7 // ldc
	lsl		w11, w11, #3 // 8*ldc
	ldr		w12, [sp, #(STACKSIZE + 16)] // m1
	ldr		w13, [sp, #(STACKSIZE + 24)] // n1

#if MACRO_LEVEL>=1
	INNER_TRAN_4X4_LIB
#else
	bl inner_tran_4x4_lib
#endif


#if MACRO_LEVEL>=1
	INNER_SCALE_AB_4X4_VS_LIB
#else
	bl inner_scale_ab_4x4_vs_lib
#endif



	// store n
	ldr		x8, [sp, #(STACKSIZE + 0)] // D
	ldr		w9, [sp, #(STACKSIZE + 8)] // ldd
	lsl		w9, w9, #3 // 8*ldd
	ldr		w10, [sp, #(STACKSIZE + 16)] // m1
	ldr		w11, [sp, #(STACKSIZE + 24)] // n1

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_VS_LIB
#else
	bl inner_store_4x4_vs_lib
#endif



	EPILOGUE

	mov	x0, #0

	ret

	FUN_END(kernel_dgemm_tt_4x4_vs_libc4cc)


