1/*******************************************************************************
2Copyright (c) 2015, The OpenBLAS Project
3All rights reserved.
4Redistribution and use in source and binary forms, with or without
5modification, are permitted provided that the following conditions are
6met:
71. Redistributions of source code must retain the above copyright
8notice, this list of conditions and the following disclaimer.
92. Redistributions in binary form must reproduce the above copyright
10notice, this list of conditions and the following disclaimer in
11the documentation and/or other materials provided with the
12distribution.
133. Neither the name of the OpenBLAS project nor the names of
14its contributors may be used to endorse or promote products
15derived from this software without specific prior written permission.
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*******************************************************************************/
27
28#define ASSEMBLER
29#include "common.h"
30
31/*                   X0          X1          X2          s0         X3        x4       x5           x6 */
32/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/
33
/* GP register aliases: kernel arguments (x0-x6) and loop state (x7-x15).
   See the register-usage map below. */
#define origM		x0	// bm: rows of C handled by this call
#define origN		x1	// bn: columns of C
#define origK		x2	// bk: shared (K) dimension
#define origPA		x3	// packed A panel
#define origPB		x4	// packed B panel
#define pC		x5	// C base pointer
#define LDC		x6	// leading dimension of C (scaled to bytes in the prologue)
#define temp		x7	// scratch (pB advance amounts)
#define counterL	x8	// K-loop counter
#define counterI	x9	// M-loop counter
#define counterJ	x10	// N-loop counter
#define pB		x11	// running pointer into packed B
#define pCRow0		x12	// current C tile pointer; pCRow1/pCRow2 walk
#define pCRow1		x13	//   successive C slices, each LDC bytes apart
#define pCRow2		x14
#define pA		x15	// running pointer into packed A

/* Four scalar copies of the same alpha (all loaded from d0 in the
   prologue) so the SAVE macros can source alpha from different
   registers. alphaVn is the vector-lane view of the same value. */
#define alpha0		d2
#define alphaV0		v2.d[0]
#define alpha1		d3
#define alphaV1		v3.d[0]
#define alpha2		d6
#define alphaV2		v6.d[0]
#define alpha3		d7
#define alphaV3		v7.d[0]
59
60// 00 origM
61// 01 origN
62// 02 origK
63// 03 origPA
64// 04 origPB
65// 05 pC
66// 06 origLDC -> LDC
67// 07 temp
68// 08 counterL
69// 09 counterI
70// 10 counterJ
71// 11 pB
72// 12 pCRow0
73// 13 pCRow1
74// 14 pCRow2
75// 15 pA
76// 16
77// 17
78// 18 must save
79// 19 must save
80// 20 must save
81// 21 must save
82// 22 must save
83// 23 must save
84// 24 must save
85// 25 must save
86// 26 must save
87// 27 must save
88// 28 must save
89// 29 frame
90// 30 link
91// 31 sp
92
93//v00 ALPHA -> pA00, pA01
94//v01 pA02, pA03
95//v02 ALPHA0
96//v03 ALPHA1
97//v04 pA10, pA11
98//v05 pA12, pA13
99//v06 ALPHA2
100//v07 ALPHA3
101//v08 must save pB0_0, pB0_1
102//v09 must save pB0_2, pB0_3
103//v10 must save pB0_4, pB0_5
104//v11 must save pB0_6, pB0_7
105//v12 must save pB1_0, pB1_1
106//v13 must save pB1_2, pB1_3
107//v14 must save pB1_4, pB1_5
108//v15 must save pB1_6, pB1_7
109//v16 must save C00, C01
110//v17 must save C02, C03
111//v18 C04, C05
112//v19 C06, C07
113//v20 C10, C11
114//v21 C12, C13
115//v22 C14, C15
116//v23 C16, C17
117//v24 C20, C21
118//v25 C22, C23
119//v26 C24, C25
120//v27 C26, C27
121//v28 C30, C31
122//v29 C32, C33
123//v30 C34, C35
124//v31 C36, C37
125
126/*******************************************************************************
127* Macro definitions
128*******************************************************************************/
129
/* INIT4x8: zero the sixteen 4x8-tile accumulators v16-v31.
   `fmov dN, xzr` clears the full 128-bit register; some registers are
   cleared by copying the already-zeroed d16-d18 instead. */
.macro INIT4x8
	fmov		d16, xzr
	fmov		d17, xzr
	fmov		d18, xzr
	fmov		d19, d16
	fmov		d20, xzr
	fmov		d21, d16
	fmov		d22, d17
	fmov		d23, d18
	fmov		d24, xzr
	fmov		d25, d16
	fmov		d26, d17
	fmov		d27, d18
	fmov		d28, xzr
	fmov		d29, d16
	fmov		d30, d17
	fmov		d31, d18
.endm

/* KERNEL4x8_I: first step of the software-pipelined 4x8 K-loop.
   Loads 4 doubles of A (v0,v1) and 8 doubles of B (v8-v11), initializes
   the accumulators with fmul (no accumulate needed on the first step),
   then preloads the next step's operands into v4,v5 / v12-v15. */
.macro KERNEL4x8_I
	ld1	{v8.2d, v9.2d}, [pB]
	add	pB, pB, #32
	ld1	{v0.2d, v1.2d}, [pA]
	add	pA, pA, #32
	ld1	{v10.2d, v11.2d}, [pB]
	add	pB, pB, #32

	fmul	v16.2d, v0.2d, v8.d[0]
	fmul	v17.2d, v1.2d, v8.d[0]
	fmul	v18.2d, v0.2d, v8.d[1]
	fmul	v19.2d, v1.2d, v8.d[1]

	fmul	v20.2d, v0.2d, v9.d[0]
	fmul	v21.2d, v1.2d, v9.d[0]
	fmul	v22.2d, v0.2d, v9.d[1]
	fmul	v23.2d, v1.2d, v9.d[1]

	fmul	v24.2d, v0.2d, v10.d[0]
	fmul	v25.2d, v1.2d, v10.d[0]
	fmul	v26.2d, v0.2d, v10.d[1]
	fmul	v27.2d, v1.2d, v10.d[1]

	fmul	v28.2d, v0.2d, v11.d[0]
	fmul	v29.2d, v1.2d, v11.d[0]
	fmul	v30.2d, v0.2d, v11.d[1]
	fmul	v31.2d, v1.2d, v11.d[1]

	ld1	{v12.2d, v13.2d}, [pB]
	add	pB, pB, #32
	ld1	{v4.2d, v5.2d}, [pA]
	add	pA, pA, #32
	ld1	{v14.2d, v15.2d}, [pB]
	add	pB, pB, #32
.endm

/* KERNEL4x8_M1: pipelined step consuming v0,v1 / v8-v11 while loading
   the NEXT step's operands into v4,v5 / v12-v15. Alternates with
   KERNEL4x8_M2. Prefetches ahead in the (already-advanced) A panel. */
.macro KERNEL4x8_M1
	fmla	v16.2d, v0.2d, v8.d[0]
	fmla	v17.2d, v1.2d, v8.d[0]
	fmla	v18.2d, v0.2d, v8.d[1]
	fmla	v19.2d, v1.2d, v8.d[1]

	fmla	v20.2d, v0.2d, v9.d[0]
	fmla	v21.2d, v1.2d, v9.d[0]
	fmla	v22.2d, v0.2d, v9.d[1]
	fmla	v23.2d, v1.2d, v9.d[1]

	fmla	v24.2d, v0.2d, v10.d[0]
	fmla	v25.2d, v1.2d, v10.d[0]
	fmla	v26.2d, v0.2d, v10.d[1]
	fmla	v27.2d, v1.2d, v10.d[1]

	fmla	v28.2d, v0.2d, v11.d[0]
	fmla	v29.2d, v1.2d, v11.d[0]
	fmla	v30.2d, v0.2d, v11.d[1]
	fmla	v31.2d, v1.2d, v11.d[1]

	ld1	{v12.2d, v13.2d}, [pB]		// For next round
	add	pB, pB, #32
	ld1	{v4.2d, v5.2d}, [pA]		// For next round
	add	pA, pA, #32
	ld1	{v14.2d, v15.2d}, [pB]
	add	pB, pB, #32

	prfm	PLDL1KEEP, [pA, #512]
.endm

/* KERNEL4x8_M2: mirror of M1 — consumes v4,v5 / v12-v15 while loading
   the next step's operands into v0,v1 / v8-v11; prefetches the B panel. */
.macro KERNEL4x8_M2
	fmla	v16.2d, v4.2d, v12.d[0]
	fmla	v17.2d, v5.2d, v12.d[0]
	fmla	v18.2d, v4.2d, v12.d[1]
	fmla	v19.2d, v5.2d, v12.d[1]

	fmla	v20.2d, v4.2d, v13.d[0]
	fmla	v21.2d, v5.2d, v13.d[0]
	fmla	v22.2d, v4.2d, v13.d[1]
	fmla	v23.2d, v5.2d, v13.d[1]

	fmla	v24.2d, v4.2d, v14.d[0]
	fmla	v25.2d, v5.2d, v14.d[0]
	fmla	v26.2d, v4.2d, v14.d[1]
	fmla	v27.2d, v5.2d, v14.d[1]

	fmla	v28.2d, v4.2d, v15.d[0]
	fmla	v29.2d, v5.2d, v15.d[0]
	fmla	v30.2d, v4.2d, v15.d[1]
	fmla	v31.2d, v5.2d, v15.d[1]

	ld1	{v8.2d, v9.2d}, [pB]		// For next round
	add	pB, pB, #32
	ld1	{v0.2d, v1.2d}, [pA]		// For next round
	add	pA, pA, #32
	ld1	{v10.2d, v11.2d}, [pB]
	add	pB, pB, #32

	prfm	PLDL1KEEP, [pB, #512]
.endm

/* KERNEL4x8_E: pipeline drain — final accumulation from the operands
   preloaded into v4,v5 / v12-v15, with no further loads. */
.macro KERNEL4x8_E
	fmla	v16.2d, v4.2d, v12.d[0]
	fmla	v17.2d, v5.2d, v12.d[0]
	fmla	v18.2d, v4.2d, v12.d[1]
	fmla	v19.2d, v5.2d, v12.d[1]

	fmla	v20.2d, v4.2d, v13.d[0]
	fmla	v21.2d, v5.2d, v13.d[0]
	fmla	v22.2d, v4.2d, v13.d[1]
	fmla	v23.2d, v5.2d, v13.d[1]

	fmla	v24.2d, v4.2d, v14.d[0]
	fmla	v25.2d, v5.2d, v14.d[0]
	fmla	v26.2d, v4.2d, v14.d[1]
	fmla	v27.2d, v5.2d, v14.d[1]

	fmla	v28.2d, v4.2d, v15.d[0]
	fmla	v29.2d, v5.2d, v15.d[0]
	fmla	v30.2d, v4.2d, v15.d[1]
	fmla	v31.2d, v5.2d, v15.d[1]
.endm

/* KERNEL4x8_SUB: standalone K-step (used for the K%2 remainder):
   load 4 doubles of A and 8 doubles of B, then accumulate into v16-v31. */
.macro KERNEL4x8_SUB
	ld1	{v8.2d, v9.2d}, [pB]		// For next round
	add	pB, pB, #32
	ld1	{v0.2d, v1.2d}, [pA]		// For next round
	add	pA, pA, #32
	ld1	{v10.2d, v11.2d}, [pB]
	add	pB, pB, #32

	fmla	v16.2d, v0.2d, v8.d[0]
	fmla	v17.2d, v1.2d, v8.d[0]
	fmla	v18.2d, v0.2d, v8.d[1]
	fmla	v19.2d, v1.2d, v8.d[1]

	fmla	v20.2d, v0.2d, v9.d[0]
	fmla	v21.2d, v1.2d, v9.d[0]
	fmla	v22.2d, v0.2d, v9.d[1]
	fmla	v23.2d, v1.2d, v9.d[1]

	fmla	v24.2d, v0.2d, v10.d[0]
	fmla	v25.2d, v1.2d, v10.d[0]
	fmla	v26.2d, v0.2d, v10.d[1]
	fmla	v27.2d, v1.2d, v10.d[1]

	fmla	v28.2d, v0.2d, v11.d[0]
	fmla	v29.2d, v1.2d, v11.d[0]
	fmla	v30.2d, v0.2d, v11.d[1]
	fmla	v31.2d, v1.2d, v11.d[1]
.endm

/* SAVE4x8: merge the 4x8 tile into C: for each of the eight C slices
   (spaced LDC bytes apart) do C[0:4] += alpha * acc. The load/fmla/store
   for one slice is interleaved with computing the next slice's pointer;
   pCRow1/pCRow2 leapfrog down the slices. Finally pCRow0 advances
   32 bytes (4 doubles) to the next tile of the current slice group. */
.macro SAVE4x8
	add	pCRow1, pCRow0, LDC

	ld1	{v8.2d, v9.2d}, [pCRow0]
	fmla	v8.2d, v16.2d, alphaV0
	fmla	v9.2d, v17.2d, alphaV1
	st1 	{v8.2d, v9.2d}, [pCRow0]

	add	pCRow2, pCRow1, LDC

	ld1	{v10.2d, v11.2d}, [pCRow1]
	fmla	v10.2d, v18.2d, alphaV2
	fmla	v11.2d, v19.2d, alphaV3
	st1 	{v10.2d, v11.2d}, [pCRow1]

	add	pCRow1, pCRow2, LDC

	ld1	{v12.2d, v13.2d}, [pCRow2]
	fmla	v12.2d, v20.2d, alphaV0
	fmla	v13.2d, v21.2d, alphaV1
	st1 	{v12.2d, v13.2d}, [pCRow2]

	add	pCRow2, pCRow1, LDC

	ld1	{v14.2d, v15.2d}, [pCRow1]
	fmla	v14.2d, v22.2d, alphaV2
	fmla	v15.2d, v23.2d, alphaV3
	st1 	{v14.2d, v15.2d}, [pCRow1]

	add	pCRow1, pCRow2, LDC

	ld1	{v8.2d, v9.2d}, [pCRow2]
	fmla	v8.2d, v24.2d, alphaV0
	fmla	v9.2d, v25.2d, alphaV1
	st1 	{v8.2d, v9.2d}, [pCRow2]

	add	pCRow2, pCRow1, LDC

	ld1	{v10.2d, v11.2d}, [pCRow1]
	fmla	v10.2d, v26.2d, alphaV2
	fmla	v11.2d, v27.2d, alphaV3
	st1 	{v10.2d, v11.2d}, [pCRow1]

	add	pCRow1, pCRow2, LDC

	ld1	{v12.2d, v13.2d}, [pCRow2]
	fmla	v12.2d, v28.2d, alphaV0
	fmla	v13.2d, v29.2d, alphaV1
	st1 	{v12.2d, v13.2d}, [pCRow2]

	ld1	{v14.2d, v15.2d}, [pCRow1]
	fmla	v14.2d, v30.2d, alphaV2
	fmla	v15.2d, v31.2d, alphaV3
	st1 	{v14.2d, v15.2d}, [pCRow1]

	add	pCRow0, pCRow0, #32
.endm
355
356/******************************************************************************/
357
/* INIT2x8: zero the eight 2x8-tile accumulators v16,v18,...,v30
   (even-numbered registers only, matching KERNEL2x8_SUB). */
.macro INIT2x8
	fmov	d16, xzr
	fmov	d18, xzr
	fmov	d20, xzr
	fmov	d22, d16
	fmov	d24, xzr
	fmov	d26, d16
	fmov	d28, xzr
	fmov	d30, d16
.endm

/* KERNEL2x8_SUB: one K-step of the 2x8 tile: load 2 doubles of A (v0)
   and 8 doubles of B (v8-v11), accumulate one rank-1 update into the
   eight even-numbered accumulators. */
.macro KERNEL2x8_SUB
	ld1	{v8.2d, v9.2d}, [pB]
	add	pB, pB, #32
	ld1	{v0.2d}, [pA]
	add	pA, pA, #16
	ld1	{v10.2d, v11.2d}, [pB]
	add	pB, pB, #32

	fmla	v16.2d, v0.2d, v8.d[0]
	fmla	v18.2d, v0.2d, v8.d[1]

	fmla	v20.2d, v0.2d, v9.d[0]
	fmla	v22.2d, v0.2d, v9.d[1]

	fmla	v24.2d, v0.2d, v10.d[0]
	fmla	v26.2d, v0.2d, v10.d[1]

	fmla	v28.2d, v0.2d, v11.d[0]
	fmla	v30.2d, v0.2d, v11.d[1]
.endm

/* SAVE2x8: merge the 2x8 tile into C: for each of the eight C slices
   (spaced LDC bytes apart) do C[0:2] += alpha * acc, then advance
   pCRow0 by 16 bytes (2 doubles). */
.macro SAVE2x8
	add	pCRow1, pCRow0, LDC

	ld1	{v8.2d}, [pCRow0]
	fmla	v8.2d, v16.2d, alphaV0
	st1 	{v8.2d}, [pCRow0]

	add	pCRow2, pCRow1, LDC

	ld1	{v10.2d}, [pCRow1]
	fmla	v10.2d, v18.2d, alphaV2
	st1 	{v10.2d}, [pCRow1]

	add	pCRow1, pCRow2, LDC

	ld1	{v12.2d}, [pCRow2]
	fmla	v12.2d, v20.2d, alphaV0
	st1 	{v12.2d}, [pCRow2]

	add	pCRow2, pCRow1, LDC

	ld1	{v14.2d}, [pCRow1]
	fmla	v14.2d, v22.2d, alphaV2
	st1 	{v14.2d}, [pCRow1]

	add	pCRow1, pCRow2, LDC

	ld1	{v8.2d}, [pCRow2]
	fmla	v8.2d, v24.2d, alphaV0
	st1 	{v8.2d}, [pCRow2]

	add	pCRow2, pCRow1, LDC

	ld1	{v10.2d}, [pCRow1]
	fmla	v10.2d, v26.2d, alphaV2
	st1 	{v10.2d}, [pCRow1]

	add	pCRow1, pCRow2, LDC

	ld1	{v12.2d}, [pCRow2]
	fmla	v12.2d, v28.2d, alphaV0
	st1 	{v12.2d}, [pCRow2]

	add	pCRow2, pCRow1, LDC		// NOTE(review): this result is never read in this macro

	ld1	{v14.2d}, [pCRow1]
	fmla	v14.2d, v30.2d, alphaV2
	st1 	{v14.2d}, [pCRow1]

	add	pCRow0, pCRow0, #16
.endm
441
442/******************************************************************************/
443
/* INIT1x8: zero the four 1x8-tile accumulators v16, v20, v24, v28;
   each vector holds the results for two adjacent C slices. */
.macro INIT1x8
	fmov	d16, xzr
	fmov	d20, xzr
	fmov	d24, xzr
	fmov	d28, xzr
.endm

/* KERNEL1x8_SUB: one K-step of the 1x8 tile: load a single A element
   (d0) and 8 doubles of B (v8-v11); broadcast the A element across each
   B vector so each accumulator covers a pair of C slices. */
.macro KERNEL1x8_SUB
	ld1	{v8.2d, v9.2d}, [pB]
	add	pB, pB, #32
	ldr	d0, [pA]
	add	pA, pA, #8
	ld1	{v10.2d, v11.2d}, [pB]
	add	pB, pB, #32

	fmla	v16.2d, v8.2d, v0.d[0]
	fmla	v20.2d, v9.2d, v0.d[0]
	fmla	v24.2d, v10.2d, v0.d[0]
	fmla	v28.2d, v11.2d, v0.d[0]
.endm

/* SAVE1x8: merge the 1x8 tile into C. Each accumulator covers two C
   slices, so single lanes are gathered from two addresses (LDC apart)
   into one vector, scaled by alpha, and scattered back. Finally pCRow0
   advances 8 bytes (1 double). */
.macro SAVE1x8
	add	pCRow1, pCRow0, LDC

	ld1	{v8.d}[0], [pCRow0]
	ld1	{v8.d}[1], [pCRow1]
	fmla	v8.2d, v16.2d, alphaV0
	st1	{v8.d}[0], [pCRow0]
	st1	{v8.d}[1], [pCRow1]

	add	pCRow2, pCRow1, LDC
	add	pCRow1, pCRow2, LDC

	ld1	{v10.d}[0], [pCRow2]
	ld1	{v10.d}[1], [pCRow1]
	fmla	v10.2d, v20.2d, alphaV1
	st1	{v10.d}[0], [pCRow2]
	st1	{v10.d}[1], [pCRow1]

	add	pCRow2, pCRow1, LDC
	add	pCRow1, pCRow2, LDC

	ld1	{v12.d}[0], [pCRow2]
	ld1	{v12.d}[1], [pCRow1]
	fmla	v12.2d, v24.2d, alphaV2
	st1	{v12.d}[0], [pCRow2]
	st1	{v12.d}[1], [pCRow1]

	add	pCRow2, pCRow1, LDC
	add	pCRow1, pCRow2, LDC

	ld1	{v14.d}[0], [pCRow2]
	ld1	{v14.d}[1], [pCRow1]
	fmla	v14.2d, v28.2d, alphaV3
	st1	{v14.d}[0], [pCRow2]
	st1	{v14.d}[1], [pCRow1]

	add	pCRow0, pCRow0, #8
.endm
503
504/******************************************************************************/
505
/* INIT4x4: zero the eight 4x4-tile accumulators v16,v17, v20,v21,
   v24,v25, v28,v29 (d16 is cleared directly; the rest copy zeroes). */
.macro INIT4x4
	fmov		d16, xzr
	fmov		d17, d16
	fmov		d20, d17
	fmov		d21, d16
	fmov		d24, d17
	fmov		d25, d16
	fmov		d28, d17
	fmov		d29, d16
.endm

/* KERNEL4x4_I: first step of the software-pipelined 4x4 K-loop.
   Loads 4 doubles of A (v0,v1) and 4 of B (v8,v9), initializes the
   accumulators with fmul, then preloads the next step's operands into
   v4,v5 / v12,v13. The fmul pairs are deliberately interleaved
   (v16/v29, v20/v25, ...) rather than sequential. */
.macro KERNEL4x4_I
	ld1	{v8.2d, v9.2d}, [pB]
	add	pB, pB, #32
	ld1	{v0.2d, v1.2d}, [pA]
	add	pA, pA, #32

	fmul	v16.2d, v0.2d, v8.d[0]
	fmul	v29.2d, v1.2d, v9.d[1]

	fmul	v20.2d, v0.2d, v8.d[1]
	fmul	v25.2d, v1.2d, v9.d[0]

	fmul	v24.2d, v0.2d, v9.d[0]
	fmul	v21.2d, v1.2d, v8.d[1]

	fmul	v28.2d, v0.2d, v9.d[1]
	fmul	v17.2d, v1.2d, v8.d[0]

	ld1	{v12.2d, v13.2d}, [pB]
	add	pB, pB, #32
	ld1	{v4.2d, v5.2d}, [pA]
	add	pA, pA, #32
.endm

/* KERNEL4x4_M1: pipelined step consuming v0,v1 / v8,v9 while loading
   the NEXT step's operands into v4,v5 / v12,v13; loads and a prefetch
   of A are interleaved between the fmla pairs. Alternates with M2. */
.macro KERNEL4x4_M1
	fmla	v16.2d, v0.2d, v8.d[0]
	fmla	v29.2d, v1.2d, v9.d[1]

	ld1	{v12.2d, v13.2d}, [pB]		// For next round
	add	pB, pB, #32

	fmla	v20.2d, v0.2d, v8.d[1]
	fmla	v25.2d, v1.2d, v9.d[0]

	ld1	{v4.2d, v5.2d}, [pA]		// For next round
	add	pA, pA, #32

	fmla	v24.2d, v0.2d, v9.d[0]
	fmla	v21.2d, v1.2d, v8.d[1]

	prfm	PLDL1KEEP, [pA, #512]

	fmla	v28.2d, v0.2d, v9.d[1]
	fmla	v17.2d, v1.2d, v8.d[0]
.endm

/* KERNEL4x4_M2: mirror of M1 — consumes v4,v5 / v12,v13 while loading
   the next step's operands into v0,v1 / v8,v9; prefetches the B panel. */
.macro KERNEL4x4_M2
	fmla	v16.2d, v4.2d, v12.d[0]
	fmla	v29.2d, v5.2d, v13.d[1]

	ld1	{v8.2d, v9.2d}, [pB]		// For next round
	add	pB, pB, #32

	fmla	v20.2d, v4.2d, v12.d[1]
	fmla	v25.2d, v5.2d, v13.d[0]

	ld1	{v0.2d, v1.2d}, [pA]		// For next round
	add	pA, pA, #32

	fmla	v24.2d, v4.2d, v13.d[0]
	fmla	v21.2d, v5.2d, v12.d[1]

	prfm	PLDL1KEEP, [pB, #512]

	fmla	v28.2d, v4.2d, v13.d[1]
	fmla	v17.2d, v5.2d, v12.d[0]
.endm

/* KERNEL4x4_E: pipeline drain — final accumulation from the operands
   preloaded into v4,v5 / v12,v13, with no further loads. */
.macro KERNEL4x4_E
	fmla	v16.2d, v4.2d, v12.d[0]
	fmla	v29.2d, v5.2d, v13.d[1]

	fmla	v20.2d, v4.2d, v12.d[1]
	fmla	v25.2d, v5.2d, v13.d[0]

	fmla	v24.2d, v4.2d, v13.d[0]
	fmla	v21.2d, v5.2d, v12.d[1]

	fmla	v28.2d, v4.2d, v13.d[1]
	fmla	v17.2d, v5.2d, v12.d[0]
.endm

/* KERNEL4x4_SUB: standalone K-step (used for the K%2 remainder):
   load 4 doubles of A and 4 of B, then accumulate. */
.macro KERNEL4x4_SUB
	ld1	{v8.2d, v9.2d}, [pB]
	add	pB, pB, #32
	ld1	{v0.2d, v1.2d}, [pA]
	add	pA, pA, #32

	fmla	v16.2d, v0.2d, v8.d[0]
	fmla	v29.2d, v1.2d, v9.d[1]

	fmla	v20.2d, v0.2d, v8.d[1]
	fmla	v25.2d, v1.2d, v9.d[0]

	fmla	v24.2d, v0.2d, v9.d[0]
	fmla	v21.2d, v1.2d, v8.d[1]

	fmla	v28.2d, v0.2d, v9.d[1]
	fmla	v17.2d, v1.2d, v8.d[0]
.endm

/* SAVE4x4: merge the 4x4 tile into C: for each of the four C slices
   (spaced LDC bytes apart) do C[0:4] += alpha * acc, then advance
   pCRow0 by 32 bytes (4 doubles). */
.macro SAVE4x4
	ld1	{v8.2d, v9.2d}, [pCRow0]
	fmla	v8.2d, v16.2d, alphaV0
	fmla	v9.2d, v17.2d, alphaV1
	st1 	{v8.2d, v9.2d}, [pCRow0]

	add	pCRow1, pCRow0, LDC

	ld1	{v12.2d, v13.2d}, [pCRow1]
	fmla	v12.2d, v20.2d, alphaV2
	fmla	v13.2d, v21.2d, alphaV3
	st1 	{v12.2d, v13.2d}, [pCRow1]

	add	pCRow2, pCRow1, LDC

	ld1	{v8.2d, v9.2d}, [pCRow2]
	fmla	v8.2d, v24.2d, alphaV0
	fmla	v9.2d, v25.2d, alphaV1
	st1 	{v8.2d, v9.2d}, [pCRow2]

	add	pCRow1, pCRow2, LDC

	ld1	{v12.2d, v13.2d}, [pCRow1]
	fmla	v12.2d, v28.2d, alphaV2
	fmla	v13.2d, v29.2d, alphaV3
	st1 	{v12.2d, v13.2d}, [pCRow1]

	add	pCRow0, pCRow0, #32
.endm
647
648/******************************************************************************/
649
/* INIT2x4: zero the four 2x4-tile accumulators v16, v20, v24, v28. */
.macro INIT2x4
	fmov		d16, xzr
	fmov		d20, d16
	fmov		d24, d20
	fmov		d28, d16
.endm

/* KERNEL2x4_SUB: one K-step of the 2x4 tile: load 2 doubles of A (v0)
   and 4 of B (v8,v9), accumulate one rank-1 update. */
.macro KERNEL2x4_SUB
	ld1	{v8.2d, v9.2d}, [pB]
	add	pB, pB, #32
	ld1	{v0.2d}, [pA]
	add	pA, pA, #16

	fmla	v16.2d, v0.2d, v8.d[0]
	fmla	v20.2d, v0.2d, v8.d[1]
	fmla	v24.2d, v0.2d, v9.d[0]
	fmla	v28.2d, v0.2d, v9.d[1]
.endm

/* SAVE2x4: merge the 2x4 tile into C: for each of the four C slices
   (spaced LDC bytes apart) do C[0:2] += alpha * acc, then advance
   pCRow0 by 16 bytes (2 doubles). */
.macro SAVE2x4
	ld1	{v8.2d}, [pCRow0]
	fmla	v8.2d, v16.2d, alphaV0
	st1	{v8.2d}, [pCRow0]

	add	pCRow1, pCRow0, LDC

	ld1	{v12.2d}, [pCRow1]
	fmla	v12.2d, v20.2d, alphaV1
	st1	{v12.2d}, [pCRow1]

	add	pCRow2, pCRow1, LDC

	ld1	{v8.2d}, [pCRow2]
	fmla	v8.2d, v24.2d, alphaV2
	st1	{v8.2d}, [pCRow2]

	add	pCRow1, pCRow2, LDC

	ld1	{v12.2d}, [pCRow1]
	fmla	v12.2d, v28.2d, alphaV3
	st1	{v12.2d}, [pCRow1]

	add	pCRow0, pCRow0, #16
.endm
694
695/******************************************************************************/
696
/* INIT1x4: zero the two 1x4-tile accumulators v16 and v20; each vector
   holds the results for two adjacent C slices. */
.macro INIT1x4
	fmov		d16, xzr
	fmov		d20, d16
.endm

/* KERNEL1x4_SUB: one K-step of the 1x4 tile: load a single A element
   (d0) and 4 doubles of B (v8,v9); broadcast A across each B vector. */
.macro KERNEL1x4_SUB
	ldr	d0, [pA]
	add	pA, pA, #8

	ld1	{v8.2d, v9.2d}, [pB]
	add	pB, pB, #32

	fmla	v16.2d, v8.2d, v0.d[0]
	fmla	v20.2d, v9.2d, v0.d[0]
.endm

/* SAVE1x4: merge the 1x4 tile into C. Each accumulator covers two C
   slices; single lanes are gathered from two addresses (LDC apart),
   scaled by alpha, and scattered back. pCRow0 then advances 8 bytes. */
.macro SAVE1x4
	add	pCRow1, pCRow0, LDC

	ld1	{v8.d}[0], [pCRow0]
	ld1	{v8.d}[1], [pCRow1]
	fmla	v8.2d, v16.2d, alphaV0
	st1	{v8.d}[0], [pCRow0]
	st1	{v8.d}[1], [pCRow1]

	add	pCRow2, pCRow1, LDC
	add	pCRow1, pCRow2, LDC

	ld1	{v12.d}[0], [pCRow2]
	ld1	{v12.d}[1], [pCRow1]
	fmla	v12.2d, v20.2d, alphaV1
	st1	{v12.d}[0], [pCRow2]
	st1	{v12.d}[1], [pCRow1]

	add	pCRow0, pCRow0, #8
.endm
733
734/******************************************************************************/
735
/* INIT4x2: zero the four 4x2-tile accumulators v16, v17, v20, v21. */
.macro INIT4x2
	fmov	d16, xzr
	fmov	d17, d16
	fmov	d20, d17
	fmov	d21, d16
.endm

/* KERNEL4x2_SUB: one K-step of the 4x2 tile: load 4 doubles of A
   (v0,v1) and 2 of B (v8), accumulate one rank-1 update. */
.macro KERNEL4x2_SUB
	ld1	{v8.2d}, [pB]
	add	pB, pB, #16
	ld1	{v0.2d, v1.2d}, [pA]
	add	pA, pA, #32

	fmla	v16.2d, v0.2d, v8.d[0]
	fmla	v17.2d, v1.2d, v8.d[0]
	fmla	v20.2d, v0.2d, v8.d[1]
	fmla	v21.2d, v1.2d, v8.d[1]
.endm

/* SAVE4x2: merge the 4x2 tile into C: for each of the two C slices
   (LDC bytes apart) do C[0:4] += alpha * acc, then advance pCRow0 by
   32 bytes (4 doubles). */
.macro SAVE4x2
	ld1	{v8.2d, v9.2d}, [pCRow0]
	fmla	v8.2d, v16.2d, alphaV0
	fmla	v9.2d, v17.2d, alphaV1
	st1	{v8.2d, v9.2d}, [pCRow0]

	add	pCRow1, pCRow0, LDC

	ld1	{v12.2d, v13.2d}, [pCRow1]
	fmla	v12.2d, v20.2d, alphaV2
	fmla	v13.2d, v21.2d, alphaV3
	st1	{v12.2d, v13.2d}, [pCRow1]

	add	pCRow0, pCRow0, #32
.endm
770
771/******************************************************************************/
772
/* INIT2x2: zero the two 2x2-tile accumulators v16 and v20. */
.macro INIT2x2
	fmov		d16, xzr
	fmov		d20, d16
.endm

/* KERNEL2x2_SUB: one K-step of the 2x2 tile: load 2 doubles of A (v0)
   and 2 of B (v8), accumulate one rank-1 update. */
.macro KERNEL2x2_SUB
	ld1	{v8.2d}, [pB]
	add	pB, pB, #16

	ld1	{v0.2d}, [pA]
	add	pA, pA, #16

	fmla	v16.2d, v0.2d, v8.d[0]
	fmla	v20.2d, v0.2d, v8.d[1]
.endm

/* SAVE2x2: merge the 2x2 tile into C: for each of the two C slices
   (LDC bytes apart) do C[0:2] += alpha * acc, then advance pCRow0 by
   16 bytes (2 doubles). */
.macro SAVE2x2
	ld1	{v8.2d}, [pCRow0]
	fmla	v8.2d, v16.2d, alphaV0
	st1	{v8.2d}, [pCRow0]

	add	pCRow1 , pCRow0, LDC

	ld1	{v12.2d}, [pCRow1]
	fmla	v12.2d, v20.2d, alphaV1
	st1	{v12.2d}, [pCRow1]

	add	pCRow0, pCRow0, #16
.endm
802
803/******************************************************************************/
804
/* INIT1x2: zero the single 1x2-tile accumulator v16 (one result per
   C slice, both held in one vector). */
.macro INIT1x2
	fmov		d16, xzr
.endm

/* KERNEL1x2_SUB: one K-step of the 1x2 tile: load a single A element
   (d0) and 2 doubles of B (v8); broadcast A across B. */
.macro KERNEL1x2_SUB
	ld1	{v8.2d} , [pB]
	add	pB , pB, #16

	ldr	d0 , [pA]
	add	pA, pA, #8

	fmla	v16.2d, v8.2d, v0.d[0]
.endm

/* SAVE1x2: merge the 1x2 tile into C: gather one lane from each of the
   two C slices (LDC apart), C += alpha * acc, scatter back; pCRow0
   advances 8 bytes (1 double). */
.macro SAVE1x2
	add	pCRow1 , pCRow0, LDC

	ld1	{v8.d}[0], [pCRow0]
	ld1	{v8.d}[1], [pCRow1]
	fmla	v8.2d, v16.2d, alphaV0
	st1	{v8.d}[0], [pCRow0]
	st1	{v8.d}[1], [pCRow1]

	add	pCRow0, pCRow0, #8
.endm
830
831/******************************************************************************/
832
/* INIT4x1: zero the two 4x1-tile accumulators v16 and v17. */
.macro INIT4x1
	fmov	d16, xzr
	fmov	d17, d16
.endm

/* KERNEL4x1_SUB: one K-step of the 4x1 tile: load 4 doubles of A
   (v0,v1) and a single B element (d8); broadcast B across A. */
.macro KERNEL4x1_SUB
	ldr	d8, [pB]
	add	pB , pB, #8

	ld1	{v0.2d, v1.2d}, [pA]
	add	pA , pA, #32

	fmla	v16.2d, v0.2d, v8.d[0]
	fmla	v17.2d, v1.2d, v8.d[0]
.endm

/* SAVE4x1: merge the 4x1 tile into the single C slice:
   C[0:4] += alpha * acc, then advance pCRow0 by 32 bytes (4 doubles). */
.macro SAVE4x1
	ld1	{v8.2d, v9.2d}, [pCRow0]
	fmla	v8.2d, v16.2d, alphaV0
	fmla	v9.2d, v17.2d, alphaV1
	st1	{v8.2d, v9.2d}, [pCRow0]

	add	pCRow0, pCRow0, #32
.endm
857
858
859
860
861/******************************************************************************/
862
/* INIT2x1: zero the single 2x1-tile accumulator v16. */
.macro INIT2x1
	fmov		d16, xzr
.endm

/* KERNEL2x1_SUB: one K-step of the 2x1 tile: load 2 doubles of A (v0)
   and a single B element (d8); broadcast B across A. */
.macro KERNEL2x1_SUB
	ldr	d8, [pB]
	add	pB , pB, #8

	ld1	{v0.2d}, [pA]
	add	pA , pA, #16

	fmla	v16.2d, v0.2d, v8.d[0]
.endm

/* SAVE2x1: merge the 2x1 tile into the single C slice:
   C[0:2] += alpha * acc, then advance pCRow0 by 16 bytes (2 doubles). */
.macro SAVE2x1
	ld1	{v8.2d}, [pCRow0]
	fmla	v8.2d, v16.2d, alphaV0
	st1	{v8.2d}, [pCRow0]

	add	pCRow0, pCRow0, #16
.endm
884
885/******************************************************************************/
886
/* INIT1x1: zero the single scalar accumulator d16. */
.macro INIT1x1
	fmov	d16, xzr
.endm

/* KERNEL1x1_SUB: one K-step of the scalar 1x1 case:
   d16 += A[k] * B[k] via a scalar fused multiply-add. */
.macro KERNEL1x1_SUB
	ldr	d8, [pB]
	add	pB , pB, #8

	ldr	d0, [pA]
	add	pA , pA, #8

	fmadd 	d16, d0, d8, d16
.endm

/* SAVE1x1: merge the scalar result into C: C[0] += alpha * acc
   (scalar fmadd with alpha0 = d2), then advance pCRow0 by 8 bytes. */
.macro SAVE1x1
	ldr	d8, [pCRow0]
	fmadd	d8, d16, alpha0, d8
	str 	d8, [pCRow0]

	add	pCRow0, pCRow0, #8
.endm
908
909/*******************************************************************************
910* End of macro definitions
911*******************************************************************************/
912
913	PROLOGUE
914
915	.align 5
916	add	sp, sp, #-(11 * 16)
917	stp	d8, d9, [sp, #(0 * 16)]
918	stp	d10, d11, [sp, #(1 * 16)]
919	stp	d12, d13, [sp, #(2 * 16)]
920	stp	d14, d15, [sp, #(3 * 16)]
921	stp	d16, d17, [sp, #(4 * 16)]
922	stp	x18, x19, [sp, #(5 * 16)]
923	stp	x20, x21, [sp, #(6 * 16)]
924	stp	x22, x23, [sp, #(7 * 16)]
925	stp	x24, x25, [sp, #(8 * 16)]
926	stp	x26, x27, [sp, #(9 * 16)]
927	str	x28, [sp, #(10 * 16)]
928
929	fmov	alpha0, d0
930	fmov	alpha1, d0
931	fmov	alpha2, d0
932	fmov	alpha3, d0
933
934	lsl	LDC, LDC, #3			// ldc = ldc * 8
935
936	mov	pB, origPB
937
938	mov	counterJ, origN
939	asr 	counterJ, counterJ, #3		// J = J / 8
940	cmp 	counterJ, #0
941	ble	.Ldgemm_kernel_L4_BEGIN
942
943/******************************************************************************/
944
945.Ldgemm_kernel_L8_BEGIN:
946
947	mov	pCRow0, pC			// pCRow0 = C
948	add	pC, pC, LDC, lsl #3
949
950	mov	pA, origPA			// pA = start of A array
951
952.Ldgemm_kernel_L8_M4_BEGIN:
953
954	mov	counterI, origM
955	asr 	counterI, counterI, #2		// counterI = counterI / 4
956	cmp 	counterI, #0
957	ble	.Ldgemm_kernel_L8_M2_BEGIN
958
959.Ldgemm_kernel_L8_M4_20:
960
961	mov	pB, origPB
962
963	asr 	counterL , origK, #1		// L = K / 2
964	cmp	counterL , #2			// is there at least 4 to do?
965	blt	.Ldgemm_kernel_L8_M4_32
966
967	KERNEL4x8_I				// do one in the K
968	KERNEL4x8_M2				// do another in the K
969
970	subs	counterL, counterL, #2
971	ble	.Ldgemm_kernel_L8_M4_22a
972	.align 5
973
974.Ldgemm_kernel_L8_M4_22:
975
976	KERNEL4x8_M1
977	KERNEL4x8_M2
978
979	subs	counterL, counterL, #1
980	bgt	.Ldgemm_kernel_L8_M4_22
981
982
983.Ldgemm_kernel_L8_M4_22a:
984
985	KERNEL4x8_M1
986	KERNEL4x8_E
987
988	b	 .Ldgemm_kernel_L8_M4_44
989
990.Ldgemm_kernel_L8_M4_32:
991
992	tst	counterL, #1
993	ble	.Ldgemm_kernel_L8_M4_40
994
995	KERNEL4x8_I
996
997	KERNEL4x8_E
998
999	b	.Ldgemm_kernel_L8_M4_44
1000
1001
1002.Ldgemm_kernel_L8_M4_40:
1003
1004	INIT4x8
1005
1006.Ldgemm_kernel_L8_M4_44:
1007
1008	ands	counterL , origK, #1
1009	ble	.Ldgemm_kernel_L8_M4_100
1010
1011.Ldgemm_kernel_L8_M4_46:
1012
1013	KERNEL4x8_SUB
1014
1015.Ldgemm_kernel_L8_M4_100:
1016
1017	SAVE4x8
1018
1019.Ldgemm_kernel_L8_M4_END:
1020	subs	counterI, counterI, #1
1021	bne	.Ldgemm_kernel_L8_M4_20
1022
1023.Ldgemm_kernel_L8_M2_BEGIN:
1024
1025	mov	counterI, origM
1026	tst	counterI , #3
1027	ble	.Ldgemm_kernel_L8_END
1028
1029	tst	counterI, #2			// counterI = counterI / 2
1030	ble	.Ldgemm_kernel_L8_M1_BEGIN
1031
1032.Ldgemm_kernel_L8_M2_20:
1033
1034	INIT2x8
1035
1036	mov	pB, origPB
1037
1038	asr 	counterL , origK, #3		// counterL = counterL / 8
1039	cmp	counterL , #0
1040	ble	.Ldgemm_kernel_L8_M2_40
1041
1042.Ldgemm_kernel_L8_M2_22:
1043
1044	KERNEL2x8_SUB
1045	KERNEL2x8_SUB
1046	KERNEL2x8_SUB
1047	KERNEL2x8_SUB
1048
1049	KERNEL2x8_SUB
1050	KERNEL2x8_SUB
1051	KERNEL2x8_SUB
1052	KERNEL2x8_SUB
1053
1054	subs	counterL, counterL, #1
1055	bgt	.Ldgemm_kernel_L8_M2_22
1056
1057
1058.Ldgemm_kernel_L8_M2_40:
1059
1060	ands	counterL , origK, #7		// counterL = counterL % 8
1061	ble	.Ldgemm_kernel_L8_M2_100
1062
1063.Ldgemm_kernel_L8_M2_42:
1064
1065	KERNEL2x8_SUB
1066
1067	subs	counterL, counterL, #1
1068	bgt	.Ldgemm_kernel_L8_M2_42
1069
1070.Ldgemm_kernel_L8_M2_100:
1071
1072	SAVE2x8
1073
1074.Ldgemm_kernel_L8_M2_END:
1075
1076
1077.Ldgemm_kernel_L8_M1_BEGIN:
1078
1079	tst	counterI, #1			// counterI = counterI % 2
1080	ble	.Ldgemm_kernel_L8_END
1081
1082.Ldgemm_kernel_L8_M1_20:
1083
1084	INIT1x8
1085
1086	mov	pB, origPB
1087
1088	asr 	counterL , origK, #3		// counterL = counterL / 8
1089	cmp	counterL , #0
1090	ble	.Ldgemm_kernel_L8_M1_40
1091
1092.Ldgemm_kernel_L8_M1_22:
1093	KERNEL1x8_SUB
1094	KERNEL1x8_SUB
1095	KERNEL1x8_SUB
1096	KERNEL1x8_SUB
1097
1098	KERNEL1x8_SUB
1099	KERNEL1x8_SUB
1100	KERNEL1x8_SUB
1101	KERNEL1x8_SUB
1102
1103	subs	counterL, counterL, #1
1104	bgt	.Ldgemm_kernel_L8_M1_22
1105
1106
1107.Ldgemm_kernel_L8_M1_40:
1108
1109	ands	counterL , origK, #7		// counterL = counterL % 8
1110	ble	.Ldgemm_kernel_L8_M1_100
1111
1112.Ldgemm_kernel_L8_M1_42:
1113
1114	KERNEL1x8_SUB
1115
1116	subs	counterL, counterL, #1
1117	bgt	.Ldgemm_kernel_L8_M1_42
1118
1119.Ldgemm_kernel_L8_M1_100:
1120
1121	SAVE1x8
1122
1123.Ldgemm_kernel_L8_END:
1124
1125	lsl	temp, origK, #6
1126	add	origPB, origPB, temp		// B = B + K * 8 * 8
1127
1128	subs	counterJ, counterJ , #1		// j--
1129	bgt	.Ldgemm_kernel_L8_BEGIN
1130
1131
1132/******************************************************************************/
1133
1134.Ldgemm_kernel_L4_BEGIN:
1135
1136	mov	counterJ , origN
1137	tst	counterJ , #7
1138	ble	.Ldgemm_kernel_L999
1139
1140	tst	counterJ , #4
1141	ble	.Ldgemm_kernel_L2_BEGIN
1142
1143	mov	pCRow0, pC			// pCRow0 = C
1144	add	pC, pC, LDC, lsl #2
1145
1146	mov	pA, origPA			// pA = start of A array
1147
1148.Ldgemm_kernel_L4_M4_BEGIN:
1149
1150	mov	counterI, origM
1151	asr 	counterI, counterI, #2		// counterI = counterI / 4
1152	cmp 	counterI, #0
1153	ble	.Ldgemm_kernel_L4_M2_BEGIN
1154
1155.Ldgemm_kernel_L4_M4_20:
1156
1157	mov	pB, origPB
1158
1159	asr 	counterL , origK, #1		// L = K / 2
1160	cmp	counterL , #2			// is there at least 4 to do?
1161	blt	.Ldgemm_kernel_L4_M4_32
1162
1163	KERNEL4x4_I				// do one in the K
1164	KERNEL4x4_M2				// do another in the K
1165
1166	subs	counterL, counterL, #2
1167	ble	.Ldgemm_kernel_L4_M4_22a
1168	.align 5
1169
1170.Ldgemm_kernel_L4_M4_22:
1171
1172	KERNEL4x4_M1
1173	KERNEL4x4_M2
1174
1175	subs	counterL, counterL, #1
1176	bgt	.Ldgemm_kernel_L4_M4_22
1177
1178
1179.Ldgemm_kernel_L4_M4_22a:
1180
1181	KERNEL4x4_M1
1182	KERNEL4x4_E
1183
1184	b	 .Ldgemm_kernel_L4_M4_44
1185
1186.Ldgemm_kernel_L4_M4_32:
1187
1188	tst	counterL, #1
1189	ble	.Ldgemm_kernel_L4_M4_40
1190
1191	KERNEL4x4_I
1192
1193	KERNEL4x4_E
1194
1195	b	.Ldgemm_kernel_L4_M4_44
1196
1197
1198.Ldgemm_kernel_L4_M4_40:
1199
1200	INIT4x4
1201
1202.Ldgemm_kernel_L4_M4_44:
1203
1204	ands	counterL , origK, #1
1205	ble	.Ldgemm_kernel_L4_M4_100
1206
1207.Ldgemm_kernel_L4_M4_46:
1208
1209	KERNEL4x4_SUB
1210
1211.Ldgemm_kernel_L4_M4_100:
1212
1213	SAVE4x4
1214
1215.Ldgemm_kernel_L4_M4_END:
1216	subs	counterI, counterI, #1
1217	bne	.Ldgemm_kernel_L4_M4_20
1218
1219.Ldgemm_kernel_L4_M2_BEGIN:
1220
1221	mov	counterI, origM
1222	tst	counterI , #3
1223	ble	.Ldgemm_kernel_L4_END
1224
1225	tst	counterI, #2			// counterI = counterI / 2
1226	ble	.Ldgemm_kernel_L4_M1_BEGIN
1227
1228.Ldgemm_kernel_L4_M2_20:
1229
1230	INIT2x4
1231
1232	mov	pB, origPB
1233
1234	asr 	counterL , origK, #3		// counterL = counterL / 8
1235	cmp	counterL , #0
1236	ble	.Ldgemm_kernel_L4_M2_40
1237
1238.Ldgemm_kernel_L4_M2_22:
1239
1240	KERNEL2x4_SUB
1241	KERNEL2x4_SUB
1242	KERNEL2x4_SUB
1243	KERNEL2x4_SUB
1244
1245	KERNEL2x4_SUB
1246	KERNEL2x4_SUB
1247	KERNEL2x4_SUB
1248	KERNEL2x4_SUB
1249
1250	subs	counterL, counterL, #1
1251	bgt	.Ldgemm_kernel_L4_M2_22
1252
1253
1254.Ldgemm_kernel_L4_M2_40:
1255
1256	ands	counterL , origK, #7		// counterL = counterL % 8
1257	ble	.Ldgemm_kernel_L4_M2_100
1258
1259.Ldgemm_kernel_L4_M2_42:
1260
1261	KERNEL2x4_SUB
1262
1263	subs	counterL, counterL, #1
1264	bgt	.Ldgemm_kernel_L4_M2_42
1265
1266.Ldgemm_kernel_L4_M2_100:
1267
1268	SAVE2x4
1269
1270.Ldgemm_kernel_L4_M2_END:
1271
1272
1273.Ldgemm_kernel_L4_M1_BEGIN:
1274
1275	tst	counterI, #1			// counterI = counterI % 2
1276	ble	.Ldgemm_kernel_L4_END
1277
1278.Ldgemm_kernel_L4_M1_20:
1279
1280	INIT1x4
1281
1282	mov	pB, origPB
1283
1284	asr 	counterL , origK, #3		// counterL = counterL / 8
1285	cmp	counterL , #0
1286	ble	.Ldgemm_kernel_L4_M1_40
1287
1288.Ldgemm_kernel_L4_M1_22:
1289	KERNEL1x4_SUB
1290	KERNEL1x4_SUB
1291	KERNEL1x4_SUB
1292	KERNEL1x4_SUB
1293
1294	KERNEL1x4_SUB
1295	KERNEL1x4_SUB
1296	KERNEL1x4_SUB
1297	KERNEL1x4_SUB
1298
1299	subs	counterL, counterL, #1
1300	bgt	.Ldgemm_kernel_L4_M1_22
1301
1302
1303.Ldgemm_kernel_L4_M1_40:
1304
1305	ands	counterL , origK, #7		// counterL = counterL % 8
1306	ble	.Ldgemm_kernel_L4_M1_100
1307
1308.Ldgemm_kernel_L4_M1_42:
1309
1310	KERNEL1x4_SUB
1311
1312	subs	counterL, counterL, #1
1313	bgt	.Ldgemm_kernel_L4_M1_42
1314
1315.Ldgemm_kernel_L4_M1_100:
1316
1317	SAVE1x4
1318
1319.Ldgemm_kernel_L4_END:
1320
1321	lsl	temp, origK, #5
1322	add	origPB, origPB, temp		// B = B + K * 4 * 8
1323
1324/******************************************************************************/
1325
1326.Ldgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
1327
1328	mov	counterJ , origN
1329	tst	counterJ , #3
1330	ble	.Ldgemm_kernel_L999   // error, N was less than 4?
1331
1332	tst	counterJ , #2
1333	ble	.Ldgemm_kernel_L1_BEGIN
1334
1335	mov	pCRow0, pC			// pCRow0 = pC
1336
1337	add	pC,pC,LDC, lsl #1
1338
1339	mov	pA, origPA			// pA = A
1340
1341
1342.Ldgemm_kernel_L2_M4_BEGIN:
1343
1344	mov	counterI, origM
1345	asr 	counterI, counterI, #2		// counterI = counterI / 4
1346	cmp	counterI,#0
1347	ble	.Ldgemm_kernel_L2_M2_BEGIN
1348
1349.Ldgemm_kernel_L2_M4_20:
1350
1351	INIT4x2
1352
1353	mov	pB, origPB
1354
1355	asr	counterL , origK, #3		// counterL = counterL / 8
1356	cmp	counterL,#0
1357	ble	.Ldgemm_kernel_L2_M4_40
1358	.align 5
1359
1360.Ldgemm_kernel_L2_M4_22:
1361	KERNEL4x2_SUB
1362	KERNEL4x2_SUB
1363	KERNEL4x2_SUB
1364	KERNEL4x2_SUB
1365
1366	KERNEL4x2_SUB
1367	KERNEL4x2_SUB
1368	KERNEL4x2_SUB
1369	KERNEL4x2_SUB
1370
1371	subs	counterL, counterL, #1
1372	bgt	.Ldgemm_kernel_L2_M4_22
1373
1374
1375.Ldgemm_kernel_L2_M4_40:
1376
1377	ands	counterL , origK, #7		// counterL = counterL % 8
1378	ble	.Ldgemm_kernel_L2_M4_100
1379
1380.Ldgemm_kernel_L2_M4_42:
1381
1382	KERNEL4x2_SUB
1383
1384	subs	counterL, counterL, #1
1385	bgt	.Ldgemm_kernel_L2_M4_42
1386
1387.Ldgemm_kernel_L2_M4_100:
1388
1389	SAVE4x2
1390
1391.Ldgemm_kernel_L2_M4_END:
1392
1393	subs	counterI, counterI, #1
1394	bgt	.Ldgemm_kernel_L2_M4_20
1395
1396
1397.Ldgemm_kernel_L2_M2_BEGIN:
1398
1399	mov	counterI, origM
1400	tst	counterI , #3
1401	ble	.Ldgemm_kernel_L2_END
1402
1403	tst	counterI, #2			// counterI = counterI / 2
1404	ble	.Ldgemm_kernel_L2_M1_BEGIN
1405
1406.Ldgemm_kernel_L2_M2_20:
1407
1408	INIT2x2
1409
1410	mov	pB, origPB
1411
1412	asr	counterL , origK, #3		// counterL = counterL / 8
1413        cmp	counterL,#0
1414	ble	.Ldgemm_kernel_L2_M2_40
1415
1416.Ldgemm_kernel_L2_M2_22:
1417
1418	KERNEL2x2_SUB
1419	KERNEL2x2_SUB
1420	KERNEL2x2_SUB
1421	KERNEL2x2_SUB
1422
1423	KERNEL2x2_SUB
1424	KERNEL2x2_SUB
1425	KERNEL2x2_SUB
1426	KERNEL2x2_SUB
1427
1428	subs	counterL, counterL, #1
1429	bgt	.Ldgemm_kernel_L2_M2_22
1430
1431
1432.Ldgemm_kernel_L2_M2_40:
1433
1434	ands	counterL , origK, #7		// counterL = counterL % 8
1435	ble	.Ldgemm_kernel_L2_M2_100
1436
1437.Ldgemm_kernel_L2_M2_42:
1438
1439	KERNEL2x2_SUB
1440
1441	subs	counterL, counterL, #1
1442	bgt	.Ldgemm_kernel_L2_M2_42
1443
1444.Ldgemm_kernel_L2_M2_100:
1445
1446	SAVE2x2
1447
1448.Ldgemm_kernel_L2_M2_END:
1449
1450
.Ldgemm_kernel_L2_M1_BEGIN:

	tst	counterI, #1			// bit 0 of M set: one final row remains
	ble	.Ldgemm_kernel_L2_END

.Ldgemm_kernel_L2_M1_20:

	INIT1x2					// zero the 1x2 accumulator registers

	mov	pB, origPB			// rewind pB to the start of the packed B panel

	asr 	counterL , origK, #3		// counterL = origK / 8 (K loop unrolled by 8)
        cmp     counterL, #0
	ble	.Ldgemm_kernel_L2_M1_40

// Main K loop for the 1x2 edge tile, unrolled 8x.
.Ldgemm_kernel_L2_M1_22:
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB

	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB

	subs	counterL, counterL, #1
	bgt	.Ldgemm_kernel_L2_M1_22


.Ldgemm_kernel_L2_M1_40:

	ands	counterL , origK, #7		// counterL = origK % 8
	ble	.Ldgemm_kernel_L2_M1_100

// Remainder K loop: one step per iteration.
.Ldgemm_kernel_L2_M1_42:

	KERNEL1x2_SUB

	subs	counterL, counterL, #1
	bgt	.Ldgemm_kernel_L2_M1_42

.Ldgemm_kernel_L2_M1_100:

	SAVE1x2					// write the finished 1x2 tile back to C

.Ldgemm_kernel_L2_END:
	add	origPB, origPB, origK, lsl #4	// B = B + K * 2 * 8: skip past the consumed 2-column panel (2 doubles per K step)

/******************************************************************************/

// Handle the last single column of C when N is odd (1-wide tiles).
.Ldgemm_kernel_L1_BEGIN:

	mov	counterJ , origN
	tst	counterJ , #1			// N even: no trailing single column
	ble	.Ldgemm_kernel_L999 // done


	mov	pCRow0, pC			// pCRow0 = C (current output column)
	add	pC , pC , LDC			// advance pC past this final column

	mov	pA, origPA			// pA = A: restart the packed A panel

.Ldgemm_kernel_L1_M4_BEGIN:

	mov	counterI, origM
	asr 	counterI, counterI, #2		// counterI = origM / 4 (number of 4-row tiles)
	cmp	counterI, #0
	ble	.Ldgemm_kernel_L1_M2_BEGIN

.Ldgemm_kernel_L1_M4_20:

	INIT4x1					// zero the 4x1 accumulator registers

	mov	pB, origPB			// rewind pB to the start of the packed B panel
	asr	counterL , origK, #3		// counterL = origK / 8 (K loop unrolled by 8)
	cmp	counterL , #0
	ble	.Ldgemm_kernel_L1_M4_40
	.align 5				// align hot-loop entry to a 32-byte boundary

// Main K loop for a 4x1 tile, unrolled 8x.
.Ldgemm_kernel_L1_M4_22:
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB

	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB

	subs	counterL, counterL, #1
	bgt	.Ldgemm_kernel_L1_M4_22


.Ldgemm_kernel_L1_M4_40:

	ands	counterL , origK, #7		// counterL = origK % 8
	ble	.Ldgemm_kernel_L1_M4_100

// Remainder K loop: one step per iteration.
.Ldgemm_kernel_L1_M4_42:

	KERNEL4x1_SUB

	subs	counterL, counterL, #1
	bgt	.Ldgemm_kernel_L1_M4_42

.Ldgemm_kernel_L1_M4_100:

	SAVE4x1					// write the finished 4x1 tile back to C

.Ldgemm_kernel_L1_M4_END:

	subs	counterI, counterI, #1		// next block of 4 rows of M
	bgt	.Ldgemm_kernel_L1_M4_20


.Ldgemm_kernel_L1_M2_BEGIN:

	mov	counterI, origM
	tst	counterI , #3			// M % 4 == 0: no edge rows for this column
	ble	.Ldgemm_kernel_L1_END

	tst	counterI, #2			// bit 1 of M set: a 2-row edge tile remains
	ble	.Ldgemm_kernel_L1_M1_BEGIN

.Ldgemm_kernel_L1_M2_20:

	INIT2x1					// zero the 2x1 accumulator registers

	mov	pB, origPB			// rewind pB to the start of the packed B panel

	asr 	counterL , origK, #3		// counterL = origK / 8 (K loop unrolled by 8)
	cmp	counterL , #0
	ble	.Ldgemm_kernel_L1_M2_40

// Main K loop for the 2x1 edge tile, unrolled 8x.
.Ldgemm_kernel_L1_M2_22:

	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB

	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB
	KERNEL2x1_SUB

	subs	counterL, counterL, #1
	bgt	.Ldgemm_kernel_L1_M2_22


.Ldgemm_kernel_L1_M2_40:

	ands	counterL , origK, #7		// counterL = origK % 8
	ble	.Ldgemm_kernel_L1_M2_100

// Remainder K loop: one step per iteration.
.Ldgemm_kernel_L1_M2_42:

	KERNEL2x1_SUB

	subs	counterL, counterL, #1
	bgt	.Ldgemm_kernel_L1_M2_42

.Ldgemm_kernel_L1_M2_100:

	SAVE2x1					// write the finished 2x1 tile back to C

.Ldgemm_kernel_L1_M2_END:


.Ldgemm_kernel_L1_M1_BEGIN:

	tst	counterI, #1			// bit 0 of M set: one final row remains
	ble	.Ldgemm_kernel_L1_END

.Ldgemm_kernel_L1_M1_20:

	INIT1x1					// zero the 1x1 accumulator

	mov	pB, origPB			// rewind pB to the start of the packed B panel

	asr 	counterL , origK, #3		// counterL = origK / 8 (K loop unrolled by 8)
	cmp	counterL , #0
	ble	.Ldgemm_kernel_L1_M1_40

// Main K loop for the final 1x1 element, unrolled 8x.
.Ldgemm_kernel_L1_M1_22:
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB

	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB

	subs	counterL, counterL, #1
	bgt	.Ldgemm_kernel_L1_M1_22


.Ldgemm_kernel_L1_M1_40:

	ands	counterL , origK, #7		// counterL = origK % 8
	ble	.Ldgemm_kernel_L1_M1_100

// Remainder K loop: one step per iteration.
.Ldgemm_kernel_L1_M1_42:

	KERNEL1x1_SUB

	subs	counterL, counterL, #1
	bgt	.Ldgemm_kernel_L1_M1_42

.Ldgemm_kernel_L1_M1_100:

	SAVE1x1					// write the finished 1x1 result back to C


.Ldgemm_kernel_L1_END:


// Function exit: return 0 and restore the registers saved by the
// prologue (above this excerpt) from the 11*16 = 176-byte stack frame.
.Ldgemm_kernel_L999:
	mov	x0, #0				// set return value
	ldp	d8, d9, [sp, #(0 * 16)]		// d8-d15 are callee-saved under AAPCS64
	ldp	d10, d11, [sp, #(1 * 16)]
	ldp	d12, d13, [sp, #(2 * 16)]
	ldp	d14, d15, [sp, #(3 * 16)]
	ldp	d16, d17, [sp, #(4 * 16)]	// d16/d17 are caller-saved; restore presumably mirrors the prologue -- confirm against it
	ldp	x18, x19, [sp, #(5 * 16)]	// NOTE(review): x18 is the platform-reserved register on some OSes; pairing must match the prologue's stp
	ldp	x20, x21, [sp, #(6 * 16)]	// x19-x28 are callee-saved under AAPCS64
	ldp	x22, x23, [sp, #(7 * 16)]
	ldp	x24, x25, [sp, #(8 * 16)]
	ldp	x26, x27, [sp, #(9 * 16)]
	ldr	x28, [sp, #(10 * 16)]
	add	sp, sp, #(11*16)		// release the register save area
	ret

	EPILOGUE

