1/***************************************************************************
2Copyright (c) 2016, The OpenBLAS Project
3All rights reserved.
4Redistribution and use in source and binary forms, with or without
5modification, are permitted provided that the following conditions are
6met:
71. Redistributions of source code must retain the above copyright
8notice, this list of conditions and the following disclaimer.
92. Redistributions in binary form must reproduce the above copyright
10notice, this list of conditions and the following disclaimer in
11the documentation and/or other materials provided with the
12distribution.
133. Neither the name of the OpenBLAS project nor the names of
14its contributors may be used to endorse or promote products
15derived from this software without specific prior written permission.
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*****************************************************************************/
27
28#define ASSEMBLER
29#include "common.h"
30
/*
 * Register map. Comments sit on their own lines on purpose: text trailing a
 * #define could end up inside the macro expansion.
 */

/* Incoming arguments, held in x0-x4. */
/* M: row count; drives the row loops (M / 8 blocks, then M & 7 single rows). */
#define	M	x0
/* N: column count; drives the panel loops (N / 8 panels, then bits 2, 1, 0). */
#define	N	x1
/* A00: address of the first column of the next panel to be packed. */
#define	A00	x2
/* LDA: distance between adjacent columns; shifted left by 3 on entry. */
#define	LDA	x3
/* B00: write cursor into the packed destination. */
#define	B00	x4

/* Read cursors, one per column of the panel currently being packed. */
#define	A01	x5
#define	A02	x6
#define	A03	x7
#define	A04	x8
#define	A05	x9
#define	A06	x10
#define	A07	x11
#define	A08	x12

/* I counts row iterations, J counts 8-column panels. */
#define	I	x13
#define	J	x14

/* Scratch register aliases. */
#define	TEMP1	x15
#define	TEMP2	x16

/* Byte offset named by the prfm hints in the copy macros (all disabled). */
#define	A_PREFETCH	2560
53
54/**************************************************************************************
55* Macro definitions
56**************************************************************************************/
57
/*
 * SAVE_REGS / RESTORE_REGS
 *
 * Spill and reload the callee-saved state that the code in this file
 * overwrites. The copy macros load into and build results in v8-v15, and
 * AAPCS64 requires the low 64 bits of those registers (d8-d15) to be
 * preserved across a call. Everything else this file touches (x0-x16
 * through the register aliases, v0-v7 and v16-v31) is caller-saved, so it
 * is not spilled. In particular x18, which some platforms reserve, is
 * neither read nor written.
 *
 * Frame: 4 * 16 = 64 bytes, which keeps sp 16-byte aligned.
 *   [sp +  0] d8,  d9
 *   [sp + 16] d10, d11
 *   [sp + 32] d12, d13
 *   [sp + 48] d14, d15
 *
 * The two macros must be used as a pair; they share the layout above.
 */
.macro SAVE_REGS
	stp	d8, d9, [sp, #-(4 * 16)]!		// allocate the frame and store the first pair
	stp	d10, d11, [sp, #(1 * 16)]
	stp	d12, d13, [sp, #(2 * 16)]
	stp	d14, d15, [sp, #(3 * 16)]
.endm

.macro RESTORE_REGS
	ldp	d14, d15, [sp, #(3 * 16)]
	ldp	d12, d13, [sp, #(2 * 16)]
	ldp	d10, d11, [sp, #(1 * 16)]
	ldp	d8, d9, [sp], #(4 * 16)			// reload the first pair and release the frame
.endm
87
88/*************************************************************************************/
89
/*
 * COPY8x8: pack 8 rows of an 8-column panel.
 *
 * Expands COPY4x8 twice, so its effects are those of COPY4x8 applied twice
 * in a row: A01..A08 each advance by 64 bytes and B00 advances by 512 bytes.
 * No prefetch is issued; the prfm PLDL1KEEP hints for A01..A08 at offset
 * A_PREFETCH remain disabled.
 */
.macro COPY8x8
	.rept	2
	COPY4x8
	.endr
.endm
103
/*
 * COPY4x8: pack 4 rows of an 8-column panel (4x8 transpose of 8-byte elements).
 *
 * Reads 4 consecutive elements from each of A01..A08 and writes them to B00
 * row by row: 8 elements of row 0 (one per column, A01 first), then rows 1,
 * 2 and 3. All loads are issued before the first store.
 *
 * Each pair of adjacent columns is interleaved with zip1 (elements 0 of both
 * sources) and zip2 (elements 1 of both sources), which writes the whole
 * destination register in one instruction instead of two lane inserts.
 *
 * Register use:
 *   v0-v15   raw column data (two q registers per column)
 *   v16-v19  row 0,  v20-v23  row 1,  v24-v27  row 2,  v28-v31  row 3
 *
 * On exit: A01..A08 += 32 bytes each, B00 += 256 bytes.
 * Overwrites v8-v15; the caller of this macro is expected to have saved d8-d15.
 */
.macro COPY4x8
	ldp	q0, q1, [A01], #32
	ldp	q2, q3, [A02], #32
	zip1	v16.2d, v0.2d, v2.2d			// row 0, columns 1-2
	zip2	v20.2d, v0.2d, v2.2d			// row 1, columns 1-2
	zip1	v24.2d, v1.2d, v3.2d			// row 2, columns 1-2
	zip2	v28.2d, v1.2d, v3.2d			// row 3, columns 1-2

	ldp	q4, q5, [A03], #32
	ldp	q6, q7, [A04], #32
	zip1	v17.2d, v4.2d, v6.2d			// row 0, columns 3-4
	zip2	v21.2d, v4.2d, v6.2d			// row 1, columns 3-4
	zip1	v25.2d, v5.2d, v7.2d			// row 2, columns 3-4
	zip2	v29.2d, v5.2d, v7.2d			// row 3, columns 3-4

	ldp	q8, q9, [A05], #32
	ldp	q10, q11, [A06], #32
	zip1	v18.2d, v8.2d, v10.2d			// row 0, columns 5-6
	zip2	v22.2d, v8.2d, v10.2d			// row 1, columns 5-6
	zip1	v26.2d, v9.2d, v11.2d			// row 2, columns 5-6
	zip2	v30.2d, v9.2d, v11.2d			// row 3, columns 5-6

	ldp	q12, q13, [A07], #32
	ldp	q14, q15, [A08], #32
	zip1	v19.2d, v12.2d, v14.2d			// row 0, columns 7-8
	zip2	v23.2d, v12.2d, v14.2d			// row 1, columns 7-8
	zip1	v27.2d, v13.2d, v15.2d			// row 2, columns 7-8
	zip2	v31.2d, v13.2d, v15.2d			// row 3, columns 7-8

	st1	{v16.2d, v17.2d, v18.2d, v19.2d}, [B00], #64
	st1	{v20.2d, v21.2d, v22.2d, v23.2d}, [B00], #64
	st1	{v24.2d, v25.2d, v26.2d, v27.2d}, [B00], #64
	st1	{v28.2d, v29.2d, v30.2d, v31.2d}, [B00], #64
.endm
165
/*
 * COPY1x8: pack a single row of an 8-column panel.
 *
 * Loads one 8-byte element from each of A01..A08 and stores the eight of
 * them contiguously at B00, A01 first. All loads precede the stores.
 * No prefetch is issued; the prfm hints for A01..A08 remain disabled.
 *
 * On exit: A01..A08 += 8 bytes each, B00 += 64 bytes. Overwrites v0-v7.
 */
.macro COPY1x8
	ldr	d0, [A01], #8
	ldr	d1, [A02], #8
	ldr	d2, [A03], #8
	ldr	d3, [A04], #8
	ldr	d4, [A05], #8
	ldr	d5, [A06], #8
	ldr	d6, [A07], #8
	ldr	d7, [A08], #8

	stp	d0, d1, [B00]
	stp	d2, d3, [B00, #16]
	stp	d4, d5, [B00, #32]
	stp	d6, d7, [B00, #48]
	add	B00, B00, #64
.endm
191
192
193/*************************************************************************************/
194
/*
 * COPY8x4: pack 8 rows of a 4-column panel.
 *
 * Works in two halves of 4 rows. Each half reads 4 consecutive elements from
 * each of A01..A04 and writes them to B00 row by row, 4 elements per row with
 * A01 first. The loads of the second half are issued only after the stores
 * of the first half.
 *
 * Adjacent columns are interleaved with zip1/zip2 (one full-register write
 * each) rather than per-lane inserts.
 *
 * Register use:
 *   first half:   v0-v7 raw data,   v8-v15  packed rows 0-3
 *   second half:  v16-v23 raw data, v24-v31 packed rows 4-7
 *
 * On exit: A01..A04 += 64 bytes each, B00 += 256 bytes.
 * Overwrites v8-v15; the caller of this macro is expected to have saved d8-d15.
 * No prefetch is issued; the prfm hints for A01..A04 remain disabled.
 */
.macro COPY8x4
	// rows 0-3
	ldp	q0, q1, [A01], #32
	ldp	q2, q3, [A02], #32
	zip1	v8.2d, v0.2d, v2.2d			// row 0, columns 1-2
	zip2	v10.2d, v0.2d, v2.2d			// row 1, columns 1-2
	zip1	v12.2d, v1.2d, v3.2d			// row 2, columns 1-2
	zip2	v14.2d, v1.2d, v3.2d			// row 3, columns 1-2

	ldp	q4, q5, [A03], #32
	ldp	q6, q7, [A04], #32
	zip1	v9.2d, v4.2d, v6.2d			// row 0, columns 3-4
	zip2	v11.2d, v4.2d, v6.2d			// row 1, columns 3-4
	zip1	v13.2d, v5.2d, v7.2d			// row 2, columns 3-4
	zip2	v15.2d, v5.2d, v7.2d			// row 3, columns 3-4

	st1	{v8.2d, v9.2d, v10.2d, v11.2d}, [B00], #64
	st1	{v12.2d, v13.2d, v14.2d, v15.2d}, [B00], #64

	// rows 4-7
	ldp	q16, q17, [A01], #32
	ldp	q18, q19, [A02], #32
	zip1	v24.2d, v16.2d, v18.2d			// row 4, columns 1-2
	zip2	v26.2d, v16.2d, v18.2d			// row 5, columns 1-2
	zip1	v28.2d, v17.2d, v19.2d			// row 6, columns 1-2
	zip2	v30.2d, v17.2d, v19.2d			// row 7, columns 1-2

	ldp	q20, q21, [A03], #32
	ldp	q22, q23, [A04], #32
	zip1	v25.2d, v20.2d, v22.2d			// row 4, columns 3-4
	zip2	v27.2d, v20.2d, v22.2d			// row 5, columns 3-4
	zip1	v29.2d, v21.2d, v23.2d			// row 6, columns 3-4
	zip2	v31.2d, v21.2d, v23.2d			// row 7, columns 3-4

	st1	{v24.2d, v25.2d, v26.2d, v27.2d}, [B00], #64
	st1	{v28.2d, v29.2d, v30.2d, v31.2d}, [B00], #64
.endm
261
/*
 * COPY1x4: pack a single row of a 4-column panel.
 *
 * Loads one 8-byte element from each of A01..A04 and stores the four of them
 * contiguously at B00, A01 first. All loads precede the stores.
 * No prefetch is issued; the prfm hints for A01..A04 remain disabled.
 *
 * On exit: A01..A04 += 8 bytes each, B00 += 32 bytes. Overwrites v0-v3.
 */
.macro COPY1x4
	ldr	d0, [A01], #8
	ldr	d1, [A02], #8
	ldr	d2, [A03], #8
	ldr	d3, [A04], #8

	stp	d0, d1, [B00]
	stp	d2, d3, [B00, #16]
	add	B00, B00, #32
.endm
276
277/*************************************************************************************/
278
/*
 * COPY8x2: pack 8 rows of a 2-column panel.
 *
 * Reads 8 consecutive elements from A01 and from A02 and writes them to B00
 * interleaved: row 0 of A01, row 0 of A02, row 1 of A01, row 1 of A02, and
 * so on. All loads precede the stores.
 *
 * The interleave is done with zip1/zip2 (one full-register write each)
 * rather than per-lane inserts.
 *
 * Register use: v0-v3 hold the A01 data, v4-v7 the A02 data, and v8-v15 the
 * packed rows 0-7 (one row per register).
 *
 * On exit: A01, A02 += 64 bytes each, B00 += 128 bytes.
 * Overwrites v8-v15; the caller of this macro is expected to have saved d8-d15.
 * No prefetch is issued; the prfm hints for A01 and A02 remain disabled.
 */
.macro COPY8x2
	ldp	q0, q1, [A01], #32
	ldp	q2, q3, [A01], #32
	ldp	q4, q5, [A02], #32
	ldp	q6, q7, [A02], #32

	zip1	v8.2d, v0.2d, v4.2d			// row 0
	zip2	v9.2d, v0.2d, v4.2d			// row 1
	zip1	v10.2d, v1.2d, v5.2d			// row 2
	zip2	v11.2d, v1.2d, v5.2d			// row 3
	zip1	v12.2d, v2.2d, v6.2d			// row 4
	zip2	v13.2d, v2.2d, v6.2d			// row 5
	zip1	v14.2d, v3.2d, v7.2d			// row 6
	zip2	v15.2d, v3.2d, v7.2d			// row 7

	st1	{v8.2d, v9.2d, v10.2d, v11.2d}, [B00], #64
	st1	{v12.2d, v13.2d, v14.2d, v15.2d}, [B00], #64
.endm
312
313
/*
 * COPY1x2: pack a single row of a 2-column panel.
 *
 * Loads one 8-byte element from A01 and one from A02 and stores them as a
 * pair at B00, A01 first.
 * No prefetch is issued; the prfm hints for A01 and A02 remain disabled.
 *
 * On exit: A01, A02 += 8 bytes each, B00 += 16 bytes. Overwrites v0 and v1.
 */
.macro COPY1x2
	ldr	d0, [A01], #8
	ldr	d1, [A02], #8
	stp	d0, d1, [B00], #16			// store the pair, then step the cursor
.endm
324
325/*************************************************************************************/
326
/*
 * COPY8x1: pack 8 rows of a single column.
 *
 * A straight 64-byte copy from A01 to B00; with one column there is nothing
 * to interleave. The whole load completes before the store.
 * No prefetch is issued; the prfm hint for A01 remains disabled.
 *
 * On exit: A01 += 64 bytes, B00 += 64 bytes. Overwrites v0-v3.
 */
.macro COPY8x1
	ld1	{v0.2d, v1.2d, v2.2d, v3.2d}, [A01], #64
	st1	{v0.2d, v1.2d, v2.2d, v3.2d}, [B00], #64
.endm
335
336
/*
 * COPY1x1: copy one 8-byte element from A01 to B00.
 *
 * The element is moved as an opaque 64-bit value through the TEMP1 scratch
 * register.
 * No prefetch is issued; the prfm hint for A01 remains disabled.
 *
 * On exit: A01 += 8 bytes, B00 += 8 bytes. Overwrites TEMP1.
 */
.macro COPY1x1
	ldr	TEMP1, [A01], #8
	str	TEMP1, [B00], #8
.endm
343
344/**************************************************************************************
345* End of macro definitions
346**************************************************************************************/
347
/**************************************************************************************
* Entry point. The symbol itself is produced by the PROLOGUE and EPILOGUE macros,
* which are not defined in this file.
*
* Packs a matrix into a contiguous buffer. The source is read as N columns whose
* starts are LDA elements apart; elements are 8 bytes wide. Columns are consumed
* in panels of 8, followed by at most one panel each of 4, 2 and 1 columns
* (selected by bits 2, 1 and 0 of N). For a panel of width w the destination
* receives w consecutive elements per row, one from each column of the panel,
* with rows in ascending order. Panels follow each other in the destination
* without gaps.
*
* In:   x0 = M (rows), x1 = N (columns), x2 = source, x3 = LDA (in elements),
*       x4 = destination
* Out:  x0 = 0
*
* Rows are handled 8 at a time by the COPY8xW macros and the remaining M & 7
* rows one at a time by the COPY1xW macros.
**************************************************************************************/

	PROLOGUE

	.align 5

	SAVE_REGS

	lsl	LDA, LDA, #3					// column stride: elements -> bytes

/*********************************************************************************************/
/* Panels of 8 columns                                                                        */

.Ldgemm_ncopy_L8_BEGIN:

	asr	J, N, #3					// J = number of full 8-column panels
	cmp	J, #0
	b.le	.Ldgemm_ncopy_L4_BEGIN

.Ldgemm_ncopy_L8_M8_BEGIN:

	mov	A01, A00					// one read cursor per column of the panel
	add	A02, A01, LDA
	add	A03, A02, LDA
	add	A04, A03, LDA
	add	A05, A04, LDA
	add	A06, A05, LDA
	add	A07, A06, LDA
	add	A08, A07, LDA
	add	A00, A08, LDA					// A00 = first column of the next panel

	asr	I, M, #3					// I = number of 8-row blocks
	cmp	I, #0
	b.le	.Ldgemm_ncopy_L8_M8_40

.Ldgemm_ncopy_L8_M8_20:

	COPY8x8
	subs	I, I, #1
	b.ne	.Ldgemm_ncopy_L8_M8_20

.Ldgemm_ncopy_L8_M8_40:

	and	I, M, #7					// leftover rows, always in 0..7
	cbz	I, .Ldgemm_ncopy_L8_M8_END

.Ldgemm_ncopy_L8_M8_60:

	COPY1x8
	subs	I, I, #1
	b.ne	.Ldgemm_ncopy_L8_M8_60

.Ldgemm_ncopy_L8_M8_END:

	subs	J, J, #1					// next 8-column panel
	b.ne	.Ldgemm_ncopy_L8_M8_BEGIN

/*********************************************************************************************/
/* Optional panel of 4 columns                                                                */

.Ldgemm_ncopy_L4_BEGIN:

	tst	N, #7						// nothing left beyond the 8-wide panels
	b.eq	.Ldgemm_ncopy_L999

	tbz	N, #2, .Ldgemm_ncopy_L2_BEGIN			// bit 2 clear: no 4-column panel

.Ldgemm_ncopy_L4_M8_BEGIN:

	mov	A01, A00
	add	A02, A01, LDA
	add	A03, A02, LDA
	add	A04, A03, LDA
	add	A00, A04, LDA					// A00 = first column after this panel

	asr	I, M, #3					// I = number of 8-row blocks
	cmp	I, #0
	b.le	.Ldgemm_ncopy_L4_M8_40

.Ldgemm_ncopy_L4_M8_20:

	COPY8x4
	subs	I, I, #1
	b.ne	.Ldgemm_ncopy_L4_M8_20

.Ldgemm_ncopy_L4_M8_40:

	and	I, M, #7					// leftover rows, always in 0..7
	cbz	I, .Ldgemm_ncopy_L4_M8_END

.Ldgemm_ncopy_L4_M8_60:

	COPY1x4
	subs	I, I, #1
	b.ne	.Ldgemm_ncopy_L4_M8_60

.Ldgemm_ncopy_L4_M8_END:

/*********************************************************************************************/
/* Optional panel of 2 columns                                                                */

.Ldgemm_ncopy_L2_BEGIN:

	tst	N, #3						// nothing left below the 4-column panel
	b.eq	.Ldgemm_ncopy_L999

	tbz	N, #1, .Ldgemm_ncopy_L1_BEGIN			// bit 1 clear: no 2-column panel

.Ldgemm_ncopy_L2_M8_BEGIN:

	mov	A01, A00
	add	A02, A01, LDA
	add	A00, A02, LDA					// A00 = first column after this panel

	asr	I, M, #3					// I = number of 8-row blocks
	cmp	I, #0
	b.le	.Ldgemm_ncopy_L2_M8_40

.Ldgemm_ncopy_L2_M8_20:

	COPY8x2
	subs	I, I, #1
	b.ne	.Ldgemm_ncopy_L2_M8_20

.Ldgemm_ncopy_L2_M8_40:

	and	I, M, #7					// leftover rows, always in 0..7
	cbz	I, .Ldgemm_ncopy_L2_M8_END

.Ldgemm_ncopy_L2_M8_60:

	COPY1x2
	subs	I, I, #1
	b.ne	.Ldgemm_ncopy_L2_M8_60

.Ldgemm_ncopy_L2_M8_END:

/*********************************************************************************************/
/* Optional last single column                                                                */

.Ldgemm_ncopy_L1_BEGIN:

	tbz	N, #0, .Ldgemm_ncopy_L999			// bit 0 clear: no single column left

.Ldgemm_ncopy_L1_M8_BEGIN:

	mov	A01, A00

	asr	I, M, #3					// I = number of 8-row blocks
	cmp	I, #0
	b.le	.Ldgemm_ncopy_L1_M8_40

.Ldgemm_ncopy_L1_M8_20:

	COPY8x1
	subs	I, I, #1
	b.ne	.Ldgemm_ncopy_L1_M8_20

.Ldgemm_ncopy_L1_M8_40:

	and	I, M, #7					// leftover rows, always in 0..7
	cbz	I, .Ldgemm_ncopy_L1_M8_END

.Ldgemm_ncopy_L1_M8_60:

	COPY1x1
	subs	I, I, #1
	b.ne	.Ldgemm_ncopy_L1_M8_60

.Ldgemm_ncopy_L1_M8_END:

/*********************************************************************************************/

.Ldgemm_ncopy_L999:

	mov	x0, #0						// return value is always 0
	RESTORE_REGS
	ret

	EPILOGUE
544
545