/**************************************************************************************************
*                                                                                                 *
* This file is part of BLASFEO.                                                                   *
*                                                                                                 *
* BLASFEO -- BLAS For Embedded Optimization.                                                      *
* Copyright (C) 2019 by Gianluca Frison.                                                          *
* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
* All rights reserved.                                                                            *
*                                                                                                 *
* The 2-Clause BSD License                                                                        *
*                                                                                                 *
* Redistribution and use in source and binary forms, with or without                              *
* modification, are permitted provided that the following conditions are met:                     *
*                                                                                                 *
* 1. Redistributions of source code must retain the above copyright notice, this                  *
*    list of conditions and the following disclaimer.                                             *
* 2. Redistributions in binary form must reproduce the above copyright notice,                    *
*    this list of conditions and the following disclaimer in the documentation                    *
*    and/or other materials provided with the distribution.                                       *
*                                                                                                 *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND                 *
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED                   *
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE                          *
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR                 *
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES                  *
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;                    *
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND                     *
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT                      *
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS                   *
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                                    *
*                                                                                                 *
* Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de                             *
*                                                                                                 *
**************************************************************************************************/

#if defined(OS_LINUX) | defined(OS_MAC)

//#define STACKSIZE 96
#define STACKSIZE 64
#define ARG1  %rdi
#define ARG2  %rsi
#define ARG3  %rdx
#define ARG4  %rcx
#define ARG5  %r8
#define ARG6  %r9
#define ARG7  STACKSIZE +  8(%rsp)
#define ARG8  STACKSIZE + 16(%rsp)
#define ARG9  STACKSIZE + 24(%rsp)
#define ARG10 STACKSIZE + 32(%rsp)
#define ARG11 STACKSIZE + 40(%rsp)
#define ARG12 STACKSIZE + 48(%rsp)
#define ARG13 STACKSIZE + 56(%rsp)
#define ARG14 STACKSIZE + 64(%rsp)
#define ARG15 STACKSIZE + 72(%rsp)
#define ARG16 STACKSIZE + 80(%rsp)
#define ARG17 STACKSIZE + 88(%rsp)
#define ARG18 STACKSIZE + 96(%rsp)
#define PROLOGUE \
	subq	$STACKSIZE, %rsp; \
	movq	%rbx,   (%rsp); \
	movq	%rbp,  8(%rsp); \
	movq	%r12, 16(%rsp); \
	movq	%r13, 24(%rsp); \
	movq	%r14, 32(%rsp); \
	movq	%r15, 40(%rsp); \
	vzeroupper;
#define EPILOGUE \
	vzeroupper; \
	movq	  (%rsp), %rbx; \
	movq	 8(%rsp), %rbp; \
	movq	16(%rsp), %r12; \
	movq	24(%rsp), %r13; \
	movq	32(%rsp), %r14; \
	movq	40(%rsp), %r15; \
	addq	$STACKSIZE, %rsp;

#elif defined(OS_WINDOWS)

#define STACKSIZE 256
#define ARG1  %rcx
#define ARG2  %rdx
#define ARG3  %r8
#define ARG4  %r9
#define ARG5  STACKSIZE + 40(%rsp)
#define ARG6  STACKSIZE + 48(%rsp)
#define ARG7  STACKSIZE + 56(%rsp)
#define ARG8  STACKSIZE + 64(%rsp)
#define ARG9  STACKSIZE + 72(%rsp)
#define ARG10 STACKSIZE + 80(%rsp)
#define ARG11 STACKSIZE + 88(%rsp)
#define ARG12 STACKSIZE + 96(%rsp)
#define ARG13 STACKSIZE + 104(%rsp)
#define ARG14 STACKSIZE + 112(%rsp)
#define ARG15 STACKSIZE + 120(%rsp)
#define ARG16 STACKSIZE + 128(%rsp)
#define ARG17 STACKSIZE + 136(%rsp)
#define ARG18 STACKSIZE + 144(%rsp)
#define PROLOGUE \
	subq	$STACKSIZE, %rsp; \
	movq	%rbx,   (%rsp); \
	movq	%rbp,  8(%rsp); \
	movq	%r12, 16(%rsp); \
	movq	%r13, 24(%rsp); \
	movq	%r14, 32(%rsp); \
	movq	%r15, 40(%rsp); \
	movq	%rdi, 48(%rsp); \
	movq	%rsi, 56(%rsp); \
	vmovups	%xmm6, 64(%rsp); \
	vmovups	%xmm7, 80(%rsp); \
	vmovups	%xmm8, 96(%rsp); \
	vmovups	%xmm9, 112(%rsp); \
	vmovups	%xmm10, 128(%rsp); \
	vmovups	%xmm11, 144(%rsp); \
	vmovups	%xmm12, 160(%rsp); \
	vmovups	%xmm13, 176(%rsp); \
	vmovups	%xmm14, 192(%rsp); \
	vmovups	%xmm15, 208(%rsp); \
	vzeroupper;
#define EPILOGUE \
	vzeroupper; \
	movq	  (%rsp), %rbx; \
	movq	 8(%rsp), %rbp; \
	movq	16(%rsp), %r12; \
	movq	24(%rsp), %r13; \
	movq	32(%rsp), %r14; \
	movq	40(%rsp), %r15; \
	movq	48(%rsp), %rdi; \
	movq	56(%rsp), %rsi; \
	vmovups	64(%rsp), %xmm6; \
	vmovups	80(%rsp), %xmm7; \
	vmovups	96(%rsp), %xmm8; \
	vmovups	112(%rsp), %xmm9; \
	vmovups	128(%rsp), %xmm10; \
	vmovups	144(%rsp), %xmm11; \
	vmovups	160(%rsp), %xmm12; \
	vmovups	176(%rsp), %xmm13; \
	vmovups	192(%rsp), %xmm14; \
	vmovups	208(%rsp), %xmm15; \
	addq	$STACKSIZE, %rsp;

#else

#error wrong OS

#endif



#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.text
#elif defined(OS_MAC)
	.section	__TEXT,__text,regular,pure_instructions
#endif




// common inner routine with file scope
//
// input arguments:
// r10d  <- k
// r11   <- A
// r12   <- B
// r13   <- 4*sdb*sizeof(double)
// r14   <- dirty
// ymm0  <- [d00 d10 d20 d30]
// ymm1  <- [d01 d11 d21 d31]
//
// output arguments:
// r10d  <- 0
// r11   <- A+4*k*sizeof(double)
// r12   <- B+(k/4)*sdb*sizeof(double)+(k%4)
// r13   <- 4*sdb*sizeof(double)
// r14   <- dirty
// ymm0  <- [d00 d10 d20 d30]
// ymm1  <- [d01 d11 d21 d31]
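//
// note (explanatory sketch, not part of the original comments): in C-like
// pseudocode the routine below roughly computes, for l = 0 .. k-1,
//   d[i][0] += A[i+4*l] * B[(l%4) + 0*4 + (l/4)*4*sdb]   // accumulated in ymm0
//   d[i][1] += A[i+4*l] * B[(l%4) + 1*4 + (l/4)*4*sdb]   // accumulated in ymm1
// for i = 0..3, i.e. a 4x2 block of A*B with A and B in 4-high panel-major
// storage; the indices here illustrate the addressing only and are not library API.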

#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_DGEMM_ADD_NN_4X2_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_dgemm_add_nn_4x2_lib4, @function
inner_kernel_dgemm_add_nn_4x2_lib4:
#elif defined(OS_MAC)
_inner_kernel_dgemm_add_nn_4x2_lib4:
#elif defined(OS_WINDOWS)
	.def inner_kernel_dgemm_add_nn_4x2_lib4; .scl 2; .type 32; .endef
inner_kernel_dgemm_add_nn_4x2_lib4:
#endif
#endif

	cmpl	$0, %r10d
	jle		2f // return

	// preload
	vmovapd 0(%r11), %ymm8 // A0[0]

	cmpl	$4, %r10d
	jle		0f // consider clean-up loop

	// main loop
	.p2align 3
1: // main loop

	prefetcht0	0(%r12, %r13, 2) // software prefetch

	// unroll 0
	vbroadcastsd	0(%r12), %ymm12
	vmulpd			%ymm8, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm0, %ymm0
	vmovapd			32(%r11), %ymm10 // A0

	vbroadcastsd	32(%r12), %ymm12
	vmulpd			%ymm8, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm1, %ymm1
	subl	$4, %r10d

	// unroll 1
	vbroadcastsd	8(%r12), %ymm12
	vmulpd			%ymm10, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm0, %ymm0
	vmovapd			64(%r11), %ymm8 // A0

	vbroadcastsd	40(%r12), %ymm12
	vmulpd			%ymm10, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm1, %ymm1

	// unroll 2
	vbroadcastsd	16(%r12), %ymm12
	vmulpd			%ymm8, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm0, %ymm0
	vmovapd			96(%r11), %ymm10 // A0

	vbroadcastsd	48(%r12), %ymm12
	vmulpd			%ymm8, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm1, %ymm1
	addq	$128, %r11

	// unroll 3
	vbroadcastsd	24(%r12), %ymm12
	vmulpd			%ymm10, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm0, %ymm0
	vmovapd			0(%r11), %ymm8 // A0

	vbroadcastsd	56(%r12), %ymm12
	vmulpd			%ymm10, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm1, %ymm1
	addq	%r13, %r12

	cmpl	$4, %r10d
	jg		1b // main loop

0: // consider clean4-up

	cmpl	$3, %r10d
	jle		4f // clean1

	// unroll 0
	vbroadcastsd	0(%r12), %ymm12
	vmulpd			%ymm8, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm0, %ymm0
	vmovapd			32(%r11), %ymm10 // A0

	vbroadcastsd	32(%r12), %ymm12
	vmulpd			%ymm8, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm1, %ymm1
	subl	$4, %r10d

	// unroll 1
	vbroadcastsd	8(%r12), %ymm12
	vmulpd			%ymm10, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm0, %ymm0
	vmovapd			64(%r11), %ymm8 // A0

	vbroadcastsd	40(%r12), %ymm12
	vmulpd			%ymm10, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm1, %ymm1

	// unroll 2
	vbroadcastsd	16(%r12), %ymm12
	vmulpd			%ymm8, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm0, %ymm0
	vmovapd			96(%r11), %ymm10 // A0

	vbroadcastsd	48(%r12), %ymm12
	vmulpd			%ymm8, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm1, %ymm1
	addq	$128, %r11

	// unroll 3
	vbroadcastsd	24(%r12), %ymm12
	vmulpd			%ymm10, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm0, %ymm0
//	vmovapd			0(%r11), %ymm8 // A0

	vbroadcastsd	56(%r12), %ymm12
	vmulpd			%ymm10, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm1, %ymm1
	addq	%r13, %r12

	jmp		2f


4: // consider clean1-up loop

	cmpl	$0, %r10d
	jle		2f // return

	// clean-up loop
3: // clean up loop

	vmovapd			0(%r11), %ymm8 // A0[0]
	vbroadcastsd	0(%r12), %ymm12
	vmulpd			%ymm8, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm0, %ymm0

	vbroadcastsd	32(%r12), %ymm12
	vmulpd			%ymm8, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm1, %ymm1
	addq	$32, %r11
	subl	$1, %r10d
	addq	$8, %r12

	cmpl	$0, %r10d
	jg		3b // clean up loop

2: // return

#if MACRO_LEVEL>=2
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_dgemm_add_nn_4x2_lib4, .-inner_kernel_dgemm_add_nn_4x2_lib4
#endif
#endif




// common inner routine with file scope
//
// input arguments:
// r10d  <- k
// r11   <- A
// r12   <- B
// r13   <- 4*sdb*sizeof(double)
//
// output arguments:
// r10d  <- 0
// r11   <- A+4*k*sizeof(double)
// r12   <- B+(k/4)*sdb*sizeof(double)+(k%4)
// r13   <- 4*sdb*sizeof(double)
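//
// note (explanatory sketch, not part of the original comments): the routine
// below accumulates a 2x4 block, roughly
//   d[i][j] += A[i+4*l] * B[(l%4) + j*4 + (l/4)*4*sdb]
// for i = 0..1, j = 0..3, l = 0..k-1, with the partial sums spread over
// ymm0..ymm3 and combined in the reduce step at label 2 before returning.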

#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_DGEMM_ADD_NN_2X4_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_dgemm_add_nn_2x4_lib4, @function
inner_kernel_dgemm_add_nn_2x4_lib4:
#elif defined(OS_MAC)
_inner_kernel_dgemm_add_nn_2x4_lib4:
#elif defined(OS_WINDOWS)
	.def inner_kernel_dgemm_add_nn_2x4_lib4; .scl 2; .type 32; .endef
inner_kernel_dgemm_add_nn_2x4_lib4:
#endif
#endif

	cmpl	$0, %r10d
	jle		5f // return

	// preload
	vbroadcastf128	0(%r11), %ymm11 // A
	vbroadcastf128	32(%r11), %ymm12 // A

	cmpl	$4, %r10d
	jle		0f // consider clean-up loop

	// main loop
	.p2align 3
1: // main loop

	prefetcht0	0(%r12, %r13, 2) // software prefetch
	prefetcht0	64(%r12, %r13, 2) // software prefetch

	// unroll 0 1
	vmovapd			0(%r12), %ymm13
	vmovupd			16(%r12), %ymm14
	vblendpd		$0x3, %ymm13, %ymm14, %ymm14
	vshufpd			$0x0, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm11, %ymm13, %ymm15
	vaddpd			%ymm0, %ymm15, %ymm0
	vbroadcastf128	64(%r11), %ymm9 // A
	vshufpd			$0xf, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm12, %ymm13, %ymm15
	vaddpd			%ymm1, %ymm15, %ymm1
	vmovapd			64(%r12), %ymm13
	vmovupd			80(%r12), %ymm14
	vblendpd		$0x3, %ymm13, %ymm14, %ymm14
	vshufpd			$0x0, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm11, %ymm13, %ymm15
	vaddpd			%ymm2, %ymm15, %ymm2
	vbroadcastf128	96(%r11), %ymm10 // A
	vshufpd			$0xf, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm12, %ymm13, %ymm15
	vaddpd			%ymm3, %ymm15, %ymm3

	// unroll 2 3
	vmovupd			16(%r12), %ymm13
	vmovapd			32(%r12), %ymm14
	addq	$128, %r11
	vblendpd		$0x3, %ymm13, %ymm14, %ymm14
	vshufpd			$0x0, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm9, %ymm13, %ymm15
	vaddpd			%ymm0, %ymm15, %ymm0
	vbroadcastf128	0(%r11), %ymm11 // A
	vshufpd			$0xf, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm10, %ymm13, %ymm15
	vaddpd			%ymm1, %ymm15, %ymm1
	vmovupd			80(%r12), %ymm13
	vmovapd			96(%r12), %ymm14
	vblendpd		$0x3, %ymm13, %ymm14, %ymm14
	vshufpd			$0x0, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm9, %ymm13, %ymm15
	vaddpd			%ymm2, %ymm15, %ymm2
	vbroadcastf128	32(%r11), %ymm12 // A
	vshufpd			$0xf, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm10, %ymm13, %ymm15
	vaddpd			%ymm3, %ymm15, %ymm3

	subl	$4, %r10d
	addq	%r13, %r12

	cmpl	$4, %r10d
	jg		1b // main loop


0: // consider clean4-up

	cmpl	$3, %r10d
	jle		4f // clean1

	// unroll 0 1
	vmovapd			0(%r12), %ymm13
	vmovupd			16(%r12), %ymm14
	vblendpd		$0x3, %ymm13, %ymm14, %ymm14
	vshufpd			$0x0, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm11, %ymm13, %ymm15
	vaddpd			%ymm0, %ymm15, %ymm0
	vbroadcastf128	64(%r11), %ymm9 // A
	vshufpd			$0xf, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm12, %ymm13, %ymm15
	vaddpd			%ymm1, %ymm15, %ymm1
	vmovapd			64(%r12), %ymm13
	vmovupd			80(%r12), %ymm14
	vblendpd		$0x3, %ymm13, %ymm14, %ymm14
	vshufpd			$0x0, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm11, %ymm13, %ymm15
	vaddpd			%ymm2, %ymm15, %ymm2
	vbroadcastf128	96(%r11), %ymm10 // A
	vshufpd			$0xf, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm12, %ymm13, %ymm15
	vaddpd			%ymm3, %ymm15, %ymm3

	// unroll 2 3
	vmovupd			16(%r12), %ymm13
	vmovapd			32(%r12), %ymm14
	addq	$128, %r11
	vblendpd		$0x3, %ymm13, %ymm14, %ymm14
	vshufpd			$0x0, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm9, %ymm13, %ymm15
	vaddpd			%ymm0, %ymm15, %ymm0
//	vbroadcastf128	0(%r11), %ymm11 // A
	vshufpd			$0xf, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm10, %ymm13, %ymm15
	vaddpd			%ymm1, %ymm15, %ymm1
	vmovupd			80(%r12), %ymm13
	vmovapd			96(%r12), %ymm14
	vblendpd		$0x3, %ymm13, %ymm14, %ymm14
	vshufpd			$0x0, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm9, %ymm13, %ymm15
	vaddpd			%ymm2, %ymm15, %ymm2
//	vbroadcastf128	32(%r11), %ymm12 // A
	vshufpd			$0xf, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm10, %ymm13, %ymm15
	vaddpd			%ymm3, %ymm15, %ymm3

	subl	$4, %r10d
	addq	%r13, %r12

	jmp		2f // return


4: // consider clean1-up loop

	cmpl	$0, %r10d
	jle		2f // return

	// clean-up loop
3: // clean up loop

	// unroll 0
	vbroadcastf128	0(%r11), %ymm11 // A
	vmovupd			0(%r12), %ymm13
	vmovupd			16(%r12), %ymm14
	vblendpd		$0x3, %ymm13, %ymm14, %ymm14
	vshufpd			$0x0, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm11, %ymm13, %ymm15
	vaddpd			%ymm0, %ymm15, %ymm0
	vmovupd			64(%r12), %ymm13
	vmovupd			80(%r12), %ymm14
	vblendpd		$0x3, %ymm13, %ymm14, %ymm14
	vshufpd			$0x0, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm11, %ymm13, %ymm15
	vaddpd			%ymm2, %ymm15, %ymm2

	addq	$32, %r11
	addq	$8, %r12
	subl	$1, %r10d

	cmpl	$0, %r10d
	jg		3b // clean up loop


2: // reduce
	vaddpd			%ymm0, %ymm1, %ymm0
	vextractf128	$0x1, %ymm0, %xmm1
	vaddpd			%ymm2, %ymm3, %ymm2
	vextractf128	$0x1, %ymm2, %xmm3

5: // return

#if MACRO_LEVEL>=2
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_dgemm_add_nn_2x4_lib4, .-inner_kernel_dgemm_add_nn_2x4_lib4
#endif
#endif





// common inner routine with file scope
//
// edge for B unaligned
//
// input arguments:
// r10   <- k
// r11   <- A
// r12   <- B
// r13   <- bs*sdb*sizeof(double)
// r14   <- offB
//
// output arguments:
// r10   <- k-(4-offB)
// r11   <- A+(4-offB)*bs*sizeof(double)
// r12   <- B-offB+bs*sdb*sizeof(double)
// r13   <- bs*sdb*sizeof(double)
// r14   <- offB
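//
// note (hedged worked example, not from the original comments): with offB=3
// and k>=1 the loop below runs kend = min(k, 4-offB) = 1 time, consuming the
// single B row left in the current 4-row panel; then, if k is not yet
// exhausted, B is advanced to the start of the next panel row before the
// aligned inner kernel takes over.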


#if MACRO_LEVEL>=1
	.macro INNER_EDGE_DGEMM_ADD_NN_4X2_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_edge_dgemm_add_nn_4x2_lib4, @function
inner_edge_dgemm_add_nn_4x2_lib4:
#elif defined(OS_MAC)
_inner_edge_dgemm_add_nn_4x2_lib4:
#elif defined(OS_WINDOWS)
	.def inner_edge_dgemm_add_nn_4x2_lib4; .scl 2; .type 32; .endef
inner_edge_dgemm_add_nn_4x2_lib4:
#endif
#endif

	cmpl			$0, %r14d // offset==0
	jle				2f // end

	cmpl			$0, %r10d // k==0
	jle				2f // end

	movl			$4, %r15d
	subl			%r14d, %r15d // 4-offsetB
	cmpl			%r10d, %r15d
//	jle				0f
//	movl			%r10d, %r15d // kend=min(k,4-offsetB)
//0:
	cmovgl			%r10d, %r15d // kend=min(k,4-offsetB)

	movl			%r14d, %eax
	sall			$3, %eax // offsetB*sizeof(double)
	addq			%rax, %r12 // B+offsetB*sizeof(double)

1:
	vmovapd			0(%r11), %ymm8
	vbroadcastsd	0(%r12), %ymm12
	vmulpd			%ymm8, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm0, %ymm0
	vbroadcastsd	32(%r12), %ymm12
	vmulpd			%ymm8, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm1, %ymm1

	subl			$1, %r10d // k-1
	subl			$1, %r15d // kend-1
	addq			$32, %r11 // A+1*bs*sizeof(double)
	addq			$8, %r12 // B+1*sizeof(double)

	cmpl			$0, %r15d
	jg				1b

	cmpl			$0, %r10d
	jle				2f // end

	addq			%r13, %r12
	subq			$32, %r12 // B+bs*(sdb-1)*sizeof(double)

2:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_edge_dgemm_add_nn_4x2_lib4, .-inner_edge_dgemm_add_nn_4x2_lib4
#endif
#endif





// common inner routine with file scope
//
// edge for B unaligned
//
// input arguments:
// r10   <- k
// r11   <- A
// r12   <- B
// r13   <- bs*sdb*sizeof(double)
// r14   <- offB
//
// output arguments:
// r10   <- k-(4-offB)
// r11   <- A+(4-offB)*bs*sizeof(double)
// r12   <- B-offB+bs*sdb*sizeof(double)
// r13   <- bs*sdb*sizeof(double)
// r14   <- offB


#if MACRO_LEVEL>=1
	.macro INNER_EDGE_DGEMM_ADD_NN_2X4_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_edge_dgemm_add_nn_2x4_lib4, @function
inner_edge_dgemm_add_nn_2x4_lib4:
#elif defined(OS_MAC)
_inner_edge_dgemm_add_nn_2x4_lib4:
#elif defined(OS_WINDOWS)
	.def inner_edge_dgemm_add_nn_2x4_lib4; .scl 2; .type 32; .endef
inner_edge_dgemm_add_nn_2x4_lib4:
#endif
#endif

	cmpl			$0, %r14d // offset==0
	jle				2f // end

	cmpl			$0, %r10d // k==0
	jle				2f // end

	movl			$4, %r15d
	subl			%r14d, %r15d // 4-offsetB
	cmpl			%r10d, %r15d
//	jle				0f
//	movl			%r10d, %r15d // kend=min(k,4-offsetB)
//0:
	cmovgl			%r10d, %r15d // kend=min(k,4-offsetB)

	movl			%r14d, %eax
	sall			$3, %eax // offsetB*sizeof(double)
	addq			%rax, %r12 // B+offsetB*sizeof(double)

1:
	vbroadcastf128	0(%r11), %ymm11 // A
	vmovupd			0(%r12), %ymm13
	vmovupd			16(%r12), %ymm14
	vblendpd		$0x3, %ymm13, %ymm14, %ymm14
	vshufpd			$0x0, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm11, %ymm13, %ymm15
	vaddpd			%ymm0, %ymm15, %ymm0
	vmovupd			64(%r12), %ymm13
	vmovupd			80(%r12), %ymm14
	vblendpd		$0x3, %ymm13, %ymm14, %ymm14
	vshufpd			$0x0, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm11, %ymm13, %ymm15
	vaddpd			%ymm2, %ymm15, %ymm2

	subl			$1, %r10d // k-1
	subl			$1, %r15d // kend-1
	addq			$32, %r11 // A+1*bs*sizeof(double)
	addq			$8, %r12 // B+1*sizeof(double)

	cmpl			$0, %r15d
	jg				1b

	cmpl			$0, %r10d
	jle				2f // end

	addq			%r13, %r12
	subq			$32, %r12 // B+bs*(sdb-1)*sizeof(double)

2:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_edge_dgemm_add_nn_2x4_lib4, .-inner_edge_dgemm_add_nn_2x4_lib4
#endif
#endif




// common inner routine with file scope
//
// scale for generic alpha and beta
//
// input arguments:
// r10   <- alpha
// r11   <- beta
// r12   <- C
// r13   <- 4*sdc*sizeof(double)
// r15   <- dirty
// ymm0  <- [d00 d10 d20 d30]
// ymm1  <- [d01 d11 d21 d31]
//
// output arguments:
// r10   <- alpha
// r11   <- beta
// r12   <- C
// r13   <- 4*sdc*sizeof(double)
// r15   <- dirty
// ymm0  <- [d00 d10 d20 d30]
// ymm1  <- [d01 d11 d21 d31]
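//
// note (explanatory comment, not part of the original): in scalar terms this
// computes acc(:,j) = alpha*acc(:,j) + beta*C(:,j) for j = 0,1; the comparison
// against 0.0 below skips the loads of C entirely when beta == 0.0.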

#if MACRO_LEVEL>=1
	.macro INNER_SCALE_AB_4X2_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_scale_ab_4x2_lib4, @function
inner_scale_ab_4x2_lib4:
#elif defined(OS_MAC)
_inner_scale_ab_4x2_lib4:
#elif defined(OS_WINDOWS)
	.def inner_scale_ab_4x2_lib4; .scl 2; .type 32; .endef
inner_scale_ab_4x2_lib4:
#endif
#endif


	// alpha
	vbroadcastsd	0(%r10), %ymm15

	vmulpd		%ymm0, %ymm15, %ymm0
	vmulpd		%ymm1, %ymm15, %ymm1

	// beta
	vbroadcastsd	0(%r11), %ymm14

	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0

	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
	je			0f // end

	// alg==1
	vmovapd		0(%r12), %ymm15
	vmulpd		%ymm15, %ymm14, %ymm15
	vaddpd		%ymm0, %ymm15, %ymm0
	vmovapd		32(%r12), %ymm15
	vmulpd		%ymm15, %ymm14, %ymm15
	vaddpd		%ymm1, %ymm15, %ymm1

0:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_scale_ab_4x2_lib4, .-inner_scale_ab_4x2_lib4
#endif
#endif




// common inner routine with file scope
//
// scale for generic alpha and beta
//
// input arguments:
// r10   <- alpha
// r11   <- beta
// r12   <- C
// xmm0  <- [d00 d10]
// xmm1  <- [d01 d11]
// xmm2  <- [d02 d12]
// xmm3  <- [d03 d13]
// xmm14 <- dirty
// xmm15 <- dirty
//
// output arguments:
// r10   <- alpha
// r11   <- beta
// r12   <- C
// xmm0  <- [d00 d10]
// xmm1  <- [d01 d11]
// xmm2  <- [d02 d12]
// xmm3  <- [d03 d13]
// xmm14 <- dirty
// xmm15 <- dirty

#if MACRO_LEVEL>=1
	.macro INNER_SCALE_AB_2X4_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_scale_ab_2x4_lib4, @function
inner_scale_ab_2x4_lib4:
#elif defined(OS_MAC)
_inner_scale_ab_2x4_lib4:
#elif defined(OS_WINDOWS)
	.def inner_scale_ab_2x4_lib4; .scl 2; .type 32; .endef
inner_scale_ab_2x4_lib4:
#endif
#endif

	// alpha
	vmovddup	0(%r10), %xmm15

	vmulpd		%xmm0, %xmm15, %xmm0
	vmulpd		%xmm1, %xmm15, %xmm1
	vmulpd		%xmm2, %xmm15, %xmm2
	vmulpd		%xmm3, %xmm15, %xmm3

	// beta
	vmovddup	0(%r11), %xmm14

	vxorpd		%xmm15, %xmm15, %xmm15 // 0.0

	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
	je			0f // end

	vmovapd		0(%r12), %xmm15
	vmulpd		%xmm14, %xmm15, %xmm15
	vaddpd		%xmm15, %xmm0, %xmm0
	vmovapd		32(%r12), %xmm15
	vmulpd		%xmm14, %xmm15, %xmm15
	vaddpd		%xmm15, %xmm1, %xmm1
	vmovapd		64(%r12), %xmm15
	vmulpd		%xmm14, %xmm15, %xmm15
	vaddpd		%xmm15, %xmm2, %xmm2
	vmovapd		96(%r12), %xmm15
	vmulpd		%xmm14, %xmm15, %xmm15
	vaddpd		%xmm15, %xmm3, %xmm3

0:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_scale_ab_2x4_lib4, .-inner_scale_ab_2x4_lib4
#endif
#endif





// common inner routine with file scope
//
// store n
//
// input arguments:
// r10  <- D
// r11  <- 4*sdd*sizeof(double)
// r15  <- dirty
// ymm0 <- [d00 d10 d20 d30]
// ymm1 <- [d01 d11 d21 d31]
//
// output arguments:
// r10  <- D
// r11  <- 4*sdd*sizeof(double)
// r15  <- dirty
// ymm0 <- [d00 d10 d20 d30]
// ymm1 <- [d01 d11 d21 d31]

#if MACRO_LEVEL>=1
	.macro INNER_STORE_4X2_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_store_4x2_lib4, @function
inner_store_4x2_lib4:
#elif defined(OS_MAC)
_inner_store_4x2_lib4:
#elif defined(OS_WINDOWS)
	.def inner_store_4x2_lib4; .scl 2; .type 32; .endef
inner_store_4x2_lib4:
#endif
#endif

	vmovapd %ymm0,  0(%r10)
	vmovapd %ymm1, 32(%r10)

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_store_4x2_lib4, .-inner_store_4x2_lib4
#endif
#endif




// common inner routine with file scope
//
// store n
//
// input arguments:
// r10   <- D
// xmm0  <- [d00 d10]
// xmm1  <- [d01 d11]
// xmm2  <- [d02 d12]
// xmm3  <- [d03 d13]
//
// output arguments:
// r10   <- D
// xmm0  <- [d00 d10]
// xmm1  <- [d01 d11]
// xmm2  <- [d02 d12]
// xmm3  <- [d03 d13]

#if MACRO_LEVEL>=1
	.macro INNER_STORE_2X4_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_store_2x4_lib4, @function
inner_store_2x4_lib4:
#elif defined(OS_MAC)
_inner_store_2x4_lib4:
#elif defined(OS_WINDOWS)
	.def inner_store_2x4_lib4; .scl 2; .type 32; .endef
inner_store_2x4_lib4:
#endif
#endif

	vmovapd %xmm0,   0(%r10)
	vmovapd %xmm1,  32(%r10)
	vmovapd %xmm2,  64(%r10)
	vmovapd %xmm3,  96(%r10)

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_store_2x4_lib4, .-inner_store_2x4_lib4
#endif
#endif




// common inner routine with file scope
//
// store n
//
// input arguments:
// r10   <- D
// xmm0  <- [d00 d10]
// xmm1  <- [d01 d11]
//
// output arguments:
// r10   <- D
// xmm0  <- [d00 d10]
// xmm1  <- [d01 d11]

#if MACRO_LEVEL>=1
	.macro INNER_STORE_2X2_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_store_2x2_lib4, @function
inner_store_2x2_lib4:
#elif defined(OS_MAC)
_inner_store_2x2_lib4:
#elif defined(OS_WINDOWS)
	.def inner_store_2x2_lib4; .scl 2; .type 32; .endef
inner_store_2x2_lib4:
#endif
#endif

	vmovapd %xmm0,   0(%r10)
	vmovapd %xmm1,  32(%r10)

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_store_2x2_lib4, .-inner_store_2x2_lib4
#endif
#endif





// common inner routine with file scope
//
// store n vs
//
// input arguments:
// r10   <- D
// r11d  <- km
// r12d  <- kn
// ymm0  <- [d00 d10 d20 d30]
// ymm1  <- [d01 d11 d21 d31]
//
// output arguments:
// r10   <- D
// r11d  <- km
// r12d  <- kn
// ymm0  <- [d00 d10 d20 d30]
// ymm1  <- [d01 d11 d21 d31]
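//
// note (explanatory comment, not part of the original): km is converted to
// double and broadcast, then subtracted from the constant .LC02 = { 0.5, 1.5,
// 2.5, 3.5 } (element 0 first); lanes where 0.5+i - km is negative, i.e. row
// index i < km, get the sign bit set and are the only ones written by
// vmaskmovpd, while kn decides whether the second column is stored at all.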

#if MACRO_LEVEL>=1
	.macro INNER_STORE_4X2_VS_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_store_4x2_vs_lib4, @function
inner_store_4x2_vs_lib4:
#elif defined(OS_MAC)
_inner_store_4x2_vs_lib4:
#elif defined(OS_WINDOWS)
	.def inner_store_4x2_vs_lib4; .scl 2; .type 32; .endef
inner_store_4x2_vs_lib4:
#endif
#endif

	vcvtsi2sd	%r11d, %xmm15, %xmm15
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovupd		.LC02(%rip), %ymm14
#elif defined(OS_MAC)
	vmovupd		LC02(%rip), %ymm14
#endif
	vmovddup	%xmm15, %xmm15
	vinsertf128	$1, %xmm15, %ymm15, %ymm15
	vsubpd		%ymm15, %ymm14, %ymm15

	vmaskmovpd	%ymm0, %ymm15,  0(%r10)
	cmpl		$2, %r12d
	jl			0f // end
	vmaskmovpd	%ymm1, %ymm15, 32(%r10)

0:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_store_4x2_vs_lib4, .-inner_store_4x2_vs_lib4
#endif
#endif




// common inner routine with file scope
//
// store n vs
//
// input arguments:
// r10   <- D
// r11d  <- km
// r12d  <- kn
// xmm0  <- [d00 d10]
// xmm1  <- [d01 d11]
// xmm2  <- [d02 d12]
// xmm3  <- [d03 d13]
//
// output arguments:
// r10   <- D
// r11d  <- km
// r12d  <- kn
// xmm0  <- [d00 d10]
// xmm1  <- [d01 d11]
// xmm2  <- [d02 d12]
// xmm3  <- [d03 d13]

#if MACRO_LEVEL>=1
	.macro INNER_STORE_2X4_VS_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_store_2x4_vs_lib4, @function
inner_store_2x4_vs_lib4:
#elif defined(OS_MAC)
_inner_store_2x4_vs_lib4:
#elif defined(OS_WINDOWS)
	.def inner_store_2x4_vs_lib4; .scl 2; .type 32; .endef
inner_store_2x4_vs_lib4:
#endif
#endif

	vcvtsi2sd	%r11d, %xmm15, %xmm15
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovupd		.LC02(%rip), %ymm14
#elif defined(OS_MAC)
	vmovupd		LC02(%rip), %ymm14
#endif
	vmovddup	%xmm15, %xmm15
	vinsertf128	$1, %xmm15, %ymm15, %ymm15
	vsubpd		%ymm15, %ymm14, %ymm15

	cmpl		$2, %r12d
	vmaskmovpd	%xmm0, %xmm15,  0(%r10)
	jl			0f // end
	cmpl		$3, %r12d
	vmaskmovpd	%xmm1, %xmm15, 32(%r10)
	jl			0f // end
	vmaskmovpd	%xmm2, %xmm15, 64(%r10)
	je			0f // end
	vmaskmovpd	%xmm3, %xmm15, 96(%r10)

0:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_store_2x4_vs_lib4, .-inner_store_2x4_vs_lib4
#endif
#endif




//                               1      2              3          4            5          6        7             8          9
// void kernel_dgemm_nn_4x2_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D);
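//
// note (illustrative usage from C, values are hypothetical and not taken from
// the library sources): the kernel computes D = alpha*A*B + beta*C on a 4x2
// block, with A, B, C, D in 4-high panel-major storage:
//
//   double alpha = 1.0, beta = 0.0;
//   kernel_dgemm_nn_4x2_lib4(k, &alpha, A, offsetB, B, sdb, &beta, C, D);
//
// sdb is the second (column) dimension of the panel-major array B, so
// consecutive 4-row panels of B are 4*sdb doubles apart, and offsetB (0..3)
// is the row offset of the first B element inside its panel.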

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dgemm_nn_4x2_lib4
	.type kernel_dgemm_nn_4x2_lib4, @function
kernel_dgemm_nn_4x2_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dgemm_nn_4x2_lib4
_kernel_dgemm_nn_4x2_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dgemm_nn_4x2_lib4
	.def kernel_dgemm_nn_4x2_lib4; .scl 2; .type 32; .endef
kernel_dgemm_nn_4x2_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorpd	%ymm0, %ymm0, %ymm0
	vmovapd	%ymm0, %ymm1
	vmovapd	%ymm0, %ymm2
	vmovapd	%ymm0, %ymm3


	// call inner dgemm kernel nn

	movq	ARG1, %r10 // k
	movq	ARG3, %r11  // A
	movq	ARG5, %r12  // B
	movq	ARG6, %r13 // sdb
	sall	$5, %r13d // 4*sdb*sizeof(double)
	movq	ARG4, %r14 // offsetB

#if MACRO_LEVEL>=1
	INNER_EDGE_DGEMM_ADD_NN_4X2_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_dgemm_add_nn_4x2_lib4
#elif defined(OS_MAC)
	callq _inner_edge_dgemm_add_nn_4x2_lib4
#endif
#endif

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_ADD_NN_4X2_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_add_nn_4x2_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_add_nn_4x2_lib4
#endif
#endif


	// call inner blend

	movq	ARG2, %r10 // alpha
	movq	ARG7, %r11 // beta
	movq	ARG8, %r12   // C

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_4X2_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_scale_ab_4x2_lib4
#elif defined(OS_MAC)
	callq _inner_scale_ab_4x2_lib4
#endif
#endif


	// store n

	movq	ARG9, %r10 // D

#if MACRO_LEVEL>=1
	INNER_STORE_4X2_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_4x2_lib4
#elif defined(OS_MAC)
	callq _inner_store_4x2_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_dgemm_nn_4x2_lib4, .-kernel_dgemm_nn_4x2_lib4
#endif




//                                  1      2              3          4            5          6        7             8          9          10      11
// void kernel_dgemm_nn_4x2_vs_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D, int m1, int n1);

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dgemm_nn_4x2_vs_lib4
	.type kernel_dgemm_nn_4x2_vs_lib4, @function
kernel_dgemm_nn_4x2_vs_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dgemm_nn_4x2_vs_lib4
_kernel_dgemm_nn_4x2_vs_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dgemm_nn_4x2_vs_lib4
	.def kernel_dgemm_nn_4x2_vs_lib4; .scl 2; .type 32; .endef
kernel_dgemm_nn_4x2_vs_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorpd	%ymm0, %ymm0, %ymm0
	vmovapd	%ymm0, %ymm1
	vmovapd	%ymm0, %ymm2
	vmovapd	%ymm0, %ymm3


	// call inner dgemm kernel nn

	movq	ARG1, %r10 // k
	movq	ARG3, %r11  // A
	movq	ARG5, %r12  // B
	movq	ARG6, %r13 // sdb
	sall	$5, %r13d // 4*sdb*sizeof(double)
	movq	ARG4, %r14 // offsetB

#if MACRO_LEVEL>=1
	INNER_EDGE_DGEMM_ADD_NN_4X2_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_dgemm_add_nn_4x2_lib4
#elif defined(OS_MAC)
	callq _inner_edge_dgemm_add_nn_4x2_lib4
#endif
#endif

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_ADD_NN_4X2_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_add_nn_4x2_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_add_nn_4x2_lib4
#endif
#endif


	// call inner blend

	movq	ARG2, %r10 // alpha
	movq	ARG7, %r11 // beta
	movq	ARG8, %r12   // C

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_4X2_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_scale_ab_4x2_lib4
#elif defined(OS_MAC)
	callq _inner_scale_ab_4x2_lib4
#endif
#endif


	// store n

	movq	ARG9, %r10 // D
	movq	ARG10, %r11 // m1
	movq	ARG11, %r12 // n1

#if MACRO_LEVEL>=1
	INNER_STORE_4X2_VS_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_4x2_vs_lib4
#elif defined(OS_MAC)
	callq _inner_store_4x2_vs_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_dgemm_nn_4x2_vs_lib4, .-kernel_dgemm_nn_4x2_vs_lib4
#endif




//                               1      2              3          4            5          6        7             8          9
// void kernel_dgemm_nn_2x4_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D);

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dgemm_nn_2x4_lib4
	.type kernel_dgemm_nn_2x4_lib4, @function
kernel_dgemm_nn_2x4_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dgemm_nn_2x4_lib4
_kernel_dgemm_nn_2x4_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dgemm_nn_2x4_lib4
	.def kernel_dgemm_nn_2x4_lib4; .scl 2; .type 32; .endef
kernel_dgemm_nn_2x4_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorpd	%ymm0, %ymm0, %ymm0
	vmovapd	%ymm0, %ymm1
	vmovapd	%ymm0, %ymm2
	vmovapd	%ymm0, %ymm3
	vmovapd	%ymm0, %ymm4
	vmovapd	%ymm0, %ymm5
	vmovapd	%ymm0, %ymm6
	vmovapd	%ymm0, %ymm7


	// call inner dgemm kernel nn

	movq	ARG1, %r10 // k
	movq	ARG3, %r11  // A
	movq	ARG5, %r12  // B
	movq	ARG6, %r13 // sdb
	sall	$5, %r13d // 4*sdb*sizeof(double)
	movq	ARG4, %r14 // offsetB

#if MACRO_LEVEL>=1
	INNER_EDGE_DGEMM_ADD_NN_2X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_dgemm_add_nn_2x4_lib4
#elif defined(OS_MAC)
	callq _inner_edge_dgemm_add_nn_2x4_lib4
#endif
#endif

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_ADD_NN_2X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_add_nn_2x4_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_add_nn_2x4_lib4
#endif
#endif


	// call inner blend

	movq	ARG2, %r10 // alpha
	movq	ARG7, %r11 // beta
	movq	ARG8, %r12   // C

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_2X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_scale_ab_2x4_lib4
#elif defined(OS_MAC)
	callq _inner_scale_ab_2x4_lib4
#endif
#endif


	// store n

	movq	ARG9, %r10 // D

#if MACRO_LEVEL>=1
	INNER_STORE_2X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_2x4_lib4
#elif defined(OS_MAC)
	callq _inner_store_2x4_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_dgemm_nn_2x4_lib4, .-kernel_dgemm_nn_2x4_lib4
#endif




//                                  1      2              3          4            5          6        7             8          9          10      11
// void kernel_dgemm_nn_2x4_vs_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D, int m1, int n1);

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dgemm_nn_2x4_vs_lib4
	.type kernel_dgemm_nn_2x4_vs_lib4, @function
kernel_dgemm_nn_2x4_vs_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dgemm_nn_2x4_vs_lib4
_kernel_dgemm_nn_2x4_vs_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dgemm_nn_2x4_vs_lib4
	.def kernel_dgemm_nn_2x4_vs_lib4; .scl 2; .type 32; .endef
kernel_dgemm_nn_2x4_vs_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorpd	%ymm0, %ymm0, %ymm0
	vmovapd	%ymm0, %ymm1
	vmovapd	%ymm0, %ymm2
	vmovapd	%ymm0, %ymm3
	vmovapd	%ymm0, %ymm4
	vmovapd	%ymm0, %ymm5
	vmovapd	%ymm0, %ymm6
	vmovapd	%ymm0, %ymm7


	// call inner dgemm kernel nn

	movq	ARG1, %r10 // k
	movq	ARG3, %r11  // A
	movq	ARG5, %r12  // B
	movq	ARG6, %r13 // sdb
	sall	$5, %r13d // 4*sdb*sizeof(double)
	movq	ARG4, %r14 // offsetB

#if MACRO_LEVEL>=1
	INNER_EDGE_DGEMM_ADD_NN_2X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_dgemm_add_nn_2x4_lib4
#elif defined(OS_MAC)
	callq _inner_edge_dgemm_add_nn_2x4_lib4
#endif
#endif

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_ADD_NN_2X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_add_nn_2x4_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_add_nn_2x4_lib4
#endif
#endif


	// call inner blend

	movq	ARG2, %r10 // alpha
	movq	ARG7, %r11 // beta
	movq	ARG8, %r12   // C

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_2X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_scale_ab_2x4_lib4
#elif defined(OS_MAC)
	callq _inner_scale_ab_2x4_lib4
#endif
#endif


	// store n

	movq	ARG9, %r10 // D
	movq	ARG10, %r11 // m1
	movq	ARG11, %r12 // n1

#if MACRO_LEVEL>=1
	INNER_STORE_2X4_VS_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_2x4_vs_lib4
#elif defined(OS_MAC)
	callq _inner_store_2x4_vs_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_dgemm_nn_2x4_vs_lib4, .-kernel_dgemm_nn_2x4_vs_lib4
#endif




//                               1      2              3          4            5          6        7             8          9
// void kernel_dgemm_nn_2x2_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D);

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dgemm_nn_2x2_lib4
	.type kernel_dgemm_nn_2x2_lib4, @function
kernel_dgemm_nn_2x2_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dgemm_nn_2x2_lib4
_kernel_dgemm_nn_2x2_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dgemm_nn_2x2_lib4
	.def kernel_dgemm_nn_2x2_lib4; .scl 2; .type 32; .endef
kernel_dgemm_nn_2x2_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorpd	%ymm0, %ymm0, %ymm0
	vmovapd	%ymm0, %ymm1
	vmovapd	%ymm0, %ymm2
	vmovapd	%ymm0, %ymm3


	// call inner dgemm kernel nn

	movq	ARG1, %r10 // k
	movq	ARG3, %r11  // A
	movq	ARG5, %r12  // B
	movq	ARG6, %r13 // sdb
	sall	$5, %r13d // 4*sdb*sizeof(double)
	movq	ARG4, %r14 // offsetB

#if MACRO_LEVEL>=1
	INNER_EDGE_DGEMM_ADD_NN_4X2_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_dgemm_add_nn_4x2_lib4
#elif defined(OS_MAC)
	callq _inner_edge_dgemm_add_nn_4x2_lib4
#endif
#endif

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_ADD_NN_4X2_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_add_nn_4x2_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_add_nn_4x2_lib4
#endif
#endif


	// call inner blend

	movq	ARG2, %r10 // alpha
	movq	ARG7, %r11 // beta
	movq	ARG8, %r12   // C

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_4X2_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_scale_ab_4x2_lib4
#elif defined(OS_MAC)
	callq _inner_scale_ab_4x2_lib4
#endif
#endif


	// store n

	movq	ARG9, %r10 // D

#if MACRO_LEVEL>=1
	INNER_STORE_2X2_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_2x2_lib4
#elif defined(OS_MAC)
	callq _inner_store_2x2_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_dgemm_nn_2x2_lib4, .-kernel_dgemm_nn_2x2_lib4
#endif




	// read-only data
#if defined(OS_LINUX)
	.section	.rodata.cst32,"aM",@progbits,32
#elif defined(OS_MAC)
	.section	__TEXT,__const
#elif defined(OS_WINDOWS)
	.section .rdata,"dr"
#endif

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC00: // { -1 -1 -1 1 }
#elif defined(OS_MAC)
LC00: // { -1 -1 -1 1 }
	.align 5
#endif
	.quad	-1
	.quad	-1
	.quad	-1
	.quad	1

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC01: // { -1 -1 -1 -1 }
#elif defined(OS_MAC)
LC01: // { -1 -1 -1 -1 }
	.align 5
#endif
	.quad	-1
	.quad	-1
	.quad	-1
	.quad	-1

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC02: // { 3.5 2.5 1.5 0.5 }
#elif defined(OS_MAC)
LC02: // { 3.5 2.5 1.5 0.5 }
	.align 5
#endif
	.long	0
	.long	1071644672
	.long	0
	.long	1073217536
	.long	0
	.long	1074003968
	.long	0
	.long	1074528256

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC03: // { 7.5 6.5 5.5 4.5 }
#elif defined(OS_MAC)
LC03: // { 7.5 6.5 5.5 4.5 }
	.align 5
#endif
	.long	0
	.long	1074921472
	.long	0
	.long	1075183616
	.long	0
	.long	1075445760
	.long	0
	.long	1075707904

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC04: // { 1.0 1.0 1.0 1.0 }
#elif defined(OS_MAC)
LC04: // { 1.0 1.0 1.0 1.0 }
	.align 5
#endif
	.long	0
	.long	1072693248
	.long	0
	.long	1072693248
	.long	0
	.long	1072693248
	.long	0
	.long	1072693248

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC05: // { 1.0 1.0 1.0 -1.0 }
#elif defined(OS_MAC)
	.align 5
LC05: // { 1.0 1.0 1.0 -1.0 }
#endif
	.long	0
	.long	-1074790400
	.long	0
	.long	1072693248
	.long	0
	.long	1072693248
	.long	0
	.long	1072693248

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC06: // { 1.0 1.0 -1.0 -1.0 }
#elif defined(OS_MAC)
	.align 5
LC06: // { 1.0 1.0 -1.0 -1.0 }
#endif
	.long	0
	.long	-1074790400
	.long	0
	.long	-1074790400
	.long	0
	.long	1072693248
	.long	0
	.long	1072693248

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC07: // { 1.0 -1.0 -1.0 -1.0 }
#elif defined(OS_MAC)
	.align 5
LC07: // { 1.0 -1.0 -1.0 -1.0 }
#endif
	.long	0
	.long	-1074790400
	.long	0
	.long	-1074790400
	.long	0
	.long	-1074790400
	.long	0
	.long	1072693248

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC08: // { -1.0 -1.0 -1.0 1.0 }
#elif defined(OS_MAC)
	.align 5
LC08: // { -1.0 -1.0 -1.0 1.0 }
#endif
	.long	0
	.long	1072693248
	.long	0
	.long	-1074790400
	.long	0
	.long	-1074790400
	.long	0
	.long	-1074790400

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC09: // { -1.0 -1.0 1.0 1.0 }
#elif defined(OS_MAC)
	.align 5
LC09: // { -1.0 -1.0 1.0 1.0 }
#endif
	.long	0
	.long	1072693248
	.long	0
	.long	1072693248
	.long	0
	.long	-1074790400
	.long	0
	.long	-1074790400

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC10: // { -1.0 1.0 1.0 1.0 }
#elif defined(OS_MAC)
	.align 5
LC10: // { -1.0 1.0 1.0 1.0 }
#endif
	.long	0
	.long	1072693248
	.long	0
	.long	1072693248
	.long	0
	.long	1072693248
	.long	0
	.long	-1074790400




#if defined(OS_LINUX)
	.section	.note.GNU-stack,"",@progbits
#elif defined(OS_MAC)
	.subsections_via_symbols
#endif
