/**************************************************************************************************
*                                                                                                 *
* This file is part of BLASFEO.                                                                   *
*                                                                                                 *
* BLASFEO -- BLAS For Embedded Optimization.                                                      *
* Copyright (C) 2019 by Gianluca Frison.                                                          *
* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
* All rights reserved.                                                                            *
*                                                                                                 *
* The 2-Clause BSD License                                                                        *
*                                                                                                 *
* Redistribution and use in source and binary forms, with or without                              *
* modification, are permitted provided that the following conditions are met:                     *
*                                                                                                 *
* 1. Redistributions of source code must retain the above copyright notice, this                  *
*    list of conditions and the following disclaimer.                                             *
* 2. Redistributions in binary form must reproduce the above copyright notice,                    *
*    this list of conditions and the following disclaimer in the documentation                    *
*    and/or other materials provided with the distribution.                                       *
*                                                                                                 *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND                 *
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED                   *
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE                          *
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR                 *
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES                  *
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;                    *
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND                     *
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT                      *
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS                   *
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                                    *
*                                                                                                 *
* Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de                             *
*                                                                                                 *
**************************************************************************************************/

#if defined(OS_LINUX) | defined(OS_MAC)

//#define STACKSIZE 96
#define STACKSIZE 64
#define ARG1  %rdi
#define ARG2  %rsi
#define ARG3  %rdx
#define ARG4  %rcx
#define ARG5  %r8
#define ARG6  %r9
#define ARG7  STACKSIZE +  8(%rsp)
#define ARG8  STACKSIZE + 16(%rsp)
#define ARG9  STACKSIZE + 24(%rsp)
#define ARG10 STACKSIZE + 32(%rsp)
#define ARG11 STACKSIZE + 40(%rsp)
#define ARG12 STACKSIZE + 48(%rsp)
#define ARG13 STACKSIZE + 56(%rsp)
#define ARG14 STACKSIZE + 64(%rsp)
#define ARG15 STACKSIZE + 72(%rsp)
#define ARG16 STACKSIZE + 80(%rsp)
#define ARG17 STACKSIZE + 88(%rsp)
#define ARG18 STACKSIZE + 96(%rsp)
#define PROLOGUE \
	subq	$STACKSIZE, %rsp; \
	movq	%rbx,   (%rsp); \
	movq	%rbp,  8(%rsp); \
	movq	%r12, 16(%rsp); \
	movq	%r13, 24(%rsp); \
	movq	%r14, 32(%rsp); \
	movq	%r15, 40(%rsp); \
	vzeroupper;
#define EPILOGUE \
	vzeroupper; \
	movq	  (%rsp), %rbx; \
	movq	 8(%rsp), %rbp; \
	movq	16(%rsp), %r12; \
	movq	24(%rsp), %r13; \
	movq	32(%rsp), %r14; \
	movq	40(%rsp), %r15; \
	addq	$STACKSIZE, %rsp;

#elif defined(OS_WINDOWS)

#define STACKSIZE 256
#define ARG1  %rcx
#define ARG2  %rdx
#define ARG3  %r8
#define ARG4  %r9
#define ARG5  STACKSIZE + 40(%rsp)
#define ARG6  STACKSIZE + 48(%rsp)
#define ARG7  STACKSIZE + 56(%rsp)
#define ARG8  STACKSIZE + 64(%rsp)
#define ARG9  STACKSIZE + 72(%rsp)
#define ARG10 STACKSIZE + 80(%rsp)
#define ARG11 STACKSIZE + 88(%rsp)
#define ARG12 STACKSIZE + 96(%rsp)
#define ARG13 STACKSIZE + 104(%rsp)
#define ARG14 STACKSIZE + 112(%rsp)
#define ARG15 STACKSIZE + 120(%rsp)
#define ARG16 STACKSIZE + 128(%rsp)
#define ARG17 STACKSIZE + 136(%rsp)
#define ARG18 STACKSIZE + 144(%rsp)
#define PROLOGUE \
	subq	$STACKSIZE, %rsp; \
	movq	%rbx,   (%rsp); \
	movq	%rbp,  8(%rsp); \
	movq	%r12, 16(%rsp); \
	movq	%r13, 24(%rsp); \
	movq	%r14, 32(%rsp); \
	movq	%r15, 40(%rsp); \
	movq	%rdi, 48(%rsp); \
	movq	%rsi, 56(%rsp); \
	vmovups	%xmm6, 64(%rsp); \
	vmovups	%xmm7, 80(%rsp); \
	vmovups	%xmm8, 96(%rsp); \
	vmovups	%xmm9, 112(%rsp); \
	vmovups	%xmm10, 128(%rsp); \
	vmovups	%xmm11, 144(%rsp); \
	vmovups	%xmm12, 160(%rsp); \
	vmovups	%xmm13, 176(%rsp); \
	vmovups	%xmm14, 192(%rsp); \
	vmovups	%xmm15, 208(%rsp); \
	vzeroupper;
#define EPILOGUE \
	vzeroupper; \
	movq	  (%rsp), %rbx; \
	movq	 8(%rsp), %rbp; \
	movq	16(%rsp), %r12; \
	movq	24(%rsp), %r13; \
	movq	32(%rsp), %r14; \
	movq	40(%rsp), %r15; \
	movq	48(%rsp), %rdi; \
	movq	56(%rsp), %rsi; \
	vmovups	64(%rsp), %xmm6; \
	vmovups	80(%rsp), %xmm7; \
	vmovups	96(%rsp), %xmm8; \
	vmovups	112(%rsp), %xmm9; \
	vmovups	128(%rsp), %xmm10; \
	vmovups	144(%rsp), %xmm11; \
	vmovups	160(%rsp), %xmm12; \
	vmovups	176(%rsp), %xmm13; \
	vmovups	192(%rsp), %xmm14; \
	vmovups	208(%rsp), %xmm15; \
	addq	$STACKSIZE, %rsp;

#else

#error wrong OS

#endif
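
// Worked example (illustrative only): with the SysV values above (STACKSIZE=64),
// after PROLOGUE has executed "subq $STACKSIZE, %rsp" the 7th integer argument,
// which the caller left just above its return address, is reached as
//
//   movq	ARG7, %r11    // expands to: movq 64+8(%rsp), %r11, i.e. 72(%rsp)
//
// while bytes 0..47 of the new stack frame hold the saved callee-saved GPRs.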



#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.text
#elif defined(OS_MAC)
	.section	__TEXT,__text,regular,pure_instructions
#endif




// ASM Macros




// void inner_kernel_dgemm_add_nn_4x2_lib4
// common inner routine with file scope
//
// input arguments:
// r10d  <- k
// r11   <- A
// r12   <- B
// r13   <- 4*sdb*sizeof(double)
// r14   <- dirty
// ymm0  <- [d00 d10 d20 d30]
// ymm1  <- [d01 d11 d21 d31]

//
// output arguments:
// r10d  <- 0
// r11   <- A+4*k*sizeof(double)
// r12   <- B+(k/4)*sdb*sizeof(double)+(k%4)
// r13   <- 4*sdb*sizeof(double)
// r14   <- dirty
// ymm0  <- [d00 d10 d20 d30]
// ymm1  <- [d01 d11 d21 d31]

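// As a rough reference (illustrative C sketch, not part of the build), with A
// and B stored in the 4-row panel-major "lib4" layout assumed above, this
// routine accumulates the 4x2 product
//
//   for(ll=0; ll<k; ll++)
//       for(jj=0; jj<2; jj++)
//           for(ii=0; ii<4; ii++)
//               d[ii+4*jj] += A[ii+4*ll] * B[(ll/4)*4*sdb + ll%4 + 4*jj];
//
// where d[0..3] lives in ymm0 (column 0) and d[4..7] in ymm1 (column 1).
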
#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_DGEMM_ADD_NN_4X2_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_dgemm_add_nn_4x2_lib4, @function
inner_kernel_dgemm_add_nn_4x2_lib4:
#elif defined(OS_MAC)
_inner_kernel_dgemm_add_nn_4x2_lib4:
#elif defined(OS_WINDOWS)
	.def inner_kernel_dgemm_add_nn_4x2_lib4; .scl 2; .type 32; .endef
inner_kernel_dgemm_add_nn_4x2_lib4:
#endif
#endif

	cmpl	$0, %r10d
	jle		2f // return

	// preload
	vmovapd 0(%r11), %ymm8 // A0[0]

	cmpl	$4, %r10d
	jle		0f // consider clean-up loop

	// main loop
	.p2align 3
1: // main loop

	prefetcht0	0(%r12, %r13, 2) // software prefetch of B two panels ahead, i.e. B + 2*(4*sdb*sizeof(double))
	// T0 hint (temporal data): prefetch into all levels of the cache hierarchy.
	// The amount fetched is implementation dependent (at least 32 bytes, typically one cache line).

	// unroll 0
	// ymm8 = [a00 a10 a20 a30]
	vbroadcastsd	0(%r12), %ymm12 // B
	vmulpd			%ymm8, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm0, %ymm0

	vmovapd			32(%r11), %ymm10 // load A

	vbroadcastsd	32(%r12), %ymm12 // B + 32 = B+4*sizeof(double)
	vmulpd			%ymm8, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm1, %ymm1
	subl	$4, %r10d

	// unroll 1
	// ymm10 = [a01 a11 a21 a31]
	vbroadcastsd	8(%r12), %ymm12 // B
	vmulpd			%ymm10, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm0, %ymm0

	vmovapd			64(%r11), %ymm8 // A0

	vbroadcastsd	40(%r12), %ymm12 // B + 32 = B+4*sizeof(double)
	vmulpd			%ymm10, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm1, %ymm1

	// unroll 2
	// ymm8 = [a02 a12 a22 a32]
	vbroadcastsd	16(%r12), %ymm12 // B
	vmulpd			%ymm8, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm0, %ymm0

	vmovapd			96(%r11), %ymm10 // A0

	vbroadcastsd	48(%r12), %ymm12 // B
	vmulpd			%ymm8, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm1, %ymm1

	addq	$128, %r11 // Update A

	// unroll 3
	// ymm10 = [a03 a13 a23 a33]
	vbroadcastsd	24(%r12), %ymm12 // B
	vmulpd			%ymm10, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm0, %ymm0

	vmovapd			0(%r11), %ymm8 // load A[04 14 24 34] -> ymm8

	vbroadcastsd	56(%r12), %ymm12 // B
	vmulpd			%ymm10, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm1, %ymm1

	addq	%r13, %r12 // update B: B += 4*sdb*sizeof(double) (next panel)

	cmpl	$4, %r10d
	jg		1b // main loop

0: // consider clean4-up

	cmpl	$3, %r10d
	jle		4f // clean1

	// ki==4

	// unroll 0
	// ymm8 = A[04 14 24 34]
	vbroadcastsd	0(%r12), %ymm12 // B
	vmulpd			%ymm8, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm0, %ymm0

	vmovapd			32(%r11), %ymm10 // load A

	vbroadcastsd	32(%r12), %ymm12
	vmulpd			%ymm8, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm1, %ymm1

	subl	$4, %r10d

	// unroll 1
	// ymm10 = A[05 15 25 35]
	vbroadcastsd	8(%r12), %ymm12
	vmulpd			%ymm10, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm0, %ymm0

	vmovapd			64(%r11), %ymm8 // A0

	vbroadcastsd	40(%r12), %ymm12
	vmulpd			%ymm10, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm1, %ymm1

	// unroll 2
	// ymm8
	vbroadcastsd	16(%r12), %ymm12
	vmulpd			%ymm8, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm0, %ymm0

	vmovapd			96(%r11), %ymm10 // A0

	vbroadcastsd	48(%r12), %ymm12
	vmulpd			%ymm8, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm1, %ymm1

	addq	$128, %r11

	// unroll 3
	// ymm10
	vbroadcastsd	24(%r12), %ymm12
	vmulpd			%ymm10, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm0, %ymm0

//	vmovapd			0(%r11), %ymm8 // A0

	vbroadcastsd	56(%r12), %ymm12
	vmulpd			%ymm10, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm1, %ymm1

	addq	%r13, %r12

	jmp		2f


4: // consider clean1-up loop

	cmpl	$0, %r10d
	jle		2f // return

	// clean-up loop
3: // clean up loop

	vmovapd			0(%r11), %ymm8 // A0[0]
	vbroadcastsd	0(%r12), %ymm12
	vmulpd			%ymm8, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm0, %ymm0

	vbroadcastsd	32(%r12), %ymm12
	vmulpd			%ymm8, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm1, %ymm1
	addq	$32, %r11
	subl	$1, %r10d
	addq	$8, %r12

	cmpl	$0, %r10d
	jg		3b // clean up loop

2: // return

#if MACRO_LEVEL>=2
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_dgemm_add_nn_4x2_lib4, .-inner_kernel_dgemm_add_nn_4x2_lib4
#endif
#endif
// end




// void inner_kernel_dgemm_add_nn_2x4_lib4
// common inner routine with file scope
//
// input arguments:
// r10d  <- k
// r11   <- A
// r12   <- B
// r13   <- 4*sdb*sizeof(double)

//
// output arguments:
// r10d  <- 0
// r11   <- A+4*k*sizeof(double)
// r12   <- B+(k/4)*sdb*sizeof(double)+(k%4)
// r13   <- 4*sdb*sizeof(double)

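// Note (added for clarity, based on the code below): during the loop ymm0..ymm3
// hold partial sums in an interleaved form, the low 128 bits of each register
// accumulating one output column and the high 128 bits the next one
// (ymm0/ymm1: columns 0-1, ymm2/ymm3: columns 2-3, split between even and odd
// k iterations); the "reduce" step at label 2 folds them so that xmm0..xmm3
// end up holding the four 2x1 output columns [d00 d10] .. [d03 d13].
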
#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_DGEMM_ADD_NN_2X4_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_dgemm_add_nn_2x4_lib4, @function
inner_kernel_dgemm_add_nn_2x4_lib4:
#elif defined(OS_MAC)
_inner_kernel_dgemm_add_nn_2x4_lib4:
#elif defined(OS_WINDOWS)
	.def inner_kernel_dgemm_add_nn_2x4_lib4; .scl 2; .type 32; .endef
inner_kernel_dgemm_add_nn_2x4_lib4:
#endif
#endif

	cmpl	$0, %r10d
	jle		5f // return

	// preload
	vbroadcastf128	0(%r11), %ymm11 // A
	vbroadcastf128	32(%r11), %ymm12 // A

	cmpl	$4, %r10d
	jle		0f // consider clean-up loop

	// main loop
	.p2align 3
1: // main loop

	prefetcht0	0(%r12, %r13, 2) // software prefetch
	prefetcht0	64(%r12, %r13, 2) // software prefetch

	// unroll 0 1

	// load
	vmovapd			0(%r12), %ymm13 // B
	vmovupd			16(%r12), %ymm14 // B

	// mask
	vblendpd		$0x3, %ymm13, %ymm14, %ymm14 //
	vshufpd			$0x0, %ymm14, %ymm14, %ymm13 //
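	// Added note: after the blend, ymm14 = [B(0,0) B(1,0) B(0,1) B(1,1)]
	// (B(r,c) = row r, column c of the current B panel); the two vshufpd
	// forms then duplicate its even/odd entries, e.g. here
	// ymm13 = [B(0,0) B(0,0) B(0,1) B(0,1)], so a single vmulpd with the
	// broadcast pair of A values updates output columns 0 and 1 at once.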

	vmulpd			%ymm11, %ymm13, %ymm15
	vaddpd			%ymm0, %ymm15, %ymm0

	vbroadcastf128	64(%r11), %ymm9 // A
	vshufpd			$0xf, %ymm14, %ymm14, %ymm13

	vmulpd			%ymm12, %ymm13, %ymm15
	vaddpd			%ymm1, %ymm15, %ymm1

	vmovapd			64(%r12), %ymm13
	vmovupd			80(%r12), %ymm14

	// load
	vblendpd		$0x3, %ymm13, %ymm14, %ymm14
	vshufpd			$0x0, %ymm14, %ymm14, %ymm13

	vmulpd			%ymm11, %ymm13, %ymm15
	vaddpd			%ymm2, %ymm15, %ymm2

	vbroadcastf128	96(%r11), %ymm10 // A
	vshufpd			$0xf, %ymm14, %ymm14, %ymm13

	vmulpd			%ymm12, %ymm13, %ymm15
	vaddpd			%ymm3, %ymm15, %ymm3

	// unroll 2 3
	vmovupd			16(%r12), %ymm13
	vmovapd			32(%r12), %ymm14
	addq	$128, %r11
	vblendpd		$0x3, %ymm13, %ymm14, %ymm14
	vshufpd			$0x0, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm9, %ymm13, %ymm15
	vaddpd			%ymm0, %ymm15, %ymm0
	vbroadcastf128	0(%r11), %ymm11 // A
	vshufpd			$0xf, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm10, %ymm13, %ymm15
	vaddpd			%ymm1, %ymm15, %ymm1
	vmovupd			80(%r12), %ymm13
	vmovapd			96(%r12), %ymm14
	vblendpd		$0x3, %ymm13, %ymm14, %ymm14
	vshufpd			$0x0, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm9, %ymm13, %ymm15
	vaddpd			%ymm2, %ymm15, %ymm2
	vbroadcastf128	32(%r11), %ymm12 // A
	vshufpd			$0xf, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm10, %ymm13, %ymm15
	vaddpd			%ymm3, %ymm15, %ymm3

	subl	$4, %r10d
	addq	%r13, %r12

	cmpl	$4, %r10d
	jg		1b // main loop


0: // consider clean4-up

	cmpl	$3, %r10d
	jle		4f // clean1

	// unroll 0 1
	vmovapd			0(%r12), %ymm13
	vmovupd			16(%r12), %ymm14
	vblendpd		$0x3, %ymm13, %ymm14, %ymm14
	vshufpd			$0x0, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm11, %ymm13, %ymm15
	vaddpd			%ymm0, %ymm15, %ymm0
	vbroadcastf128	64(%r11), %ymm9 // A
	vshufpd			$0xf, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm12, %ymm13, %ymm15
	vaddpd			%ymm1, %ymm15, %ymm1
	vmovapd			64(%r12), %ymm13
	vmovupd			80(%r12), %ymm14
	vblendpd		$0x3, %ymm13, %ymm14, %ymm14
	vshufpd			$0x0, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm11, %ymm13, %ymm15
	vaddpd			%ymm2, %ymm15, %ymm2
	vbroadcastf128	96(%r11), %ymm10 // A
	vshufpd			$0xf, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm12, %ymm13, %ymm15
	vaddpd			%ymm3, %ymm15, %ymm3

	// unroll 2 3
	vmovupd			16(%r12), %ymm13
	vmovapd			32(%r12), %ymm14
	addq	$128, %r11
	vblendpd		$0x3, %ymm13, %ymm14, %ymm14
	vshufpd			$0x0, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm9, %ymm13, %ymm15
	vaddpd			%ymm0, %ymm15, %ymm0
//	vbroadcastf128	0(%r11), %ymm11 // A
	vshufpd			$0xf, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm10, %ymm13, %ymm15
	vaddpd			%ymm1, %ymm15, %ymm1
	vmovupd			80(%r12), %ymm13
	vmovapd			96(%r12), %ymm14
	vblendpd		$0x3, %ymm13, %ymm14, %ymm14
	vshufpd			$0x0, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm9, %ymm13, %ymm15
	vaddpd			%ymm2, %ymm15, %ymm2
//	vbroadcastf128	32(%r11), %ymm12 // A
	vshufpd			$0xf, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm10, %ymm13, %ymm15
	vaddpd			%ymm3, %ymm15, %ymm3

	subl	$4, %r10d
	addq	%r13, %r12

	jmp		2f // return


4: // consider clean1-up loop

	cmpl	$0, %r10d
	jle		2f // return

	// clean-up loop
3: // clean up loop

	// unroll 0
	vbroadcastf128	0(%r11), %ymm11 // A
	vmovupd			0(%r12), %ymm13
	vmovupd			16(%r12), %ymm14
	vblendpd		$0x3, %ymm13, %ymm14, %ymm14
	vshufpd			$0x0, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm11, %ymm13, %ymm15
	vaddpd			%ymm0, %ymm15, %ymm0
	vmovupd			64(%r12), %ymm13
	vmovupd			80(%r12), %ymm14
	vblendpd		$0x3, %ymm13, %ymm14, %ymm14
	vshufpd			$0x0, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm11, %ymm13, %ymm15
	vaddpd			%ymm2, %ymm15, %ymm2

	addq	$32, %r11
	addq	$8, %r12
	subl	$1, %r10d

	cmpl	$0, %r10d
	jg		3b // clean up loop


2: // reduce
	vaddpd			%ymm0, %ymm1, %ymm0
	vextractf128	$0x1, %ymm0, %xmm1
	vaddpd			%ymm2, %ymm3, %ymm2
	vextractf128	$0x1, %ymm2, %xmm3

5: // return

#if MACRO_LEVEL>=2
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_dgemm_add_nn_2x4_lib4, .-inner_kernel_dgemm_add_nn_2x4_lib4
#endif
#endif
// end




// void inner_edge_dgemm_add_nn_4x2_lib4
// common inner routine with file scope
//
// edge for B unaligned
//
// input arguments:
// r10   <- k
// r11   <- A
// r12   <- B
// r13   <- bs*sdb*sizeof(double)
// r14   <- offB

//
// output arguments:
// r10   <- k-(4-offB)
// r11   <- A+(4-offB)*bs*sizeof(double)
// r12   <- B-offB+bs*sdb*sizeof(double)
// r13   <- bs*sdb*sizeof(double)
// r14   <- offB

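// Worked example (illustrative): with offB=1 and k=10, kend = min(k, 4-offB) = 3,
// so the loop below consumes B rows 1..3 of the first panel, leaves r10d = 7,
// advances A by 3*bs*sizeof(double), and finally steps B to the start of the
// next panel (add bs*sdb*sizeof(double), minus the 32 bytes already walked).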

#if MACRO_LEVEL>=1
	.macro INNER_EDGE_DGEMM_ADD_NN_4X2_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_edge_dgemm_add_nn_4x2_lib4, @function
inner_edge_dgemm_add_nn_4x2_lib4:
#elif defined(OS_MAC)
_inner_edge_dgemm_add_nn_4x2_lib4:
#elif defined(OS_WINDOWS)
	.def inner_edge_dgemm_add_nn_4x2_lib4; .scl 2; .type 32; .endef
inner_edge_dgemm_add_nn_4x2_lib4:
#endif
#endif

	cmpl			$0, %r14d // offset==0
	jle				2f // end

	cmpl			$0, %r10d // k==0
	jle				2f // end

	movl			$4, %r15d
	subl			%r14d, %r15d // 4-offsetB
	cmpl			%r10d, %r15d
//	jle				0f
//	movl			%r10d, %r15d // kend=min(k,4-offsetB)
//0:
	cmovgl			%r10d, %r15d // kend=min(k,4-offsetB)

	movl			%r14d, %eax
	sall			$3, %eax // offsetB*sizeof(double)
	addq			%rax, %r12 // B+offsetB*sizeof(double)

1:
	vmovapd			0(%r11), %ymm8
	vbroadcastsd	0(%r12), %ymm12
	vmulpd			%ymm8, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm0, %ymm0
	vbroadcastsd	32(%r12), %ymm12
	vmulpd			%ymm8, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm1, %ymm1

	subl			$1, %r10d // k-1
	subl			$1, %r15d // kend-1
	addq			$32, %r11 // A+1*bs*sizeof(double)
	addq			$8, %r12 // B+1*sizeof(double)

	cmpl			$0, %r15d
	jg				1b

	cmpl			$0, %r10d
	jle				2f // end

	addq			%r13, %r12
	subq			$32, %r12 // B+bs*(sdb-1)*sizeof(double)

2:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_edge_dgemm_add_nn_4x2_lib4, .-inner_edge_dgemm_add_nn_4x2_lib4
#endif
#endif
// end




// void inner_edge_dgemm_add_nn_2x4_lib4
// common inner routine with file scope
//
// edge for B unaligned
//
// input arguments:
// r10   <- k
// r11   <- A
// r12   <- B
// r13   <- bs*sdb*sizeof(double)
// r14   <- offB

//
// output arguments:
// r10   <- k-(4-offB)
// r11   <- A+(4-offB)*bs*sizeof(double)
// r12   <- B-offB+bs*sdb*sizeof(double)
// r13   <- bs*sdb*sizeof(double)
// r14   <- offB


#if MACRO_LEVEL>=1
	.macro INNER_EDGE_DGEMM_ADD_NN_2X4_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_edge_dgemm_add_nn_2x4_lib4, @function
inner_edge_dgemm_add_nn_2x4_lib4:
#elif defined(OS_MAC)
_inner_edge_dgemm_add_nn_2x4_lib4:
#elif defined(OS_WINDOWS)
	.def inner_edge_dgemm_add_nn_2x4_lib4; .scl 2; .type 32; .endef
inner_edge_dgemm_add_nn_2x4_lib4:
#endif
#endif

	cmpl			$0, %r14d // offset==0
	jle				2f // end

	cmpl			$0, %r10d // k==0
	jle				2f // end

	movl			$4, %r15d
	subl			%r14d, %r15d // 4-offsetB
	cmpl			%r10d, %r15d
//	jle				0f
//	movl			%r10d, %r15d // kend=min(k,4-offsetB)
//0:
	cmovgl			%r10d, %r15d // kend=min(k,4-offsetB)

	movl			%r14d, %eax
	sall			$3, %eax // offsetB*sizeof(double)
	addq			%rax, %r12 // B+offsetB*sizeof(double)

1:
	vbroadcastf128	0(%r11), %ymm11 // A
	vmovupd			0(%r12), %ymm13
	vmovupd			16(%r12), %ymm14
	vblendpd		$0x3, %ymm13, %ymm14, %ymm14
	vshufpd			$0x0, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm11, %ymm13, %ymm15
	vaddpd			%ymm0, %ymm15, %ymm0
	vmovupd			64(%r12), %ymm13
	vmovupd			80(%r12), %ymm14
	vblendpd		$0x3, %ymm13, %ymm14, %ymm14
	vshufpd			$0x0, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm11, %ymm13, %ymm15
	vaddpd			%ymm2, %ymm15, %ymm2

	subl			$1, %r10d // k-1
	subl			$1, %r15d // kend-1
	addq			$32, %r11 // A+1*bs*sizeof(double)
	addq			$8, %r12 // B+1*sizeof(double)

	cmpl			$0, %r15d
	jg				1b

	cmpl			$0, %r10d
	jle				2f // end

	addq			%r13, %r12
	subq			$32, %r12 // B+bs*(sdb-1)*sizeof(double)

2:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_edge_dgemm_add_nn_2x4_lib4, .-inner_edge_dgemm_add_nn_2x4_lib4
#endif
#endif
// end




// void inner_scale_ab_4x2_lib4
// common inner routine with file scope
//
// scale for generic alpha and beta
//
// input arguments:
// r10   <- alpha
// r11   <- beta
// r12   <- C
// r13   <- 4*sdc*sizeof(double)
// r15   <- dirty
// ymm0  <- [d00 d10 d20 d30]
// ymm1  <- [d01 d11 d21 d31]
//
// output arguments:
// r10   <- alpha
// r11   <- beta
// r12   <- C
// r13   <- 4*sdc*sizeof(double)
// r15   <- dirty
// ymm0  <- [d00 d10 d20 d30]
// ymm1  <- [d01 d11 d21 d31]

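// In C-like terms (illustrative sketch of what this routine computes):
//
//   for(jj=0; jj<2; jj++)
//       for(ii=0; ii<4; ii++)
//           d[ii+4*jj] = alpha*d[ii+4*jj] + beta*C[ii+4*jj];
//
// with the beta==0.0 comparison below skipping the read of C entirely.
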
#if MACRO_LEVEL>=1
	.macro INNER_SCALE_AB_4X2_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_scale_ab_4x2_lib4, @function
inner_scale_ab_4x2_lib4:
#elif defined(OS_MAC)
_inner_scale_ab_4x2_lib4:
#elif defined(OS_WINDOWS)
	.def inner_scale_ab_4x2_lib4; .scl 2; .type 32; .endef
inner_scale_ab_4x2_lib4:
#endif
#endif


	// alpha
	vbroadcastsd	0(%r10), %ymm15

	vmulpd		%ymm0, %ymm15, %ymm0
	vmulpd		%ymm1, %ymm15, %ymm1

	// beta
	vbroadcastsd	0(%r11), %ymm14

	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0

	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
	je			0f // end

	// alg==1
	vmovapd		0(%r12), %ymm15
	vmulpd		%ymm15, %ymm14, %ymm15
	vaddpd		%ymm0, %ymm15, %ymm0
	vmovapd		32(%r12), %ymm15
	vmulpd		%ymm15, %ymm14, %ymm15
	vaddpd		%ymm1, %ymm15, %ymm1

0:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_scale_ab_4x2_lib4, .-inner_scale_ab_4x2_lib4
#endif
#endif
// end




// void inner_scale_ab_2x4_lib4
// common inner routine with file scope
//
// scale for generic alpha and beta
//
// input arguments:
// r10   <- alpha
// r11   <- beta
// r12   <- C
// ymm0 <- [d00 d10]
// ymm1 <- [d01 d11]
// ymm2 <- [d02 d12]
// ymm3 <- [d03 d13]
// ymm8  <- dirty
// ymm9  <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10   <- alpha
// r11   <- beta
// r12   <- C
// ymm0 <- [d00 d10]
// ymm1 <- [d01 d11]
// ymm2 <- [d02 d12]
// ymm3 <- [d03 d13]
// ymm8  <- dirty
// ymm9  <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty

#if MACRO_LEVEL>=1
	.macro INNER_SCALE_AB_2X4_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_scale_ab_2x4_lib4, @function
inner_scale_ab_2x4_lib4:
#elif defined(OS_MAC)
_inner_scale_ab_2x4_lib4:
#elif defined(OS_WINDOWS)
	.def inner_scale_ab_2x4_lib4; .scl 2; .type 32; .endef
inner_scale_ab_2x4_lib4:
#endif
#endif

	// alpha
	vmovddup	0(%r10), %xmm15

	vmulpd		%xmm0, %xmm15, %xmm0
	vmulpd		%xmm1, %xmm15, %xmm1
	vmulpd		%xmm2, %xmm15, %xmm2
	vmulpd		%xmm3, %xmm15, %xmm3

	// beta
	vmovddup	0(%r11), %xmm14

	vxorpd		%xmm15, %xmm15, %xmm15 // 0.0

	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
	je			0f // end

	vmovapd		0(%r12), %xmm15
	vmulpd		%xmm14, %xmm15, %xmm15
	vaddpd		%xmm15, %xmm0, %xmm0
	vmovapd		32(%r12), %xmm15
	vmulpd		%xmm14, %xmm15, %xmm15
	vaddpd		%xmm15, %xmm1, %xmm1
	vmovapd		64(%r12), %xmm15
	vmulpd		%xmm14, %xmm15, %xmm15
	vaddpd		%xmm15, %xmm2, %xmm2
	vmovapd		96(%r12), %xmm15
	vmulpd		%xmm14, %xmm15, %xmm15
	vaddpd		%xmm15, %xmm3, %xmm3

0:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_scale_ab_2x4_lib4, .-inner_scale_ab_2x4_lib4
#endif
#endif

// end




// void inner_store_4x2_lib4
// common inner routine with file scope
//
// store n
//
// input arguments:
// r10  <- D
// r11  <- 4*sdd*sizeof(double)
// r15  <- dirty
// ymm0 <- [d00 d10 d20 d30]
// ymm1 <- [d01 d11 d21 d31]
//
// output arguments:
// r10  <- D
// r11  <- 4*sdd*sizeof(double)
// r15  <- dirty
// ymm0 <- [d00 d10 d20 d30]
// ymm1 <- [d01 d11 d21 d31]

#if MACRO_LEVEL>=1
	.macro INNER_STORE_4X2_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_store_4x2_lib4, @function
inner_store_4x2_lib4:
#elif defined(OS_MAC)
_inner_store_4x2_lib4:
#elif defined(OS_WINDOWS)
	.def inner_store_4x2_lib4; .scl 2; .type 32; .endef
inner_store_4x2_lib4:
#endif
#endif

	vmovapd %ymm0,  0(%r10)
	vmovapd %ymm1, 32(%r10)

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_store_4x2_lib4, .-inner_store_4x2_lib4
#endif
#endif
// end




// void inner_store_2x4_lib4
// common inner routine with file scope
//
// store n
//
// input arguments:
// r10   <- D
// ymm0  <- [d00 d10]
// ymm1  <- [d01 d11]
// ymm2  <- [d02 d12]
// ymm3  <- [d03 d13]
//
// output arguments:
// r10   <- D
// ymm0  <- [d00 d10]
// ymm1  <- [d01 d11]
// ymm2  <- [d02 d12]
// ymm3  <- [d03 d13]

#if MACRO_LEVEL>=1
	.macro INNER_STORE_2X4_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_store_2x4_lib4, @function
inner_store_2x4_lib4:
#elif defined(OS_MAC)
_inner_store_2x4_lib4:
#elif defined(OS_WINDOWS)
	.def inner_store_2x4_lib4; .scl 2; .type 32; .endef
inner_store_2x4_lib4:
#endif
#endif

	vmovapd %xmm0,   0(%r10)
	vmovapd %xmm1,  32(%r10)
	vmovapd %xmm2,  64(%r10)
	vmovapd %xmm3,  96(%r10)

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_store_2x4_lib4, .-inner_store_2x4_lib4
#endif
#endif
// end




// void inner_store_2x2_lib4
// common inner routine with file scope
//
// store n
//
// input arguments:
// r10   <- D
// ymm0  <- [d00 d10]
// ymm1  <- [d01 d11]
// ymm2  <- [d02 d12]
// ymm3  <- [d03 d13]
//
// output arguments:
// r10   <- D
// ymm0  <- [d00 d10]
// ymm1  <- [d01 d11]
// ymm2  <- [d02 d12]
// ymm3  <- [d03 d13]

#if MACRO_LEVEL>=1
	.macro INNER_STORE_2X2_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_store_2x2_lib4, @function
inner_store_2x2_lib4:
#elif defined(OS_MAC)
_inner_store_2x2_lib4:
#elif defined(OS_WINDOWS)
	.def inner_store_2x2_lib4; .scl 2; .type 32; .endef
inner_store_2x2_lib4:
#endif
#endif

	vmovapd %xmm0,   0(%r10)
	vmovapd %xmm1,  32(%r10)

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_store_2x2_lib4, .-inner_store_2x2_lib4
#endif
#endif
// end




// void inner_store_4x2_vs_lib4
// common inner routine with file scope
//
// store n vs
//
// input arguments:
// r10  <- D
// r11d  <- km
// r12d  <- kn
// ymm0 <- [d00 d10 d20 d30]
// ymm1 <- [d01 d11 d21 d31]
//
// output arguments:
// r10  <- D
// r11d  <- km
// r12d  <- kn
// ymm0 <- [d00 d10 d20 d30]
// ymm1 <- [d01 d11 d21 d31]

#if MACRO_LEVEL>=1
	.macro INNER_STORE_4X2_VS_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_store_4x2_vs_lib4, @function
inner_store_4x2_vs_lib4:
#elif defined(OS_MAC)
_inner_store_4x2_vs_lib4:
#elif defined(OS_WINDOWS)
	.def inner_store_4x2_vs_lib4; .scl 2; .type 32; .endef
inner_store_4x2_vs_lib4:
#endif
#endif

	vcvtsi2sd	%r11d, %xmm15, %xmm15
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovupd		.LC02(%rip), %ymm14
#elif defined(OS_MAC)
	vmovupd		LC02(%rip), %ymm14
#endif
	vmovddup	%xmm15, %xmm15
	vinsertf128	$1, %xmm15, %ymm15, %ymm15
	vsubpd		%ymm15, %ymm14, %ymm15
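	// Added note: .LC02 holds { 0.5, 1.5, 2.5, 3.5 } (element 0 first), so for
	// km=3 the subtraction gives { -2.5, -1.5, -0.5, 0.5 }; vmaskmovpd stores
	// only the lanes whose sign bit is set, i.e. the first km=3 rows.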

	vmaskmovpd	%ymm0, %ymm15,  0(%r10)
	cmpl		$2, %r12d
	jl			0f // end
	vmaskmovpd	%ymm1, %ymm15, 32(%r10)

0:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_store_4x2_vs_lib4, .-inner_store_4x2_vs_lib4
#endif
#endif
// end




// void inner_store_2x4_vs_lib4
// common inner routine with file scope
//
// store n
//
// input arguments:
// r10   <- D
// r11   <- m1
// r12   <- n1
// ymm0  <- [d00 d10]
// ymm1  <- [d01 d11]
// ymm2  <- [d02 d12]
// ymm3  <- [d03 d13]
//
// output arguments:
// r10   <- D
// r11   <- m1
// r12   <- n1
// ymm0  <- [d00 d10]
// ymm1  <- [d01 d11]
// ymm2  <- [d02 d12]
// ymm3  <- [d03 d13]

#if MACRO_LEVEL>=1
	.macro INNER_STORE_2X4_VS_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_store_2x4_vs_lib4, @function
inner_store_2x4_vs_lib4:
#elif defined(OS_MAC)
_inner_store_2x4_vs_lib4:
#elif defined(OS_WINDOWS)
	.def inner_store_2x4_vs_lib4; .scl 2; .type 32; .endef
inner_store_2x4_vs_lib4:
#endif
#endif

	vcvtsi2sd	%r11d, %xmm15, %xmm15
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovupd		.LC02(%rip), %ymm14
#elif defined(OS_MAC)
	vmovupd		LC02(%rip), %ymm14
#endif
	vmovddup	%xmm15, %xmm15
	vinsertf128	$1, %xmm15, %ymm15, %ymm15
	vsubpd		%ymm15, %ymm14, %ymm15

	cmpl		$2, %r12d
	vmaskmovpd	%xmm0, %xmm15,  0(%r10)
	jl			0f // end
	cmpl		$3, %r12d
	vmaskmovpd	%xmm1, %xmm15, 32(%r10)
	jl			0f // end
	vmaskmovpd	%xmm2, %xmm15, 64(%r10)
	je			0f // end
	vmaskmovpd	%xmm3, %xmm15, 96(%r10)

0:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_store_2x4_vs_lib4, .-inner_store_2x4_vs_lib4
#endif
#endif
// end




// ASM Kernels




// void kernel_dgemm_nn_4x2_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D);
//                               1      2              3          4            5          6        7             8          9
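
// Illustrative usage from C (a sketch, not part of this file): with A, B, C, D
// stored in the 4-row panel-major "lib4" layout and sdb the panel stride of B
// (the next 4-row panel of B starts 4*sdb doubles later), a call such as
//
//   double alpha = 1.0, beta = 1.0;
//   kernel_dgemm_nn_4x2_lib4(k, &alpha, A, offsetB, B, sdb, &beta, C, D);
//
// computes D[0:4,0:2] = alpha * A[0:4,0:k] * B[0:k,0:2] + beta * C[0:4,0:2],
// with offsetB the row offset of the first B element inside its panel.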

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dgemm_nn_4x2_lib4
	.type kernel_dgemm_nn_4x2_lib4, @function
kernel_dgemm_nn_4x2_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dgemm_nn_4x2_lib4
_kernel_dgemm_nn_4x2_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dgemm_nn_4x2_lib4
	.def kernel_dgemm_nn_4x2_lib4; .scl 2; .type 32; .endef
kernel_dgemm_nn_4x2_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorpd	%ymm0, %ymm0, %ymm0
	vmovapd	%ymm0, %ymm1
	vmovapd	%ymm0, %ymm2
	vmovapd	%ymm0, %ymm3


	// call inner dgemm kernel nn

	movq	ARG1, %r10 // k
	movq	ARG3, %r11  // A
	movq	ARG5, %r12  // B
	movq	ARG6, %r13 // sdb
	sall	$5, %r13d // 4*sdb*sizeof(double)
	movq	ARG4, %r14 // offsetB

#if MACRO_LEVEL>=1
	INNER_EDGE_DGEMM_ADD_NN_4X2_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_dgemm_add_nn_4x2_lib4
#elif defined(OS_MAC)
	callq _inner_edge_dgemm_add_nn_4x2_lib4
#endif
#endif

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_ADD_NN_4X2_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_add_nn_4x2_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_add_nn_4x2_lib4
#endif
#endif


	// call inner blend

	movq	ARG2, %r10 // alpha
	movq	ARG7, %r11 // beta
	movq	ARG8, %r12   // C

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_4X2_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_scale_ab_4x2_lib4
#elif defined(OS_MAC)
	callq _inner_scale_ab_4x2_lib4
#endif
#endif


	// store n

	movq	ARG9, %r10 // D

#if MACRO_LEVEL>=1
	INNER_STORE_4X2_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_4x2_lib4
#elif defined(OS_MAC)
	callq _inner_store_4x2_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_dgemm_nn_4x2_lib4, .-kernel_dgemm_nn_4x2_lib4
#endif
// end




// void kernel_dgemm_nn_4x2_vs_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D, int m1, int n1);
//                                  1      2              3          4            5          6        7             8          9          10      11
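
// Compared to the plain 4x2 kernel above, the _vs ("variable size") variant
// takes two extra arguments m1 and n1 that clamp the stored block: for example
// m1=3, n1=1 writes only a 3x1 corner of D, which is how partial edge blocks
// of the output matrix are handled.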

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dgemm_nn_4x2_vs_lib4
	.type kernel_dgemm_nn_4x2_vs_lib4, @function
kernel_dgemm_nn_4x2_vs_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dgemm_nn_4x2_vs_lib4
_kernel_dgemm_nn_4x2_vs_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dgemm_nn_4x2_vs_lib4
	.def kernel_dgemm_nn_4x2_vs_lib4; .scl 2; .type 32; .endef
kernel_dgemm_nn_4x2_vs_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorpd	%ymm0, %ymm0, %ymm0
	vmovapd	%ymm0, %ymm1
	vmovapd	%ymm0, %ymm2
	vmovapd	%ymm0, %ymm3


	// call inner dgemm kernel nn

	movq	ARG1, %r10 // k
	movq	ARG3, %r11  // A
	movq	ARG5, %r12  // B
	movq	ARG6, %r13 // sdb
	sall	$5, %r13d // 4*sdb*sizeof(double)
	movq	ARG4, %r14 // offsetB

#if MACRO_LEVEL>=1
	INNER_EDGE_DGEMM_ADD_NN_4X2_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_dgemm_add_nn_4x2_lib4
#elif defined(OS_MAC)
	callq _inner_edge_dgemm_add_nn_4x2_lib4
#endif
#endif

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_ADD_NN_4X2_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_add_nn_4x2_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_add_nn_4x2_lib4
#endif
#endif


	// call inner blend

	movq	ARG2, %r10 // alpha
	movq	ARG7, %r11 // beta
	movq	ARG8, %r12   // C

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_4X2_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_scale_ab_4x2_lib4
#elif defined(OS_MAC)
	callq _inner_scale_ab_4x2_lib4
#endif
#endif


	// store n

	movq	ARG9, %r10 // D
	movq	ARG10, %r11 // m1
	movq	ARG11, %r12 // n1

#if MACRO_LEVEL>=1
	INNER_STORE_4X2_VS_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_4x2_vs_lib4
#elif defined(OS_MAC)
	callq _inner_store_4x2_vs_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_dgemm_nn_4x2_vs_lib4, .-kernel_dgemm_nn_4x2_vs_lib4
#endif

// end




// void kernel_dgemm_nn_2x4_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D);
//                               1      2              3          4            5          6        7             8          9

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dgemm_nn_2x4_lib4
	.type kernel_dgemm_nn_2x4_lib4, @function
kernel_dgemm_nn_2x4_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dgemm_nn_2x4_lib4
_kernel_dgemm_nn_2x4_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dgemm_nn_2x4_lib4
	.def kernel_dgemm_nn_2x4_lib4; .scl 2; .type 32; .endef
kernel_dgemm_nn_2x4_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorpd	%ymm0, %ymm0, %ymm0
	vmovapd	%ymm0, %ymm1
	vmovapd	%ymm0, %ymm2
	vmovapd	%ymm0, %ymm3
	vmovapd	%ymm0, %ymm4
	vmovapd	%ymm0, %ymm5
	vmovapd	%ymm0, %ymm6
	vmovapd	%ymm0, %ymm7


	// call inner dgemm kernel nn

	movq	ARG1, %r10 // k
	movq	ARG3, %r11  // A
	movq	ARG5, %r12  // B
	movq	ARG6, %r13 // sdb
	sall	$5, %r13d // 4*sdb*sizeof(double)
	movq	ARG4, %r14 // offsetB

#if MACRO_LEVEL>=1
	INNER_EDGE_DGEMM_ADD_NN_2X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_dgemm_add_nn_2x4_lib4
#elif defined(OS_MAC)
	callq _inner_edge_dgemm_add_nn_2x4_lib4
#endif
#endif

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_ADD_NN_2X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_add_nn_2x4_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_add_nn_2x4_lib4
#endif
#endif


	// call inner blend

	movq	ARG2, %r10 // alpha
	movq	ARG7, %r11 // beta
	movq	ARG8, %r12   // C

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_2X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_scale_ab_2x4_lib4
#elif defined(OS_MAC)
	callq _inner_scale_ab_2x4_lib4
#endif
#endif


	// store n

	movq	ARG9, %r10 // D

#if MACRO_LEVEL>=1
	INNER_STORE_2X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_2x4_lib4
#elif defined(OS_MAC)
	callq _inner_store_2x4_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_dgemm_nn_2x4_lib4, .-kernel_dgemm_nn_2x4_lib4
#endif

// end




// void kernel_dgemm_nn_2x4_vs_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D, int m1, int n1);
//                                  1      2              3          4            5          6        7             8          9          10      11

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dgemm_nn_2x4_vs_lib4
	.type kernel_dgemm_nn_2x4_vs_lib4, @function
kernel_dgemm_nn_2x4_vs_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dgemm_nn_2x4_vs_lib4
_kernel_dgemm_nn_2x4_vs_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dgemm_nn_2x4_vs_lib4
	.def kernel_dgemm_nn_2x4_vs_lib4; .scl 2; .type 32; .endef
kernel_dgemm_nn_2x4_vs_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorpd	%ymm0, %ymm0, %ymm0
	vmovapd	%ymm0, %ymm1
	vmovapd	%ymm0, %ymm2
	vmovapd	%ymm0, %ymm3
	vmovapd	%ymm0, %ymm4
	vmovapd	%ymm0, %ymm5
	vmovapd	%ymm0, %ymm6
	vmovapd	%ymm0, %ymm7


	// call inner dgemm kernel nn

	movq	ARG1, %r10 // k
	movq	ARG3, %r11  // A
	movq	ARG5, %r12  // B
	movq	ARG6, %r13 // sdb
	sall	$5, %r13d // 4*sdb*sizeof(double)
	movq	ARG4, %r14 // offsetB

#if MACRO_LEVEL>=1
	INNER_EDGE_DGEMM_ADD_NN_2X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_dgemm_add_nn_2x4_lib4
#elif defined(OS_MAC)
	callq _inner_edge_dgemm_add_nn_2x4_lib4
#endif
#endif

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_ADD_NN_2X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_add_nn_2x4_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_add_nn_2x4_lib4
#endif
#endif


	// call inner blend

	movq	ARG2, %r10 // alpha
	movq	ARG7, %r11 // beta
	movq	ARG8, %r12   // C

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_2X4_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_scale_ab_2x4_lib4
#elif defined(OS_MAC)
	callq _inner_scale_ab_2x4_lib4
#endif
#endif


	// store n

	movq	ARG9, %r10 // D
	movq	ARG10, %r11 // m1
	movq	ARG11, %r12 // n1

#if MACRO_LEVEL>=1
	INNER_STORE_2X4_VS_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_2x4_vs_lib4
#elif defined(OS_MAC)
	callq _inner_store_2x4_vs_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_dgemm_nn_2x4_vs_lib4, .-kernel_dgemm_nn_2x4_vs_lib4
#endif

// end




// void kernel_dgemm_nn_2x2_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D);
//                               1      2              3          4            5          6        7             8          9

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dgemm_nn_2x2_lib4
	.type kernel_dgemm_nn_2x2_lib4, @function
kernel_dgemm_nn_2x2_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dgemm_nn_2x2_lib4
_kernel_dgemm_nn_2x2_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dgemm_nn_2x2_lib4
	.def kernel_dgemm_nn_2x2_lib4; .scl 2; .type 32; .endef
kernel_dgemm_nn_2x2_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorpd	%ymm0, %ymm0, %ymm0
	vmovapd	%ymm0, %ymm1
	vmovapd	%ymm0, %ymm2
	vmovapd	%ymm0, %ymm3


	// call inner dgemm kernel nn

	movq	ARG1, %r10 // k
	movq	ARG3, %r11  // A
	movq	ARG5, %r12  // B
	movq	ARG6, %r13 // sdb
	sall	$5, %r13d // 4*sdb*sizeof(double)
	movq	ARG4, %r14 // offsetB

#if MACRO_LEVEL>=1
	INNER_EDGE_DGEMM_ADD_NN_4X2_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_dgemm_add_nn_4x2_lib4
#elif defined(OS_MAC)
	callq _inner_edge_dgemm_add_nn_4x2_lib4
#endif
#endif

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_ADD_NN_4X2_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_add_nn_4x2_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_add_nn_4x2_lib4
#endif
#endif


	// call inner blend

	movq	ARG2, %r10 // alpha
	movq	ARG7, %r11 // beta
	movq	ARG8, %r12   // C

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_4X2_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_scale_ab_4x2_lib4
#elif defined(OS_MAC)
	callq _inner_scale_ab_4x2_lib4
#endif
#endif


	// store n

	movq	ARG9, %r10 // D

#if MACRO_LEVEL>=1
	INNER_STORE_2X2_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_2x2_lib4
#elif defined(OS_MAC)
	callq _inner_store_2x2_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_dgemm_nn_2x2_lib4, .-kernel_dgemm_nn_2x2_lib4
#endif
// end




// Data




	// read-only data
#if defined(OS_LINUX)
	.section	.rodata.cst32,"aM",@progbits,32
#elif defined(OS_MAC)
	.section	__TEXT,__const
#elif defined(OS_WINDOWS)
	.section .rdata,"dr"
#endif

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC00: // { -1 -1 -1 1 }
#elif defined(OS_MAC)
LC00: // { -1 -1 -1 1 }
	.align 5
#endif
	.quad	-1
	.quad	-1
	.quad	-1
	.quad	1

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC01: // { -1 -1 -1 -1 }
#elif defined(OS_MAC)
LC01: // { -1 -1 -1 -1 }
	.align 5
#endif
	.quad	-1
	.quad	-1
	.quad	-1
	.quad	-1

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC02: // { 3.5 2.5 1.5 0.5 }
#elif defined(OS_MAC)
LC02: // { 3.5 2.5 1.5 0.5 }
	.align 5
#endif
	.long	0
	.long	1071644672
	.long	0
	.long	1073217536
	.long	0
	.long	1074003968
	.long	0
	.long	1074528256
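
	// Note: each pair of .long values above is the little-endian IEEE-754
	// image of one double (low word first), e.g. 0x3FE00000_00000000 = 0.5 and
	// 0x400C0000_00000000 = 3.5, so .LC02 holds { 0.5, 1.5, 2.5, 3.5 } from
	// element 0 upward; it is the constant used to build the row masks in the
	// _vs store routines.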

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC03: // { 7.5 6.5 5.5 4.5 }
#elif defined(OS_MAC)
LC03: // { 7.5 6.5 5.5 4.5 }
	.align 5
#endif
	.long	0
	.long	1074921472
	.long	0
	.long	1075183616
	.long	0
	.long	1075445760
	.long	0
	.long	1075707904

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC04: // { 1.0 1.0 1.0 1.0 }
#elif defined(OS_MAC)
LC04: // { 1.0 1.0 1.0 1.0 }
	.align 5
#endif
	.long	0
	.long	1072693248
	.long	0
	.long	1072693248
	.long	0
	.long	1072693248
	.long	0
	.long	1072693248

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC05: // { 1.0 1.0 1.0 -1.0 }
#elif defined(OS_MAC)
	.align 5
LC05: // { 1.0 1.0 1.0 -1.0 }
#endif
	.long	0
	.long	-1074790400
	.long	0
	.long	1072693248
	.long	0
	.long	1072693248
	.long	0
	.long	1072693248

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC06: // { 1.0 1.0 -1.0 -1.0 }
#elif defined(OS_MAC)
	.align 5
LC06: // { 1.0 1.0 -1.0 -1.0 }
#endif
	.long	0
	.long	-1074790400
	.long	0
	.long	-1074790400
	.long	0
	.long	1072693248
	.long	0
	.long	1072693248

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC07: // { 1.0 -1.0 -1.0 -1.0 }
#elif defined(OS_MAC)
	.align 5
LC07: // { 1.0 -1.0 -1.0 -1.0 }
#endif
	.long	0
	.long	-1074790400
	.long	0
	.long	-1074790400
	.long	0
	.long	-1074790400
	.long	0
	.long	1072693248

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC08: // { -1.0 -1.0 -1.0 1.0 }
#elif defined(OS_MAC)
	.align 5
LC08: // { -1.0 -1.0 -1.0 1.0 }
#endif
	.long	0
	.long	1072693248
	.long	0
	.long	-1074790400
	.long	0
	.long	-1074790400
	.long	0
	.long	-1074790400

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC09: // { -1.0 -1.0 1.0 1.0 }
#elif defined(OS_MAC)
	.align 5
LC09: // { -1.0 -1.0 1.0 1.0 }
#endif
	.long	0
	.long	1072693248
	.long	0
	.long	1072693248
	.long	0
	.long	-1074790400
	.long	0
	.long	-1074790400

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC10: // { -1.0 1.0 1.0 1.0 }
#elif defined(OS_MAC)
	.align 5
LC10: // { -1.0 1.0 1.0 1.0 }
#endif
	.long	0
	.long	1072693248
	.long	0
	.long	1072693248
	.long	0
	.long	1072693248
	.long	0
	.long	-1074790400




#if defined(OS_LINUX)
	.section	.note.GNU-stack,"",@progbits
#elif defined(OS_MAC)
	.subsections_via_symbols
#endif