1/**************************************************************************************************
2*                                                                                                 *
3* This file is part of BLASFEO.                                                                   *
4*                                                                                                 *
5* BLASFEO -- BLAS For Embedded Optimization.                                                      *
6* Copyright (C) 2019 by Gianluca Frison.                                                          *
7* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
8* All rights reserved.                                                                            *
9*                                                                                                 *
10* The 2-Clause BSD License                                                                        *
11*                                                                                                 *
12* Redistribution and use in source and binary forms, with or without                              *
13* modification, are permitted provided that the following conditions are met:                     *
14*                                                                                                 *
15* 1. Redistributions of source code must retain the above copyright notice, this                  *
16*    list of conditions and the following disclaimer.                                             *
17* 2. Redistributions in binary form must reproduce the above copyright notice,                    *
18*    this list of conditions and the following disclaimer in the documentation                    *
19*    and/or other materials provided with the distribution.                                       *
20*                                                                                                 *
21* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND                 *
22* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED                   *
23* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE                          *
24* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR                 *
25* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES                  *
26* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;                    *
27* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND                     *
28* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT                      *
29* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS                   *
30* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                                    *
31*                                                                                                 *
32* Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de                             *
33*                                                                                                 *
34**************************************************************************************************/
35
36#if defined(OS_LINUX) | defined(OS_MAC)
37
38//#define STACKSIZE 96
39#define STACKSIZE 64
40#define ARG1  %rdi
41#define ARG2  %rsi
42#define ARG3  %rdx
43#define ARG4  %rcx
44#define ARG5  %r8
45#define ARG6  %r9
46#define ARG7  STACKSIZE +  8(%rsp)
47#define ARG8  STACKSIZE + 16(%rsp)
48#define ARG9  STACKSIZE + 24(%rsp)
49#define ARG10 STACKSIZE + 32(%rsp)
50#define ARG11 STACKSIZE + 40(%rsp)
51#define ARG12 STACKSIZE + 48(%rsp)
52#define ARG13 STACKSIZE + 56(%rsp)
53#define ARG14 STACKSIZE + 64(%rsp)
54#define ARG15 STACKSIZE + 72(%rsp)
55#define ARG16 STACKSIZE + 80(%rsp)
56#define ARG17 STACKSIZE + 88(%rsp)
57#define ARG18 STACKSIZE + 96(%rsp)
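// arguments 7..18 are passed on the stack by the caller; once the prologue has
// moved %rsp down by STACKSIZE, the return address sits at STACKSIZE(%rsp) and
// the i-th stack argument at STACKSIZE + 8*(i-6)(%rsp), which is what the
// ARG7..ARG18 macros above encode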
58#define PROLOGUE \
59	subq	$STACKSIZE, %rsp; \
60	movq	%rbx,   (%rsp); \
61	movq	%rbp,  8(%rsp); \
62	movq	%r12, 16(%rsp); \
63	movq	%r13, 24(%rsp); \
64	movq	%r14, 32(%rsp); \
65	movq	%r15, 40(%rsp); \
66	vzeroupper;
67#define EPILOGUE \
68	vzeroupper; \
69	movq	  (%rsp), %rbx; \
70	movq	 8(%rsp), %rbp; \
71	movq	16(%rsp), %r12; \
72	movq	24(%rsp), %r13; \
73	movq	32(%rsp), %r14; \
74	movq	40(%rsp), %r15; \
75	addq	$STACKSIZE, %rsp;
76
77#if defined(OS_LINUX)
78
79#define GLOB_FUN_START(NAME) \
80	.globl NAME; \
81	.type NAME, @function; \
82NAME:
83#define FUN_START(NAME) \
84	.type NAME, @function; \
85NAME:
86#define FUN_END(NAME) \
87	.size	NAME, .-NAME
88#define CALL(NAME) \
89	call NAME
90#define ZERO_ACC \
91	vxorps	%ymm0, %ymm0, %ymm0; \
92	vmovaps	%ymm0, %ymm1; \
93	vmovaps	%ymm0, %ymm2; \
94	vmovaps	%ymm0, %ymm3; \
95	vmovaps	%ymm0, %ymm4; \
96	vmovaps	%ymm0, %ymm5; \
97	vmovaps	%ymm0, %ymm6; \
98	vmovaps	%ymm0, %ymm7
99//#define NEG_ACC \
100//	vmovapd		.LC11(%rip), %ymm15; \
101//	vxorpd		%ymm15, %ymm0, %ymm0; \
102//	vxorpd		%ymm15, %ymm1, %ymm1; \
103//	vxorpd		%ymm15, %ymm2, %ymm2; \
104//	vxorpd		%ymm15, %ymm3, %ymm3
105
106#else // defined(OS_MAC)
107
108#define GLOB_FUN_START(NAME) \
109	.globl _ ## NAME; \
110_ ## NAME:
111#define FUN_START(NAME) \
112_ ## NAME:
113#define FUN_END(NAME)
114#define CALL(NAME) \
115	callq _ ## NAME
116#define ZERO_ACC \
117	vxorps	%ymm0, %ymm0, %ymm0; \
118	vmovaps	%ymm0, %ymm1; \
119	vmovaps	%ymm0, %ymm2; \
120	vmovaps	%ymm0, %ymm3; \
121	vmovaps	%ymm0, %ymm4; \
122	vmovaps	%ymm0, %ymm5; \
123	vmovaps	%ymm0, %ymm6; \
124	vmovaps	%ymm0, %ymm7
125//#define NEG_ACC \
126//	vmovapd		LC11(%rip), %ymm15; \
127//	vxorpd		%ymm15, %ymm0, %ymm0; \
128//	vxorpd		%ymm15, %ymm1, %ymm1; \
129//	vxorpd		%ymm15, %ymm2, %ymm2; \
130//	vxorpd		%ymm15, %ymm3, %ymm3
131
132#endif
133
134#elif defined(OS_WINDOWS)
135
136#define STACKSIZE 256
137#define ARG1  %rcx
138#define ARG2  %rdx
139#define ARG3  %r8
140#define ARG4  %r9
141#define ARG5  STACKSIZE + 40(%rsp)
142#define ARG6  STACKSIZE + 48(%rsp)
143#define ARG7  STACKSIZE + 56(%rsp)
144#define ARG8  STACKSIZE + 64(%rsp)
145#define ARG9  STACKSIZE + 72(%rsp)
146#define ARG10 STACKSIZE + 80(%rsp)
147#define ARG11 STACKSIZE + 88(%rsp)
148#define ARG12 STACKSIZE + 96(%rsp)
149#define ARG13 STACKSIZE + 104(%rsp)
150#define ARG14 STACKSIZE + 112(%rsp)
151#define ARG15 STACKSIZE + 120(%rsp)
152#define ARG16 STACKSIZE + 128(%rsp)
153#define ARG17 STACKSIZE + 136(%rsp)
154#define ARG18 STACKSIZE + 144(%rsp)
155#define PROLOGUE \
156	subq	$STACKSIZE, %rsp; \
157	movq	%rbx,   (%rsp); \
158	movq	%rbp,  8(%rsp); \
159	movq	%r12, 16(%rsp); \
160	movq	%r13, 24(%rsp); \
161	movq	%r14, 32(%rsp); \
162	movq	%r15, 40(%rsp); \
163	movq	%rdi, 48(%rsp); \
164	movq	%rsi, 56(%rsp); \
165	vmovups	%xmm6, 64(%rsp); \
166	vmovups	%xmm7, 80(%rsp); \
167	vmovups	%xmm8, 96(%rsp); \
168	vmovups	%xmm9, 112(%rsp); \
169	vmovups	%xmm10, 128(%rsp); \
170	vmovups	%xmm11, 144(%rsp); \
171	vmovups	%xmm12, 160(%rsp); \
172	vmovups	%xmm13, 176(%rsp); \
173	vmovups	%xmm14, 192(%rsp); \
174	vmovups	%xmm15, 208(%rsp); \
175	vzeroupper;
176#define EPILOGUE \
177	vzeroupper; \
178	movq	  (%rsp), %rbx; \
179	movq	 8(%rsp), %rbp; \
180	movq	16(%rsp), %r12; \
181	movq	24(%rsp), %r13; \
182	movq	32(%rsp), %r14; \
183	movq	40(%rsp), %r15; \
184	movq	48(%rsp), %rdi; \
185	movq	56(%rsp), %rsi; \
186	vmovups	64(%rsp), %xmm6; \
187	vmovups	80(%rsp), %xmm7; \
188	vmovups	96(%rsp), %xmm8; \
189	vmovups	112(%rsp), %xmm9; \
190	vmovups	128(%rsp), %xmm10; \
191	vmovups	144(%rsp), %xmm11; \
192	vmovups	160(%rsp), %xmm12; \
193	vmovups	176(%rsp), %xmm13; \
194	vmovups	192(%rsp), %xmm14; \
195	vmovups	208(%rsp), %xmm15; \
196	addq	$STACKSIZE, %rsp;
197
198#define GLOB_FUN_START(NAME) \
199	.globl NAME; \
200	.def NAME; .scl 2; .type 32; .endef; \
201NAME:
202#define FUN_START(NAME) \
203	.def NAME; .scl 2; .type 32; .endef; \
204NAME:
205#define FUN_END(NAME)
206#define CALL(NAME) \
207	call NAME
208#define ZERO_ACC \
209	vxorps	%ymm0, %ymm0, %ymm0; \
210	vmovaps	%ymm0, %ymm1; \
211	vmovaps	%ymm0, %ymm2; \
212	vmovaps	%ymm0, %ymm3; \
213	vmovaps	%ymm0, %ymm4; \
214	vmovaps	%ymm0, %ymm5; \
215	vmovaps	%ymm0, %ymm6; \
216	vmovaps	%ymm0, %ymm7
217//#define NEG_ACC \
218//	vmovapd		.LC11(%rip), %ymm15; \
219//	vxorpd		%ymm15, %ymm0, %ymm0; \
220//	vxorpd		%ymm15, %ymm1, %ymm1; \
221//	vxorpd		%ymm15, %ymm2, %ymm2; \
222//	vxorpd		%ymm15, %ymm3, %ymm3
223
224#else
225
226#error wrong OS
227
228#endif
229
230
231
232#if defined(OS_LINUX) | defined(OS_WINDOWS)
233	.text
234#elif defined(OS_MAC)
235	.section	__TEXT,__text,regular,pure_instructions
236#endif
237
238
239
240// common inner routine with file scope
241//
242// input arguments:
243// r10d   <- k
244// r11   <- A
245// r12   <- B
246// ymm0  <- [d00 d11 d22 d33 d40 d51 d62 d73]
247// ymm1  <- [d01 d10 d23 d32 d41 d50 d63 d72]
248// ymm2  <- [d03 d12 d21 d30 d43 d52 d61 d70]
249// ymm3  <- [d02 d13 d20 d31 d42 d53 d60 d71]
250// ymm4  <- []
251// ymm5  <- []
252// ymm6  <- []
253// ymm7  <- []
254// ymm8  <- dirty
255// ymm9  <- dirty
256// ymm10 <- dirty
257// ymm11 <- dirty
258// ymm12 <- dirty
259// ymm13 <- dirty
260// ymm14 <- dirty
261// ymm15 <- dirty
262
263//
264// output arguments:
265// r10d  <- 0
// r11   <- A+8*k*sizeof(float)
// r12   <- B+8*k*sizeof(float)
268// ymm0  <- [d00 d11 d22 d33 d40 d51 d62 d73]
269// ymm1  <- [d01 d10 d23 d32 d41 d50 d63 d72]
270// ymm2  <- [d03 d12 d21 d30 d43 d52 d61 d70]
271// ymm3  <- [d02 d13 d20 d31 d42 d53 d60 d71]
272// ymm4  <- []
273// ymm5  <- []
274// ymm6  <- []
275// ymm7  <- []
276// ymm8  <- dirty
277// ymm9  <- dirty
278// ymm10 <- dirty
279// ymm11 <- dirty
280// ymm12 <- dirty
281// ymm13 <- dirty
282// ymm14 <- dirty
283// ymm15 <- dirty
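//
// in plain C, the accumulation below is roughly (reference sketch, assuming the
// lib8 panel-major layout with bs=8 and the broadcast scheme selected below, so
// that ymm0..ymm7 hold columns 0..7 of the 8x8 accumulator):
//
//   for(kk=0; kk<k; kk++)
//     for(jj=0; jj<8; jj++)
//       for(ii=0; ii<8; ii++)
//         D[ii][jj] += A[ii+8*kk] * B[jj+8*kk];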
284
285#if MACRO_LEVEL>=2
286	.macro INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
287#else
288	.p2align 4,,15
289	FUN_START(inner_kernel_gemm_add_nt_8x8_lib8)
290#endif
291
292
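// two functionally equivalent inner-loop variants are kept below: the broadcast
// scheme (selected by the "#if 1") broadcasts one element of B at a time and
// accumulates one plain column of the result per register, while the shuffle
// scheme in the "#else" branch keeps 4 B values per 128-bit lane and rotates
// them with vshufps, producing the permuted accumulator layout described in the
// header above (undone later, e.g. by INNER_BLEND_SCALE_AB_8X8_LIB8)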
293// broadcast scheme
294#if 1
295
296	cmpl	$0, %r10d
297	jle		2f // return
298
299	// preload
300
301	cmpl	$3, %r10d
302	jle		4f // consider clean-up loop
303
304	// main loop
305	.p2align 3
3061: // main loop
307
308	// unroll 0
309	vmovaps			0(%r11), %ymm13 // A
310	vbroadcastss	0(%r12), %ymm12 // B
311	vfmadd231ps		%ymm13, %ymm12, %ymm0
312	vbroadcastss	4(%r12), %ymm12 // B
313	vfmadd231ps		%ymm13, %ymm12, %ymm1
314	vbroadcastss	8(%r12), %ymm12 // B
315	vfmadd231ps		%ymm13, %ymm12, %ymm2
316	vbroadcastss	12(%r12), %ymm12 // B
317	vfmadd231ps		%ymm13, %ymm12, %ymm3
318	vbroadcastss	16(%r12), %ymm12 // B
319	vfmadd231ps		%ymm13, %ymm12, %ymm4
320	vbroadcastss	20(%r12), %ymm12 // B
321	vfmadd231ps		%ymm13, %ymm12, %ymm5
322	vbroadcastss	24(%r12), %ymm12 // B
323	vfmadd231ps		%ymm13, %ymm12, %ymm6
324	vbroadcastss	28(%r12), %ymm12 // B
325	vfmadd231ps		%ymm13, %ymm12, %ymm7
326	subl	$4, %r10d
327
	// unroll 1
329	vmovaps			32(%r11), %ymm13 // A
330	vbroadcastss	32(%r12), %ymm12 // B
331	vfmadd231ps		%ymm13, %ymm12, %ymm0
332	vbroadcastss	36(%r12), %ymm12 // B
333	vfmadd231ps		%ymm13, %ymm12, %ymm1
334	vbroadcastss	40(%r12), %ymm12 // B
335	vfmadd231ps		%ymm13, %ymm12, %ymm2
336	vbroadcastss	44(%r12), %ymm12 // B
337	vfmadd231ps		%ymm13, %ymm12, %ymm3
338	vbroadcastss	48(%r12), %ymm12 // B
339	vfmadd231ps		%ymm13, %ymm12, %ymm4
340	vbroadcastss	52(%r12), %ymm12 // B
341	vfmadd231ps		%ymm13, %ymm12, %ymm5
342	vbroadcastss	56(%r12), %ymm12 // B
343	vfmadd231ps		%ymm13, %ymm12, %ymm6
344	vbroadcastss	60(%r12), %ymm12 // B
345	vfmadd231ps		%ymm13, %ymm12, %ymm7
346	addq	$128, %r11
347
	// unroll 2
349	vmovaps			-64(%r11), %ymm13 // A
350	vbroadcastss	64(%r12), %ymm12 // B
351	vfmadd231ps		%ymm13, %ymm12, %ymm0
352	vbroadcastss	68(%r12), %ymm12 // B
353	vfmadd231ps		%ymm13, %ymm12, %ymm1
354	vbroadcastss	72(%r12), %ymm12 // B
355	vfmadd231ps		%ymm13, %ymm12, %ymm2
356	vbroadcastss	76(%r12), %ymm12 // B
357	vfmadd231ps		%ymm13, %ymm12, %ymm3
358	vbroadcastss	80(%r12), %ymm12 // B
359	vfmadd231ps		%ymm13, %ymm12, %ymm4
360	vbroadcastss	84(%r12), %ymm12 // B
361	vfmadd231ps		%ymm13, %ymm12, %ymm5
362	vbroadcastss	88(%r12), %ymm12 // B
363	vfmadd231ps		%ymm13, %ymm12, %ymm6
364	vbroadcastss	92(%r12), %ymm12 // B
365	vfmadd231ps		%ymm13, %ymm12, %ymm7
366	addq	$128, %r12
367
	// unroll 3
369	vmovaps			-32(%r11), %ymm13 // A
370	vbroadcastss	-32(%r12), %ymm12 // B
371	vfmadd231ps		%ymm13, %ymm12, %ymm0
372	vbroadcastss	-28(%r12), %ymm12 // B
373	vfmadd231ps		%ymm13, %ymm12, %ymm1
374	vbroadcastss	-24(%r12), %ymm12 // B
375	vfmadd231ps		%ymm13, %ymm12, %ymm2
376	vbroadcastss	-20(%r12), %ymm12 // B
377	vfmadd231ps		%ymm13, %ymm12, %ymm3
378	vbroadcastss	-16(%r12), %ymm12 // B
379	vfmadd231ps		%ymm13, %ymm12, %ymm4
380	vbroadcastss	-12(%r12), %ymm12 // B
381	vfmadd231ps		%ymm13, %ymm12, %ymm5
382	vbroadcastss	-8(%r12), %ymm12 // B
383	vfmadd231ps		%ymm13, %ymm12, %ymm6
384	vbroadcastss	-4(%r12), %ymm12 // B
385	vfmadd231ps		%ymm13, %ymm12, %ymm7
386
387	cmpl	$4, %r10d
388	jg		1b // main loop
389
390
3914: // consider clean1-up loop
392
393	cmpl	$0, %r10d
394	jle		2f // return
395
396	// clean-up loop
3973: // clean up loop
398
399	// unroll 0
400	vmovaps			0(%r11), %ymm13 // a
401	vbroadcastss	0(%r12), %ymm12 // b
402	vfmadd231ps		%ymm13, %ymm12, %ymm0
403	vbroadcastss	4(%r12), %ymm12 // b
404	vfmadd231ps		%ymm13, %ymm12, %ymm1
405	subl	$1, %r10d
406	vbroadcastss	8(%r12), %ymm12 // b
407	vfmadd231ps		%ymm13, %ymm12, %ymm2
408	addq	$32, %r11
409	vbroadcastss	12(%r12), %ymm12 // b
410	vfmadd231ps		%ymm13, %ymm12, %ymm3
411	vbroadcastss	16(%r12), %ymm12 // B
412	vfmadd231ps		%ymm13, %ymm12, %ymm4
413	vbroadcastss	20(%r12), %ymm12 // B
414	vfmadd231ps		%ymm13, %ymm12, %ymm5
415	vbroadcastss	24(%r12), %ymm12 // B
416	vfmadd231ps		%ymm13, %ymm12, %ymm6
417	vbroadcastss	28(%r12), %ymm12 // B
418	vfmadd231ps		%ymm13, %ymm12, %ymm7
419	addq	$32, %r12
420
421	cmpl	$0, %r10d
422	jg		3b // clean up loop
423
424
4252: // return
426
427// shuffle scheme
428#else
429
430	cmpl	$0, %r10d
431	jle		2f // return
432
433	// preload
434	vbroadcastf128	0(%r12), %ymm14 // B
435	vmovaps			0(%r11), %ymm12 // A
436	vbroadcastf128	16(%r12), %ymm15 // B
437	vmovaps			32(%r11), %ymm13 // A
438
439	cmpl	$4, %r10d
440	jle		0f // consider clean-up loop
441
442	// main loop
443	.p2align 3
4441: // main loop
445
446	// unroll 0
447	vfmadd231ps		%ymm12, %ymm14, %ymm0
448	vshufps			$0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
449
450	vfmadd231ps		%ymm12, %ymm14, %ymm1
451	vshufps			$0x4e, %ymm14, %ymm14, %ymm14 // 01 00 11 10
452
453	vfmadd231ps		%ymm12, %ymm14, %ymm2
454	vshufps			$0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
455
456	vfmadd231ps		%ymm12, %ymm14, %ymm3
457	vbroadcastf128	32(%r12), %ymm14 // B
458
459	vfmadd231ps		%ymm12, %ymm15, %ymm4
460	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
461
462	vfmadd231ps		%ymm12, %ymm15, %ymm5
463	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
464
465	vfmadd231ps		%ymm12, %ymm15, %ymm6
466	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
467
468	vfmadd231ps		%ymm12, %ymm15, %ymm7
469	vbroadcastf128	48(%r12), %ymm15 // B
470	vmovaps			64(%r11), %ymm12 // A
471
472
473	// unroll 1
474	vfmadd231ps		%ymm13, %ymm14, %ymm0
475	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
476
477	vfmadd231ps		%ymm13, %ymm14, %ymm1
478	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
479
480	vfmadd231ps		%ymm13, %ymm14, %ymm2
481	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
482
483	vfmadd231ps		%ymm13, %ymm14, %ymm3
484	vbroadcastf128	64(%r12), %ymm14 // B
485
486	vfmadd231ps		%ymm13, %ymm15, %ymm4
487	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
488
489	vfmadd231ps		%ymm13, %ymm15, %ymm5
490	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
491
492	vfmadd231ps		%ymm13, %ymm15, %ymm6
493	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
494
495	vfmadd231ps		%ymm13, %ymm15, %ymm7
496	vbroadcastf128	80(%r12), %ymm15 // B
497	vmovaps			96(%r11), %ymm13 // A
498
499
500	// unroll 2
501	vfmadd231ps		%ymm12, %ymm14, %ymm0
502	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
503
504	vfmadd231ps		%ymm12, %ymm14, %ymm1
505	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
506
507	vfmadd231ps		%ymm12, %ymm14, %ymm2
508	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
509
510	vfmadd231ps		%ymm12, %ymm14, %ymm3
511	vbroadcastf128	96(%r12), %ymm14 // B
512
513	vfmadd231ps		%ymm12, %ymm15, %ymm4
514	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
515
516	vfmadd231ps		%ymm12, %ymm15, %ymm5
517	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
518
519	vfmadd231ps		%ymm12, %ymm15, %ymm6
520	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
521
522	vfmadd231ps		%ymm12, %ymm15, %ymm7
523	vbroadcastf128	112(%r12), %ymm15 // B
524	vmovaps			128(%r11), %ymm12 // A
525
526	subl	$4, %r10d
527	addq	$128, %r11
528	addq	$128, %r12
529
530	// unroll 3
531	vfmadd231ps		%ymm13, %ymm14, %ymm0
532	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
533
534	vfmadd231ps		%ymm13, %ymm14, %ymm1
535	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
536
537	vfmadd231ps		%ymm13, %ymm14, %ymm2
538	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
539
540	vfmadd231ps		%ymm13, %ymm14, %ymm3
541	vbroadcastf128	0(%r12), %ymm14 // B
542
543	vfmadd231ps		%ymm13, %ymm15, %ymm4
544	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
545
546	vfmadd231ps		%ymm13, %ymm15, %ymm5
547	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
548
549	vfmadd231ps		%ymm13, %ymm15, %ymm6
550	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
551
552	vfmadd231ps		%ymm13, %ymm15, %ymm7
553	vbroadcastf128	16(%r12), %ymm15 // B
554	vmovaps			32(%r11), %ymm13 // A
555
556	cmpl	$4, %r10d
557	jg		1b // main loop
558
559
5600: // consider clean4-up
561
562	cmpl	$3, %r10d
563	jle		4f // clean1
564
565
566	// unroll 0
567	vfmadd231ps		%ymm12, %ymm14, %ymm0
568	vshufps			$0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
569
570	vfmadd231ps		%ymm12, %ymm14, %ymm1
571	vshufps			$0x4e, %ymm14, %ymm14, %ymm14 // 01 00 11 10
572
573	vfmadd231ps		%ymm12, %ymm14, %ymm2
574	vshufps			$0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01
575
576	vfmadd231ps		%ymm12, %ymm14, %ymm3
577	vbroadcastf128	32(%r12), %ymm14 // B
578
579	vfmadd231ps		%ymm12, %ymm15, %ymm4
580	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
581
582	vfmadd231ps		%ymm12, %ymm15, %ymm5
583	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
584
585	vfmadd231ps		%ymm12, %ymm15, %ymm6
586	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
587
588	vfmadd231ps		%ymm12, %ymm15, %ymm7
589	vbroadcastf128	48(%r12), %ymm15 // B
590	vmovaps			64(%r11), %ymm12 // A
591
592
593	// unroll 1
594	vfmadd231ps		%ymm13, %ymm14, %ymm0
595	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
596
597	vfmadd231ps		%ymm13, %ymm14, %ymm1
598	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
599
600	vfmadd231ps		%ymm13, %ymm14, %ymm2
601	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
602
603	vfmadd231ps		%ymm13, %ymm14, %ymm3
604	vbroadcastf128	64(%r12), %ymm14 // B
605
606	vfmadd231ps		%ymm13, %ymm15, %ymm4
607	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
608
609	vfmadd231ps		%ymm13, %ymm15, %ymm5
610	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
611
612	vfmadd231ps		%ymm13, %ymm15, %ymm6
613	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
614
615	vfmadd231ps		%ymm13, %ymm15, %ymm7
616	vbroadcastf128	80(%r12), %ymm15 // B
617	vmovaps			96(%r11), %ymm13 // A
618
619
620	// unroll 2
621	vfmadd231ps		%ymm12, %ymm14, %ymm0
622	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
623
624	vfmadd231ps		%ymm12, %ymm14, %ymm1
625	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
626
627	vfmadd231ps		%ymm12, %ymm14, %ymm2
628	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
629
630	vfmadd231ps		%ymm12, %ymm14, %ymm3
631	vbroadcastf128	96(%r12), %ymm14 // B
632
633	vfmadd231ps		%ymm12, %ymm15, %ymm4
634	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
635
636	vfmadd231ps		%ymm12, %ymm15, %ymm5
637	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
638
639	vfmadd231ps		%ymm12, %ymm15, %ymm6
640	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
641
642	vfmadd231ps		%ymm12, %ymm15, %ymm7
643	vbroadcastf128	112(%r12), %ymm15 // B
644//	vmovaps			128(%r11), %ymm12 // A
645
646	subl	$4, %r10d
647	addq	$128, %r11
648	addq	$128, %r12
649
650	// unroll 3
651	vfmadd231ps		%ymm13, %ymm14, %ymm0
652	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
653
654	vfmadd231ps		%ymm13, %ymm14, %ymm1
655	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
656
657	vfmadd231ps		%ymm13, %ymm14, %ymm2
658	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
659
660	vfmadd231ps		%ymm13, %ymm14, %ymm3
661//	vbroadcastf128	0(%r12), %ymm14 // B
662
663	vfmadd231ps		%ymm13, %ymm15, %ymm4
664	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
665
666	vfmadd231ps		%ymm13, %ymm15, %ymm5
667	vshufps			$0x4e, %ymm15, %ymm15, %ymm15
668
669	vfmadd231ps		%ymm13, %ymm15, %ymm6
670	vshufps			$0xb1, %ymm15, %ymm15, %ymm15
671
672	vfmadd231ps		%ymm13, %ymm15, %ymm7
673//	vbroadcastf128	16(%r12), %ymm15 // B
674//	vmovaps			32(%r11), %ymm13 // A
675
676
677//	cmpl	$4, %r10d
678	jmp		2f // return
679
680
6814: // consider clean1-up loop
682
683	cmpl	$0, %r10d
684	jle		2f // return
685
686	// clean-up loop
6873: // clean up loop
688
689	// unroll 0
690	vbroadcastf128	0(%r12), %ymm14 // B
691	vmovaps			0(%r11), %ymm12 // A
692	vfmadd231ps		%ymm12, %ymm14, %ymm0
693
694	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
695	vfmadd231ps		%ymm12, %ymm14, %ymm1
696
697	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
698	vfmadd231ps		%ymm12, %ymm14, %ymm2
699
700	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
701	vfmadd231ps		%ymm12, %ymm14, %ymm3
702
703	vbroadcastf128	16(%r12), %ymm14 // B
704	vfmadd231ps		%ymm12, %ymm14, %ymm4
705
706	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
707	vfmadd231ps		%ymm12, %ymm14, %ymm5
708
709	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
710	vfmadd231ps		%ymm12, %ymm14, %ymm6
711
712	subl	$1, %r10d
713	addq	$32, %r11
714	addq	$32, %r12
715
716	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
717	vfmadd231ps		%ymm12, %ymm14, %ymm7
718
719	cmpl	$0, %r10d
720	jg		3b // clean up loop
721
722
7232: // return
724
725#endif
726
727#if MACRO_LEVEL>=2
728	.endm
729#else
730	ret
731
732	FUN_END(inner_kernel_gemm_add_nt_8x8_lib8)
733#endif
734
735
736
737
738
739// common inner routine with file scope
740//
741// input arguments:
742// r10d   <- k
743// r11   <- A
744// r12   <- B
745// ymm0  <- [d00 d11 d22 d33 d40 d51 d62 d73]
746// ymm1  <- [d01 d10 d23 d32 d41 d50 d63 d72]
747// ymm2  <- [d03 d12 d21 d30 d43 d52 d61 d70]
748// ymm3  <- [d02 d13 d20 d31 d42 d53 d60 d71]
749// ymm4  <- []
750// ymm5  <- []
751// ymm6  <- []
752// ymm7  <- []
753// ymm8  <- dirty
754// ymm9  <- dirty
755// ymm10 <- dirty
756// ymm11 <- dirty
757// ymm12 <- dirty
758// ymm13 <- dirty
759// ymm14 <- dirty
760// ymm15 <- dirty
761
762//
763// output arguments:
764// r10d  <- 0
// r11   <- A+8*k*sizeof(float)
// r12   <- B+8*k*sizeof(float)
767// ymm0  <- [d00 d11 d22 d33 d40 d51 d62 d73]
768// ymm1  <- [d01 d10 d23 d32 d41 d50 d63 d72]
769// ymm2  <- [d03 d12 d21 d30 d43 d52 d61 d70]
770// ymm3  <- [d02 d13 d20 d31 d42 d53 d60 d71]
771// ymm4  <- []
772// ymm5  <- []
773// ymm6  <- []
774// ymm7  <- []
775// ymm8  <- dirty
776// ymm9  <- dirty
777// ymm10 <- dirty
778// ymm11 <- dirty
779// ymm12 <- dirty
780// ymm13 <- dirty
781// ymm14 <- dirty
782// ymm15 <- dirty
783
784#if MACRO_LEVEL>=2
785	.macro INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
786#else
787	.p2align 4,,15
788	FUN_START(inner_kernel_gemm_sub_nt_8x8_lib8)
789#endif
790
791	cmpl	$0, %r10d
792	jle		2f // return
793
794	// preload
795
796	cmpl	$3, %r10d
797	jle		4f // consider clean-up loop
798
799	// main loop
800	.p2align 3
8011: // main loop
802
803	// unroll 0
804	vmovaps			0(%r11), %ymm13 // A
805	vbroadcastss	0(%r12), %ymm12 // B
806	vfnmadd231ps	%ymm13, %ymm12, %ymm0
807	vbroadcastss	4(%r12), %ymm12 // B
808	vfnmadd231ps	%ymm13, %ymm12, %ymm1
809	vbroadcastss	8(%r12), %ymm12 // B
810	vfnmadd231ps	%ymm13, %ymm12, %ymm2
811	vbroadcastss	12(%r12), %ymm12 // B
812	vfnmadd231ps	%ymm13, %ymm12, %ymm3
813	vbroadcastss	16(%r12), %ymm12 // B
814	vfnmadd231ps	%ymm13, %ymm12, %ymm4
815	vbroadcastss	20(%r12), %ymm12 // B
816	vfnmadd231ps	%ymm13, %ymm12, %ymm5
817	vbroadcastss	24(%r12), %ymm12 // B
818	vfnmadd231ps	%ymm13, %ymm12, %ymm6
819	vbroadcastss	28(%r12), %ymm12 // B
820	vfnmadd231ps	%ymm13, %ymm12, %ymm7
821	subl	$4, %r10d
822
	// unroll 1
824	vmovaps			32(%r11), %ymm13 // A
825	vbroadcastss	32(%r12), %ymm12 // B
826	vfnmadd231ps	%ymm13, %ymm12, %ymm0
827	vbroadcastss	36(%r12), %ymm12 // B
828	vfnmadd231ps	%ymm13, %ymm12, %ymm1
829	vbroadcastss	40(%r12), %ymm12 // B
830	vfnmadd231ps	%ymm13, %ymm12, %ymm2
831	vbroadcastss	44(%r12), %ymm12 // B
832	vfnmadd231ps	%ymm13, %ymm12, %ymm3
833	vbroadcastss	48(%r12), %ymm12 // B
834	vfnmadd231ps	%ymm13, %ymm12, %ymm4
835	vbroadcastss	52(%r12), %ymm12 // B
836	vfnmadd231ps	%ymm13, %ymm12, %ymm5
837	vbroadcastss	56(%r12), %ymm12 // B
838	vfnmadd231ps	%ymm13, %ymm12, %ymm6
839	vbroadcastss	60(%r12), %ymm12 // B
840	vfnmadd231ps	%ymm13, %ymm12, %ymm7
841	addq	$128, %r11
842
	// unroll 2
844	vmovaps			-64(%r11), %ymm13 // A
845	vbroadcastss	64(%r12), %ymm12 // B
846	vfnmadd231ps	%ymm13, %ymm12, %ymm0
847	vbroadcastss	68(%r12), %ymm12 // B
848	vfnmadd231ps	%ymm13, %ymm12, %ymm1
849	vbroadcastss	72(%r12), %ymm12 // B
850	vfnmadd231ps	%ymm13, %ymm12, %ymm2
851	vbroadcastss	76(%r12), %ymm12 // B
852	vfnmadd231ps	%ymm13, %ymm12, %ymm3
853	vbroadcastss	80(%r12), %ymm12 // B
854	vfnmadd231ps	%ymm13, %ymm12, %ymm4
855	vbroadcastss	84(%r12), %ymm12 // B
856	vfnmadd231ps	%ymm13, %ymm12, %ymm5
857	vbroadcastss	88(%r12), %ymm12 // B
858	vfnmadd231ps	%ymm13, %ymm12, %ymm6
859	vbroadcastss	92(%r12), %ymm12 // B
860	vfnmadd231ps	%ymm13, %ymm12, %ymm7
861	addq	$128, %r12
862
	// unroll 3
864	vmovaps			-32(%r11), %ymm13 // A
865	vbroadcastss	-32(%r12), %ymm12 // B
866	vfnmadd231ps	%ymm13, %ymm12, %ymm0
867	vbroadcastss	-28(%r12), %ymm12 // B
868	vfnmadd231ps	%ymm13, %ymm12, %ymm1
869	vbroadcastss	-24(%r12), %ymm12 // B
870	vfnmadd231ps	%ymm13, %ymm12, %ymm2
871	vbroadcastss	-20(%r12), %ymm12 // B
872	vfnmadd231ps	%ymm13, %ymm12, %ymm3
873	vbroadcastss	-16(%r12), %ymm12 // B
874	vfnmadd231ps	%ymm13, %ymm12, %ymm4
875	vbroadcastss	-12(%r12), %ymm12 // B
876	vfnmadd231ps	%ymm13, %ymm12, %ymm5
877	vbroadcastss	-8(%r12), %ymm12 // B
878	vfnmadd231ps	%ymm13, %ymm12, %ymm6
879	vbroadcastss	-4(%r12), %ymm12 // B
880	vfnmadd231ps	%ymm13, %ymm12, %ymm7
881
882	cmpl	$4, %r10d
883	jg		1b // main loop
884
885
8864: // consider clean1-up loop
887
888	cmpl	$0, %r10d
889	jle		2f // return
890
891	// clean-up loop
8923: // clean up loop
893
894	// unroll 0
895	vmovaps			0(%r11), %ymm13 // a
896	vbroadcastss	0(%r12), %ymm12 // b
897	vfnmadd231ps	%ymm13, %ymm12, %ymm0
898	vbroadcastss	4(%r12), %ymm12 // b
899	vfnmadd231ps	%ymm13, %ymm12, %ymm1
900	subl	$1, %r10d
901	vbroadcastss	8(%r12), %ymm12 // b
902	vfnmadd231ps	%ymm13, %ymm12, %ymm2
903	addq	$32, %r11
904	vbroadcastss	12(%r12), %ymm12 // b
905	vfnmadd231ps	%ymm13, %ymm12, %ymm3
906	vbroadcastss	16(%r12), %ymm12 // B
907	vfnmadd231ps	%ymm13, %ymm12, %ymm4
908	vbroadcastss	20(%r12), %ymm12 // B
909	vfnmadd231ps	%ymm13, %ymm12, %ymm5
910	vbroadcastss	24(%r12), %ymm12 // B
911	vfnmadd231ps	%ymm13, %ymm12, %ymm6
912	vbroadcastss	28(%r12), %ymm12 // B
913	vfnmadd231ps	%ymm13, %ymm12, %ymm7
914	addq	$32, %r12
915
916	cmpl	$0, %r10d
917	jg		3b // clean up loop
918
919
9202: // return
921
922#if MACRO_LEVEL>=2
923	.endm
924#else
925	ret
926
927	FUN_END(inner_kernel_gemm_sub_nt_8x8_lib8)
928#endif
929
930
931
932
933
934// common inner routine with file scope
935//
936// input arguments:
937// r10d  <- k
938// r11   <- A
939// r12   <- B
// r13   <- 8*sdb*sizeof(float)
// r14   <- dirty
942// ymm0  <- []
943// ymm1  <- []
944// ymm2  <- []
945// ymm3  <- []
946// ymm8  <- dirty
947// ymm9  <- dirty
948// ymm10 <- dirty
949// ymm11 <- dirty
950// ymm12 <- dirty
951// ymm13 <- dirty
952// ymm14 <- dirty
953// ymm15 <- dirty
954
955//
956// output arguments:
957// r10d  <- 0
// r11   <- A+8*k*sizeof(float)
// r12   <- B+(k/8)*8*sdb*sizeof(float)+(k%8)*sizeof(float)
// r13   <- 8*sdb*sizeof(float)
// r14   <- dirty
962// ymm0  <- []
963// ymm1  <- []
964// ymm2  <- []
965// ymm3  <- []
966// ymm8  <- dirty
967// ymm9  <- dirty
968// ymm10 <- dirty
969// ymm11 <- dirty
970// ymm12 <- dirty
971// ymm13 <- dirty
972// ymm14 <- dirty
973// ymm15 <- dirty
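//
// in plain C, the accumulation below is roughly (reference sketch, assuming the
// lib8 panel-major layout with bs=8; B is traversed one 8x8 block at a time,
// each block 8*sdb floats apart):
//
//   for(kk=0; kk<k; kk++)
//     for(jj=0; jj<8; jj++)
//       for(ii=0; ii<8; ii++)
//         D[ii][jj] += A[ii+8*kk] * B[(kk%8) + 8*jj + (kk/8)*8*sdb];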
974
975#if MACRO_LEVEL>=2
976	.macro INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
977#else
978	.p2align 4,,15
979	FUN_START(inner_kernel_gemm_add_nn_8x8_lib8)
980#endif
981
982	cmpl	$0, %r10d
983	jle		2f // return
984
985	cmpl	$8, %r10d
986	jl		0f // consider clean-up loop
987
988	// main loop
989	.p2align 3
9901: // main loop
991
992//	prefetcht0	0(%r12, %r13, 1) // software prefetch
993//	prefetcht0	64(%r12, %r13, 1) // software prefetch
994//	prefetcht0	128(%r12, %r13, 1) // software prefetch
995//	prefetcht0	192(%r12, %r13, 1) // software prefetch
996
997	// unroll 0
998	vmovaps			0(%r11), %ymm12 // A[0]
999	vbroadcastss	0(%r12), %ymm13 // B[0]
1000	vfmadd231ps		%ymm12, %ymm13, %ymm0
1001	vbroadcastss	32(%r12), %ymm13 // B[1]
1002	vfmadd231ps		%ymm12, %ymm13, %ymm1
1003	vbroadcastss	64(%r12), %ymm13 // B[2]
1004	vfmadd231ps		%ymm12, %ymm13, %ymm2
1005	vbroadcastss	96(%r12), %ymm13 // B[3]
1006	vfmadd231ps		%ymm12, %ymm13, %ymm3
1007	vbroadcastss	128(%r12), %ymm13 // B[4]
1008	vfmadd231ps		%ymm12, %ymm13, %ymm4
1009	vbroadcastss	160(%r12), %ymm13 // B[5]
1010	vfmadd231ps		%ymm12, %ymm13, %ymm5
1011	vbroadcastss	192(%r12), %ymm13 // B[6]
1012	vfmadd231ps		%ymm12, %ymm13, %ymm6
1013	vbroadcastss	224(%r12), %ymm13 // B[7]
1014	vfmadd231ps		%ymm12, %ymm13, %ymm7
1015
1016	// unroll 1
1017	vmovaps			32(%r11), %ymm12 // A[0]
1018	vbroadcastss	4(%r12), %ymm13 // B[0]
1019	vfmadd231ps		%ymm12, %ymm13, %ymm0
1020	vbroadcastss	36(%r12), %ymm13 // B[1]
1021	vfmadd231ps		%ymm12, %ymm13, %ymm1
1022	vbroadcastss	68(%r12), %ymm13 // B[2]
1023	vfmadd231ps		%ymm12, %ymm13, %ymm2
1024	vbroadcastss	100(%r12), %ymm13 // B[3]
1025	vfmadd231ps		%ymm12, %ymm13, %ymm3
1026	vbroadcastss	132(%r12), %ymm13 // B[4]
1027	vfmadd231ps		%ymm12, %ymm13, %ymm4
1028	vbroadcastss	164(%r12), %ymm13 // B[5]
1029	vfmadd231ps		%ymm12, %ymm13, %ymm5
1030	vbroadcastss	196(%r12), %ymm13 // B[6]
1031	vfmadd231ps		%ymm12, %ymm13, %ymm6
1032	vbroadcastss	228(%r12), %ymm13 // B[7]
1033	vfmadd231ps		%ymm12, %ymm13, %ymm7
1034
1035	// unroll 2
1036	vmovaps			64(%r11), %ymm12 // A[0]
1037	vbroadcastss	8(%r12), %ymm13 // B[0]
1038	vfmadd231ps		%ymm12, %ymm13, %ymm0
1039	vbroadcastss	40(%r12), %ymm13 // B[1]
1040	vfmadd231ps		%ymm12, %ymm13, %ymm1
1041	vbroadcastss	72(%r12), %ymm13 // B[2]
1042	vfmadd231ps		%ymm12, %ymm13, %ymm2
1043	vbroadcastss	104(%r12), %ymm13 // B[3]
1044	vfmadd231ps		%ymm12, %ymm13, %ymm3
1045	vbroadcastss	136(%r12), %ymm13 // B[4]
1046	vfmadd231ps		%ymm12, %ymm13, %ymm4
1047	vbroadcastss	168(%r12), %ymm13 // B[5]
1048	vfmadd231ps		%ymm12, %ymm13, %ymm5
1049	vbroadcastss	200(%r12), %ymm13 // B[6]
1050	vfmadd231ps		%ymm12, %ymm13, %ymm6
1051	vbroadcastss	232(%r12), %ymm13 // B[7]
1052	vfmadd231ps		%ymm12, %ymm13, %ymm7
1053
1054	// unroll 3
1055	vmovaps			96(%r11), %ymm12 // A[0]
1056	vbroadcastss	12(%r12), %ymm13 // B[0]
1057	vfmadd231ps		%ymm12, %ymm13, %ymm0
1058	vbroadcastss	44(%r12), %ymm13 // B[1]
1059	vfmadd231ps		%ymm12, %ymm13, %ymm1
1060	vbroadcastss	76(%r12), %ymm13 // B[2]
1061	vfmadd231ps		%ymm12, %ymm13, %ymm2
1062	vbroadcastss	108(%r12), %ymm13 // B[3]
1063	vfmadd231ps		%ymm12, %ymm13, %ymm3
1064	vbroadcastss	140(%r12), %ymm13 // B[4]
1065	vfmadd231ps		%ymm12, %ymm13, %ymm4
1066	vbroadcastss	172(%r12), %ymm13 // B[5]
1067	vfmadd231ps		%ymm12, %ymm13, %ymm5
1068	vbroadcastss	204(%r12), %ymm13 // B[6]
1069	vfmadd231ps		%ymm12, %ymm13, %ymm6
1070	vbroadcastss	236(%r12), %ymm13 // B[7]
1071	vfmadd231ps		%ymm12, %ymm13, %ymm7
1072
1073	// unroll 4
1074	vmovaps			128(%r11), %ymm12 // A[0]
1075	vbroadcastss	16(%r12), %ymm13 // B[0]
1076	vfmadd231ps		%ymm12, %ymm13, %ymm0
1077	vbroadcastss	48(%r12), %ymm13 // B[1]
1078	vfmadd231ps		%ymm12, %ymm13, %ymm1
1079	vbroadcastss	80(%r12), %ymm13 // B[2]
1080	vfmadd231ps		%ymm12, %ymm13, %ymm2
1081	vbroadcastss	112(%r12), %ymm13 // B[3]
1082	vfmadd231ps		%ymm12, %ymm13, %ymm3
1083	vbroadcastss	144(%r12), %ymm13 // B[4]
1084	vfmadd231ps		%ymm12, %ymm13, %ymm4
1085	vbroadcastss	176(%r12), %ymm13 // B[5]
1086	vfmadd231ps		%ymm12, %ymm13, %ymm5
1087	vbroadcastss	208(%r12), %ymm13 // B[6]
1088	vfmadd231ps		%ymm12, %ymm13, %ymm6
1089	vbroadcastss	240(%r12), %ymm13 // B[7]
1090	vfmadd231ps		%ymm12, %ymm13, %ymm7
1091
1092	// unroll 5
1093	vmovaps			160(%r11), %ymm12 // A[0]
1094	vbroadcastss	20(%r12), %ymm13 // B[0]
1095	vfmadd231ps		%ymm12, %ymm13, %ymm0
1096	vbroadcastss	52(%r12), %ymm13 // B[1]
1097	vfmadd231ps		%ymm12, %ymm13, %ymm1
1098	vbroadcastss	84(%r12), %ymm13 // B[2]
1099	vfmadd231ps		%ymm12, %ymm13, %ymm2
1100	vbroadcastss	116(%r12), %ymm13 // B[3]
1101	vfmadd231ps		%ymm12, %ymm13, %ymm3
1102	vbroadcastss	148(%r12), %ymm13 // B[4]
1103	vfmadd231ps		%ymm12, %ymm13, %ymm4
1104	vbroadcastss	180(%r12), %ymm13 // B[5]
1105	vfmadd231ps		%ymm12, %ymm13, %ymm5
1106	vbroadcastss	212(%r12), %ymm13 // B[6]
1107	vfmadd231ps		%ymm12, %ymm13, %ymm6
1108	vbroadcastss	244(%r12), %ymm13 // B[7]
1109	vfmadd231ps		%ymm12, %ymm13, %ymm7
1110	subl	$8, %r10d
1111
1112	// unroll 6
1113	vmovaps			192(%r11), %ymm12 // A[0]
1114	vbroadcastss	24(%r12), %ymm13 // B[0]
1115	vfmadd231ps		%ymm12, %ymm13, %ymm0
1116	vbroadcastss	56(%r12), %ymm13 // B[1]
1117	vfmadd231ps		%ymm12, %ymm13, %ymm1
1118	vbroadcastss	88(%r12), %ymm13 // B[2]
1119	vfmadd231ps		%ymm12, %ymm13, %ymm2
1120	vbroadcastss	120(%r12), %ymm13 // B[3]
1121	vfmadd231ps		%ymm12, %ymm13, %ymm3
1122	vbroadcastss	152(%r12), %ymm13 // B[4]
1123	vfmadd231ps		%ymm12, %ymm13, %ymm4
1124	vbroadcastss	184(%r12), %ymm13 // B[5]
1125	vfmadd231ps		%ymm12, %ymm13, %ymm5
1126	vbroadcastss	216(%r12), %ymm13 // B[6]
1127	vfmadd231ps		%ymm12, %ymm13, %ymm6
1128	vbroadcastss	248(%r12), %ymm13 // B[7]
1129	vfmadd231ps		%ymm12, %ymm13, %ymm7
1130	addq	$256, %r11
1131
1132	// unroll 7
1133	vmovaps			-32(%r11), %ymm12 // A[0]
1134	vbroadcastss	28(%r12), %ymm13 // B[0]
1135	vfmadd231ps		%ymm12, %ymm13, %ymm0
1136	vbroadcastss	60(%r12), %ymm13 // B[1]
1137	vfmadd231ps		%ymm12, %ymm13, %ymm1
1138	vbroadcastss	92(%r12), %ymm13 // B[2]
1139	vfmadd231ps		%ymm12, %ymm13, %ymm2
1140	vbroadcastss	124(%r12), %ymm13 // B[3]
1141	vfmadd231ps		%ymm12, %ymm13, %ymm3
1142	vbroadcastss	156(%r12), %ymm13 // B[4]
1143	vfmadd231ps		%ymm12, %ymm13, %ymm4
1144	vbroadcastss	188(%r12), %ymm13 // B[5]
1145	vfmadd231ps		%ymm12, %ymm13, %ymm5
1146	vbroadcastss	220(%r12), %ymm13 // B[6]
1147	vfmadd231ps		%ymm12, %ymm13, %ymm6
1148	vbroadcastss	252(%r12), %ymm13 // B[7]
	addq	%r13, %r12 // B+8*sdb*sizeof(float)
1150	vfmadd231ps		%ymm12, %ymm13, %ymm7
1151
1152	cmpl	$7, %r10d
1153	jg		1b // main loop
1154
1155
11560: // consider clean1-up loop
1157
1158	cmpl	$0, %r10d
1159	jle		2f // return
1160
11613: // clean1-up loop
1162
1163	// unroll 0
1164	vmovaps			0(%r11), %ymm12 // A[0]
1165	vbroadcastss	0(%r12), %ymm13 // B[0]
1166	vfmadd231ps		%ymm12, %ymm13, %ymm0
1167	vbroadcastss	32(%r12), %ymm13 // B[1]
1168	vfmadd231ps		%ymm12, %ymm13, %ymm1
1169	vbroadcastss	64(%r12), %ymm13 // B[2]
1170	vfmadd231ps		%ymm12, %ymm13, %ymm2
1171	vbroadcastss	96(%r12), %ymm13 // B[3]
1172	vfmadd231ps		%ymm12, %ymm13, %ymm3
1173	vbroadcastss	128(%r12), %ymm13 // B[4]
1174	vfmadd231ps		%ymm12, %ymm13, %ymm4
1175	vbroadcastss	160(%r12), %ymm13 // B[5]
1176	vfmadd231ps		%ymm12, %ymm13, %ymm5
1177	vbroadcastss	192(%r12), %ymm13 // B[6]
1178	vfmadd231ps		%ymm12, %ymm13, %ymm6
1179	vbroadcastss	224(%r12), %ymm13 // B[7]
1180	vfmadd231ps		%ymm12, %ymm13, %ymm7
1181
1182	subl	$1, %r10d
1183	addq	$32, %r11
1184	addq	$4, %r12
1185
1186	cmpl	$0, %r10d
1187	jg		3b // clean up loop
1188
1189
11902: // return
1191
1192#if MACRO_LEVEL>=2
1193	.endm
1194#else
1195	ret
1196
1197	FUN_END(inner_kernel_gemm_add_nn_8x8_lib8)
1198#endif
1199
1200
1201
1202
1203
1204// common inner routine with file scope
1205//
1206// edge for B unaligned
1207//
1208// input arguments:
1209// r10   <- k
1210// r11   <- A
1211// r12   <- B
// r13   <- bs*sdb*sizeof(float)
1213// r14   <- offB
1214// ymm0  <- []
1215// ymm1  <- []
1216// ymm2  <- []
1217// ymm3  <- []
1218// ymm8  <- dirty
1219// ymm12 <- dirty
1220// ymm15 <- dirty
1221
1222//
1223// output arguments:
// r10   <- k-(8-offB)
// r11   <- A+(8-offB)*bs*sizeof(float)
// r12   <- B-offB+bs*sdb*sizeof(float)
// r13   <- bs*sdb*sizeof(float)
1228// r14   <- offB
1229// ymm0  <- []
1230// ymm1  <- []
1231// ymm2  <- []
1232// ymm3  <- []
1233// ymm8  <- dirty
1234// ymm12 <- dirty
1235// ymm15 <- dirty
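//
// the edge below consumes the first min(k, 8-offB) iterations one at a time,
// so that on exit B points to the start of its next panel row and the main NN
// kernel can proceed with offset 0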
1236
1237
1238#if MACRO_LEVEL>=1
1239	.macro INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
1240#else
1241	.p2align 4,,15
1242	FUN_START(inner_edge_gemm_add_nn_8x8_lib8)
1243#endif
1244
1245	cmpl			$0, %r14d // offset==0
1246	jle				2f // end
1247
1248	cmpl			$0, %r10d // k==0
1249	jle				2f // end
1250
1251	movl			$8, %ebx
1252	subl			%r14d, %ebx // 8-offsetB
1253	cmpl			%r10d, %ebx
1254//	jle				0f
1255//	movl			%r10d, %ebx // kend=min(k,8-offsetB)
1256//0:
1257	cmovgl			%r10d, %ebx // kend=min(k,8-offsetB)
1258
1259	movl			%r14d, %eax
1260	sall			$2, %eax // offsetB*sizeof(float)
1261	addq			%rax, %r12 // B+offsetB*sizeof(float)
1262
1:
	// unroll 0
1264	vmovaps			0(%r11), %ymm12 // A[0]
1265	vbroadcastss	0(%r12), %ymm13 // B[0]
1266	vfmadd231ps		%ymm12, %ymm13, %ymm0
1267	vbroadcastss	32(%r12), %ymm13 // B[1]
1268	vfmadd231ps		%ymm12, %ymm13, %ymm1
1269	vbroadcastss	64(%r12), %ymm13 // B[2]
1270	vfmadd231ps		%ymm12, %ymm13, %ymm2
1271	vbroadcastss	96(%r12), %ymm13 // B[3]
1272	vfmadd231ps		%ymm12, %ymm13, %ymm3
1273	vbroadcastss	128(%r12), %ymm13 // B[4]
1274	vfmadd231ps		%ymm12, %ymm13, %ymm4
1275	vbroadcastss	160(%r12), %ymm13 // B[5]
1276	vfmadd231ps		%ymm12, %ymm13, %ymm5
1277	vbroadcastss	192(%r12), %ymm13 // B[6]
1278	vfmadd231ps		%ymm12, %ymm13, %ymm6
1279	vbroadcastss	224(%r12), %ymm13 // B[7]
1280	vfmadd231ps		%ymm12, %ymm13, %ymm7
1281
1282	subl			$1, %r10d // k-1
1283	subl			$1, %ebx // kend-1
1284	addq			$32, %r11 // A+1*bs*sizeof(float)
1285	addq			$4, %r12 // B+1*sizeof(float)
1286
1287	cmpl			$0, %ebx
1288	jg				1b
1289
1290	cmpl			$0, %r10d
1291	jle				2f // end
1292
1293	addq			%r13, %r12
1294	subq			$32, %r12 // B+bs*(sdb-1)*sizeof(float)
1295
12962:
1297
1298#if MACRO_LEVEL>=1
1299	.endm
1300#else
1301	ret
1302
1303	FUN_END(inner_edge_gemm_add_nn_8x8_lib8)
1304#endif
1305
1306
1307
1308
1309
1310// common inner routine with file scope
1311//
1312// strsm
1313// right
1314// lower
1315// transposed
1316// not-unit
1317//
1318// input arguments:
1319// r10  <- D
1320// r11  <- inv_diag_D
1321// r12d <- kn
1322// ymm0 <- []
1323// ymm1 <- []
1324// ymm2 <- []
1325// ymm3 <- []
1326// ymm4 <- []
1327// ymm5 <- []
1328// ymm6 <- []
1329// ymm7 <- []
1330// ymm12 <- dirty
1331// ymm13 <- dirty
1332// ymm14 <- dirty
1333// ymm15 <- dirty
1334//
1335// output arguments:
1336// r10  <- D
1337// r11  <- inv_diag_D
1338// r12d <- kn
1339// ymm0 <- []
1340// ymm1 <- []
1341// ymm2 <- []
1342// ymm3 <- []
1343// ymm4 <- []
1344// ymm5 <- []
1345// ymm6 <- []
1346// ymm7 <- []
1347// ymm12 <- dirty
1348// ymm13 <- dirty
1349// ymm14 <- dirty
1350// ymm15 <- dirty
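//
// column-by-column forward substitution, roughly (reference sketch; the 8x8
// lower-triangular factor is stored column-major at r10, its inverted diagonal
// at r11, and ymm0..ymm7 hold the 8 columns being solved in place):
//
//   for(jj=0; jj<n; jj++) {               // n = min(kn, 8)
//     col[jj] *= inv_diag[jj];
//     for(ii=jj+1; ii<8; ii++)
//       col[ii] -= L[ii][jj] * col[jj];   // L[ii][jj] read at 4*(ii+8*jj)(%r10)
//   }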
1351
1352#if MACRO_LEVEL>=1
1353	.macro INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
1354#else
1355	.p2align 4,,15
1356	FUN_START(inner_edge_trsm_rlt_inv_8x8_vs_lib8)
1357#endif
1358
1359	vbroadcastss	0(%r11), %ymm13
1360	vmulps			%ymm0, %ymm13, %ymm0
1361	vbroadcastss	4(%r10), %ymm13
1362	vfnmadd231ps	%ymm0, %ymm13, %ymm1
1363	vbroadcastss	8(%r10), %ymm13
1364	vfnmadd231ps	%ymm0, %ymm13, %ymm2
1365	vbroadcastss	12(%r10), %ymm13
1366	vfnmadd231ps	%ymm0, %ymm13, %ymm3
1367	vbroadcastss	16(%r10), %ymm13
1368	vfnmadd231ps	%ymm0, %ymm13, %ymm4
1369	vbroadcastss	20(%r10), %ymm13
1370	vfnmadd231ps	%ymm0, %ymm13, %ymm5
1371	vbroadcastss	24(%r10), %ymm13
1372	vfnmadd231ps	%ymm0, %ymm13, %ymm6
1373	vbroadcastss	28(%r10), %ymm13
1374	vfnmadd231ps	%ymm0, %ymm13, %ymm7
1375
1376	vbroadcastss	4(%r11), %ymm13
1377	vmulps			%ymm1, %ymm13, %ymm1
1378	vbroadcastss	40(%r10), %ymm13
1379	vfnmadd231ps	%ymm1, %ymm13, %ymm2
1380	vbroadcastss	44(%r10), %ymm13
1381	vfnmadd231ps	%ymm1, %ymm13, %ymm3
1382	vbroadcastss	48(%r10), %ymm13
1383	vfnmadd231ps	%ymm1, %ymm13, %ymm4
1384	vbroadcastss	52(%r10), %ymm13
1385	vfnmadd231ps	%ymm1, %ymm13, %ymm5
1386	vbroadcastss	56(%r10), %ymm13
1387	vfnmadd231ps	%ymm1, %ymm13, %ymm6
1388	vbroadcastss	60(%r10), %ymm13
1389	vfnmadd231ps	%ymm1, %ymm13, %ymm7
1390
1391	vbroadcastss	8(%r11), %ymm13
1392	vmulps			%ymm2, %ymm13, %ymm2
1393	vbroadcastss	76(%r10), %ymm13
1394	vfnmadd231ps	%ymm2, %ymm13, %ymm3
1395	vbroadcastss	80(%r10), %ymm13
1396	vfnmadd231ps	%ymm2, %ymm13, %ymm4
1397	vbroadcastss	84(%r10), %ymm13
1398	vfnmadd231ps	%ymm2, %ymm13, %ymm5
1399	vbroadcastss	88(%r10), %ymm13
1400	vfnmadd231ps	%ymm2, %ymm13, %ymm6
1401	vbroadcastss	92(%r10), %ymm13
1402	vfnmadd231ps	%ymm2, %ymm13, %ymm7
1403
1404	vbroadcastss	12(%r11), %ymm13
1405	vmulps			%ymm3, %ymm13, %ymm3
1406	vbroadcastss	112(%r10), %ymm13
1407	vfnmadd231ps	%ymm3, %ymm13, %ymm4
1408	vbroadcastss	116(%r10), %ymm13
1409	vfnmadd231ps	%ymm3, %ymm13, %ymm5
1410	vbroadcastss	120(%r10), %ymm13
1411	vfnmadd231ps	%ymm3, %ymm13, %ymm6
1412	vbroadcastss	124(%r10), %ymm13
1413	vfnmadd231ps	%ymm3, %ymm13, %ymm7
1414
1415	vbroadcastss	16(%r11), %ymm13
1416	vmulps			%ymm4, %ymm13, %ymm4
1417	cmpl			$6, %r12d
1418	jl				0f // ret
1419	vbroadcastss	148(%r10), %ymm13
1420	vfnmadd231ps	%ymm4, %ymm13, %ymm5
1421	vbroadcastss	152(%r10), %ymm13
1422	vfnmadd231ps	%ymm4, %ymm13, %ymm6
1423	vbroadcastss	156(%r10), %ymm13
1424	vfnmadd231ps	%ymm4, %ymm13, %ymm7
1425
1426	vbroadcastss	20(%r11), %ymm13
1427	vmulps			%ymm5, %ymm13, %ymm5
1428	cmpl			$7, %r12d
1429	jl				0f // ret
1430	vbroadcastss	184(%r10), %ymm13
1431	vfnmadd231ps	%ymm5, %ymm13, %ymm6
1432	vbroadcastss	188(%r10), %ymm13
1433	vfnmadd231ps	%ymm5, %ymm13, %ymm7
1434
1435	vbroadcastss	24(%r11), %ymm13
1436	vmulps			%ymm6, %ymm13, %ymm6
1437	cmpl			$8, %r12d
1438	jl				0f // ret
1439	vbroadcastss	220(%r10), %ymm13
1440	vfnmadd231ps	%ymm6, %ymm13, %ymm7
1441
1442	vbroadcastss	28(%r11), %ymm13
1443	vmulps			%ymm7, %ymm13, %ymm7
1444
14450:
1446
1447#if MACRO_LEVEL>=1
1448	.endm
1449#else
1450	ret
1451
1452	FUN_END(inner_edge_trsm_rlt_inv_8x8_vs_lib8)
1453#endif
1454
1455
1456
1457
1458
1459// common inner routine with file scope
1460//
1461// cholesky factorization
1462//
1463// input arguments:
1464// r10  <- inv_diag_E
1465// r11d <- kn
1466// ymm0 <- []
1467// ymm1 <- []
1468// ymm2 <- []
1469// ymm3 <- []
1470// ymm4 <- []
1471// ymm5 <- []
1472// ymm6 <- []
1473// ymm7 <- []
1474// ymm12 <- dirty
1475// ymm13 <- dirty
1476// ymm14 <- dirty
1477// ymm15 <- dirty
1478//
1479// output arguments:
1480// r10  <- inv_diag_E
1481// r11d <- kn
1482// ymm0 <- []
1483// ymm1 <- []
1484// ymm2 <- []
1485// ymm3 <- []
1486// ymm4 <- []
1487// ymm5 <- []
1488// ymm6 <- []
1489// ymm7 <- []
1490// ymm12 <- dirty
1491// ymm13 <- dirty
1492// ymm14 <- dirty
1493// ymm15 <- dirty
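//
// right-looking Cholesky on the 8x8 block held in ymm0..ymm7 (one column per
// register), roughly (reference sketch; inv_diag_E receives 1/sqrt of each
// pivot, and a non-positive pivot produces a zero column):
//
//   for(jj=0; jj<n; jj++) {               // n = min(kn, 8)
//     tmp = col[jj][jj];
//     tmp = tmp>0.0 ? 1.0/sqrt(tmp) : 0.0;
//     inv_diag_E[jj] = tmp;
//     col[jj] *= tmp;
//     for(ii=jj+1; ii<8; ii++)
//       col[ii] -= col[jj][ii] * col[jj];
//   }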
1494
1495#if MACRO_LEVEL>=1
1496	.macro INNER_EDGE_POTRF_8X8_VS_LIB8
1497#else
1498	.p2align 4,,15
1499	FUN_START(inner_edge_potrf_8x8_vs_lib8)
1500#endif
1501
1502	vxorps	%ymm15, %ymm15, %ymm15 // 0.0
1503#if defined(OS_LINUX) | defined(OS_WINDOWS)
1504	vmovss	.LC03(%rip), %xmm14 // 1.0
1505#elif defined(OS_MAC)
1506	vmovss	LC03(%rip), %xmm14 // 1.0
1507#endif
1508
1509	vmovss			%xmm0, %xmm0, %xmm13
1510	vucomiss		%xmm15, %xmm13 // d_00 > 0.0 ?
1511	jbe			1f
1512	vsqrtss			%xmm13, %xmm13, %xmm13
1513	vdivss			%xmm13, %xmm14, %xmm13
15142:
1515	vmovss			%xmm13, 0(%r10)
1516	vbroadcastss	%xmm13, %ymm13
1517//	vpermilps		$0x00, %xmm13, %xmm13
1518//	vinsertf128		$0x1, %xmm13, %ymm13, %ymm13
1519	vmulps			%ymm0, %ymm13, %ymm0
1520	vperm2f128		$0x00, %ymm0, %ymm0, %ymm11
1521	vpermilps		$0x55, %ymm11, %ymm13
1522	vfnmadd231ps	%ymm0, %ymm13, %ymm1
1523	vpermilps		$0xaa, %ymm11, %ymm13
1524	vfnmadd231ps	%ymm0, %ymm13, %ymm2
1525	vpermilps		$0xff, %ymm11, %ymm13
1526	vfnmadd231ps	%ymm0, %ymm13, %ymm3
1527	vperm2f128		$0x11, %ymm0, %ymm0, %ymm11
1528	vpermilps		$0x00, %ymm11, %ymm13
1529	vfnmadd231ps	%ymm0, %ymm13, %ymm4
1530	vpermilps		$0x55, %ymm11, %ymm13
1531	vfnmadd231ps	%ymm0, %ymm13, %ymm5
1532	vpermilps		$0xaa, %ymm11, %ymm13
1533	vfnmadd231ps	%ymm0, %ymm13, %ymm6
1534	vpermilps		$0xff, %ymm11, %ymm13
1535	vfnmadd231ps	%ymm0, %ymm13, %ymm7
1536
1537
1538	vpermilps		$0x55, %xmm1, %xmm13
1539	vucomiss		%xmm15, %xmm13 // d_11 > 0.0 ?
1540	jbe			3f
1541	vsqrtss			%xmm13, %xmm13, %xmm13
1542	vdivss			%xmm13, %xmm14, %xmm13
15434:
1544	vmovss			%xmm13, 4(%r10)
1545	vbroadcastss	%xmm13, %ymm13
1546	vmulps			%ymm1, %ymm13, %ymm1
1547	vperm2f128		$0x00, %ymm1, %ymm1, %ymm11
1548	vpermilps		$0xaa, %ymm11, %ymm13
1549	vfnmadd231ps	%ymm1, %ymm13, %ymm2
1550	vpermilps		$0xff, %ymm11, %ymm13
1551	vfnmadd231ps	%ymm1, %ymm13, %ymm3
1552	vperm2f128		$0x11, %ymm1, %ymm1, %ymm11
1553	vpermilps		$0x00, %ymm11, %ymm13
1554	vfnmadd231ps	%ymm1, %ymm13, %ymm4
1555	vpermilps		$0x55, %ymm11, %ymm13
1556	vfnmadd231ps	%ymm1, %ymm13, %ymm5
1557	vpermilps		$0xaa, %ymm11, %ymm13
1558	vfnmadd231ps	%ymm1, %ymm13, %ymm6
1559	vpermilps		$0xff, %ymm11, %ymm13
1560	vfnmadd231ps	%ymm1, %ymm13, %ymm7
1561
1562
1563	vpermilps		$0xaa, %xmm2, %xmm13
1564	vucomiss		%xmm15, %xmm13 // d_22 > 0.0 ?
1565	jbe			5f
1566	vsqrtss			%xmm13, %xmm13, %xmm13
1567	vdivss			%xmm13, %xmm14, %xmm13
15686:
1569	vmovss			%xmm13, 8(%r10)
1570	vbroadcastss	%xmm13, %ymm13
1571	vmulps			%ymm2, %ymm13, %ymm2
1572	vperm2f128		$0x00, %ymm2, %ymm2, %ymm11
1573	vpermilps		$0xff, %ymm11, %ymm13
1574	vfnmadd231ps	%ymm2, %ymm13, %ymm3
1575	vperm2f128		$0x11, %ymm2, %ymm2, %ymm11
1576	vpermilps		$0x00, %ymm11, %ymm13
1577	vfnmadd231ps	%ymm2, %ymm13, %ymm4
1578	vpermilps		$0x55, %ymm11, %ymm13
1579	vfnmadd231ps	%ymm2, %ymm13, %ymm5
1580	vpermilps		$0xaa, %ymm11, %ymm13
1581	vfnmadd231ps	%ymm2, %ymm13, %ymm6
1582	vpermilps		$0xff, %ymm11, %ymm13
1583	vfnmadd231ps	%ymm2, %ymm13, %ymm7
1584
1585
1586	vpermilps		$0xff, %xmm3, %xmm13
1587	vucomiss		%xmm15, %xmm13 // d_33 > 0.0 ?
1588	jbe			7f
1589	vsqrtss			%xmm13, %xmm13, %xmm13
1590	vdivss			%xmm13, %xmm14, %xmm13
15918:
1592	vmovss			%xmm13, 12(%r10)
1593	vbroadcastss	%xmm13, %ymm13
1594	vmulps			%ymm3, %ymm13, %ymm3
1595	vperm2f128		$0x11, %ymm3, %ymm3, %ymm11
1596	vpermilps		$0x00, %ymm11, %ymm13
1597	vfnmadd231ps	%ymm3, %ymm13, %ymm4
1598	vpermilps		$0x55, %ymm11, %ymm13
1599	vfnmadd231ps	%ymm3, %ymm13, %ymm5
1600	vpermilps		$0xaa, %ymm11, %ymm13
1601	vfnmadd231ps	%ymm3, %ymm13, %ymm6
1602	vpermilps		$0xff, %ymm11, %ymm13
1603	vfnmadd231ps	%ymm3, %ymm13, %ymm7
1604
1605
1606	vextractf128	$0x1, %ymm4, %xmm13
1607//	vpermilps		$0x00, %xmm13, %xmm13
	vucomiss		%xmm15, %xmm13 // d_44 > 0.0 ?
1609	jbe			9f
1610	vsqrtss			%xmm13, %xmm13, %xmm13
1611	vdivss			%xmm13, %xmm14, %xmm13
161210:
1613	vmovss			%xmm13, 16(%r10)
1614	vbroadcastss	%xmm13, %ymm13
1615	vmulps			%ymm4, %ymm13, %ymm4
1616	cmpl		$6, %r11d
1617	jl			0f // ret
1618	vperm2f128		$0x11, %ymm4, %ymm4, %ymm11
1619	vpermilps		$0x55, %ymm11, %ymm13
1620	vfnmadd231ps	%ymm4, %ymm13, %ymm5
1621	vpermilps		$0xaa, %ymm11, %ymm13
1622	vfnmadd231ps	%ymm4, %ymm13, %ymm6
1623	vpermilps		$0xff, %ymm11, %ymm13
1624	vfnmadd231ps	%ymm4, %ymm13, %ymm7
1625
1626
1627	vextractf128	$0x1, %ymm5, %xmm13
1628	vpermilps		$0x55, %xmm13, %xmm13
	vucomiss		%xmm15, %xmm13 // d_55 > 0.0 ?
1630	jbe			11f
1631	vsqrtss			%xmm13, %xmm13, %xmm13
1632	vdivss			%xmm13, %xmm14, %xmm13
163312:
1634	vmovss			%xmm13, 20(%r10)
1635	vbroadcastss	%xmm13, %ymm13
1636	vmulps			%ymm5, %ymm13, %ymm5
1637	cmpl		$7, %r11d
1638	jl			0f // ret
1639	vperm2f128		$0x11, %ymm5, %ymm5, %ymm11
1640	vpermilps		$0xaa, %ymm11, %ymm13
1641	vfnmadd231ps	%ymm5, %ymm13, %ymm6
1642	vpermilps		$0xff, %ymm11, %ymm13
1643	vfnmadd231ps	%ymm5, %ymm13, %ymm7
1644
1645
1646	vextractf128	$0x1, %ymm6, %xmm13
1647	vpermilps		$0xaa, %xmm13, %xmm13
	vucomiss		%xmm15, %xmm13 // d_66 > 0.0 ?
1649	jbe			13f
1650	vsqrtss			%xmm13, %xmm13, %xmm13
1651	vdivss			%xmm13, %xmm14, %xmm13
165214:
1653	vmovss			%xmm13, 24(%r10)
1654	vbroadcastss	%xmm13, %ymm13
1655	vmulps			%ymm6, %ymm13, %ymm6
1656	cmpl		$8, %r11d
1657	jl			0f // ret
1658	vperm2f128		$0x11, %ymm6, %ymm6, %ymm11
1659	vpermilps		$0xff, %ymm11, %ymm13
1660	vfnmadd231ps	%ymm6, %ymm13, %ymm7
1661
1662
1663	vextractf128	$0x1, %ymm7, %xmm13
1664	vpermilps		$0xff, %xmm13, %xmm13
	vucomiss		%xmm15, %xmm13 // d_77 > 0.0 ?
1666	jbe			15f
1667	vsqrtss			%xmm13, %xmm13, %xmm13
1668	vdivss			%xmm13, %xmm14, %xmm13
166916:
1670	vmovss			%xmm13, 28(%r10)
1671	vbroadcastss	%xmm13, %ymm13
1672	vmulps			%ymm7, %ymm13, %ymm7
1673
1674
1675	jmp		0f
1676
1677
16781:
1679	vxorps	%ymm13, %ymm13, %ymm13
1680	jmp		2b
1681
16823:
1683	vxorpd	%ymm13, %ymm13, %ymm13
1684	jmp		4b
1685
16865:
1687	vxorpd	%ymm13, %ymm13, %ymm13
1688	jmp		6b
1689
16907:
1691	vxorpd	%ymm13, %ymm13, %ymm13
1692	jmp		8b
1693
16949:
1695	vxorpd	%ymm13, %ymm13, %ymm13
1696	jmp		10b
1697
169811:
1699	vxorpd	%ymm13, %ymm13, %ymm13
1700	jmp		12b
1701
170213:
1703	vxorpd	%ymm13, %ymm13, %ymm13
1704	jmp		14b
1705
170615:
1707	vxorpd	%ymm13, %ymm13, %ymm13
1708	jmp		16b
1709
17100:
1711
1712#if MACRO_LEVEL>=1
1713	.endm
1714#else
1715	ret
1716
1717	FUN_END(inner_edge_potrf_8x8_vs_lib8)
1718#endif
1719
1720
1721
1722
1723
1724// common inner routine with file scope
1725//
1726// scale for generic alpha and beta
1727//
1728// input arguments:
1729// r10   <- alpha
1730// r11   <- beta
1731// r12   <- C
1732// ymm0 <- []
1733// ymm1 <- []
1734// ymm2 <- []
1735// ymm3 <- []
1736// ymm4 <- []
1737// ymm5 <- []
1738// ymm6 <- []
1739// ymm7 <- []
1740// ymm8  <- dirty
1741// ymm9  <- dirty
1742// ymm10 <- dirty
1743// ymm11 <- dirty
1744// ymm15 <- dirty
1745//
1746// output arguments:
1747// r10   <- alpha
1748// r11   <- beta
1749// r12   <- C
1750// ymm0 <- []
1751// ymm1 <- []
1752// ymm2 <- []
1753// ymm3 <- []
1754// ymm4 <- []
1755// ymm5 <- []
1756// ymm6 <- []
1757// ymm7 <- []
1758// ymm8  <- dirty
1759// ymm9  <- dirty
1760// ymm10 <- dirty
1761// ymm11 <- dirty
1762// ymm15 <- dirty
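//
// computes acc = alpha*acc + beta*C on the 8x8 block, loading C only when
// beta != 0.0, i.e. roughly:
//
//   for(jj=0; jj<8; jj++)
//     for(ii=0; ii<8; ii++)
//       acc[ii][jj] = alpha*acc[ii][jj] + (beta!=0.0 ? beta*C[ii+8*jj] : 0.0);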
1763
1764#if MACRO_LEVEL>=1
1765	.macro INNER_SCALE_AB_8X8_LIB8
1766#else
1767	.p2align 4,,15
1768	FUN_START(inner_scale_ab_8x8_lib8)
1769#endif
1770
1771	// alpha
1772	vbroadcastss	0(%r10), %ymm11
1773
1774	vmulps		%ymm0, %ymm11, %ymm0
1775	vmulps		%ymm1, %ymm11, %ymm1
1776	vmulps		%ymm2, %ymm11, %ymm2
1777	vmulps		%ymm3, %ymm11, %ymm3
1778	vmulps		%ymm4, %ymm11, %ymm4
1779	vmulps		%ymm5, %ymm11, %ymm5
1780	vmulps		%ymm6, %ymm11, %ymm6
1781	vmulps		%ymm7, %ymm11, %ymm7
1782
1783	// beta
1784	vbroadcastss	0(%r11), %ymm14
1785
1786	vxorps		%ymm15, %ymm15, %ymm15 // 0.0
1787
1788	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
1789	je			0f // end
1790
1791	vmovaps		0(%r12), %ymm15
1792	vfmadd231ps	%ymm15, %ymm14, %ymm0
1793	vmovaps		32(%r12), %ymm15
1794	vfmadd231ps	%ymm15, %ymm14, %ymm1
1795	vmovaps		64(%r12), %ymm15
1796	vfmadd231ps	%ymm15, %ymm14, %ymm2
1797	vmovaps		96(%r12), %ymm15
1798	vfmadd231ps	%ymm15, %ymm14, %ymm3
1799	vmovaps		128(%r12), %ymm15
1800	vfmadd231ps	%ymm15, %ymm14, %ymm4
1801	vmovaps		160(%r12), %ymm15
1802	vfmadd231ps	%ymm15, %ymm14, %ymm5
1803	vmovaps		192(%r12), %ymm15
1804	vfmadd231ps	%ymm15, %ymm14, %ymm6
1805	vmovaps		224(%r12), %ymm15
1806	vfmadd231ps	%ymm15, %ymm14, %ymm7
1807
18080:
1809
1810#if MACRO_LEVEL>=1
1811	.endm
1812#else
1813	ret
1814
1815	FUN_END(inner_scale_ab_8x8_lib8)
1816#endif
1817
1818
1819
1820
1821
1822// common inner routine with file scope
1823//
// scale for generic alpha and beta, generic C offset
1825//
1826// input arguments:
1827// r10   <- alpha
1828// r11   <- beta
1829// r12  <- offset
1830// r13   <- C
// r14  <- 8*sdc*sizeof(float)
1832// ymm0 <- []
1833// ymm1 <- []
1834// ymm2 <- []
1835// ymm3 <- []
1836// ymm4 <- []
1837// ymm5 <- []
1838// ymm6 <- []
1839// ymm7 <- []
1840// ymm8  <- dirty
1841// ymm9  <- dirty
1842// ymm10 <- dirty
1843// ymm11 <- dirty
1844// ymm15 <- dirty
1845//
1846// output arguments:
1847// r10   <- alpha
1848// r11   <- beta
1849// r12  <- offset
1850// r13   <- C
// r14  <- 8*sdc*sizeof(float)
1852// r15  <- n0 // col index: start from (inc)
1853// ymm0 <- []
1854// ymm1 <- []
1855// ymm2 <- []
1856// ymm3 <- []
1857// ymm4 <- []
1858// ymm5 <- []
1859// ymm6 <- []
1860// ymm7 <- []
1861// ymm8  <- dirty
1862// ymm9  <- dirty
1863// ymm10 <- dirty
1864// ymm11 <- dirty
1865// ymm15 <- dirty
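//
// only the offset==0 path is implemented below; the offset>0 branches dispatch
// on the offset value but are still TODO placeholders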
1866
1867#if MACRO_LEVEL>=1
1868	.macro INNER_SCALE_AB_8X8_GEN_LIB8
1869#else
1870	.p2align 4,,15
1871	FUN_START(inner_scale_ab_8x8_gen_lib8)
1872#endif
1873
1874	// alpha
1875	vbroadcastss	0(%r10), %ymm11
1876
1877	vmulps		%ymm0, %ymm11, %ymm0
1878	vmulps		%ymm1, %ymm11, %ymm1
1879	vmulps		%ymm2, %ymm11, %ymm2
1880	vmulps		%ymm3, %ymm11, %ymm3
1881
1882	vmulps		%ymm4, %ymm11, %ymm4
1883	vmulps		%ymm5, %ymm11, %ymm5
1884	vmulps		%ymm6, %ymm11, %ymm6
1885	vmulps		%ymm7, %ymm11, %ymm7
1886
1887	// beta
1888	vbroadcastss	0(%r11), %ymm15
1889
1890	vxorps		%ymm14, %ymm14, %ymm14 // 0.0
1891
1892	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
1893	je			3f // end
1894
1895	cmpl	$0, %r12d
1896	jg		0f
1897
1898	// offset==0
1899
1900	vmovaps		0(%r13), %ymm12
1901	vfmadd231ps	%ymm12, %ymm15, %ymm0
1902	vmovaps		32(%r13), %ymm12
1903	vfmadd231ps	%ymm12, %ymm15, %ymm1
1904	vmovaps		64(%r13), %ymm12
1905	vfmadd231ps	%ymm12, %ymm15, %ymm2
1906	vmovaps		96(%r13), %ymm12
1907	vfmadd231ps	%ymm12, %ymm15, %ymm3
1908	vmovaps		128(%r13), %ymm12
1909	vfmadd231ps	%ymm12, %ymm15, %ymm4
1910	vmovaps		160(%r13), %ymm12
1911	vfmadd231ps	%ymm12, %ymm15, %ymm5
1912	vmovaps		192(%r13), %ymm12
1913	vfmadd231ps	%ymm12, %ymm15, %ymm6
1914	vmovaps		224(%r13), %ymm12
1915	vfmadd231ps	%ymm12, %ymm15, %ymm7
1916
1917	jmp		7f
1918
19190:
1920
1921	// offset > 0
1922	// 1 2 3 4 5 6 7
1923
1924	movq	%r13, %r15 // C0
	addq	%r14, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
1926
	cmpl	$4, %r12d
1928	jl		1f
1929	jg		2f
1930
1931	// offset==4
1932	// TODO
1933	jmp		7f
1934
19351:
1936	// 1 2 3
1937
	cmpl	$2, %r12d
1939	jl		3f
1940	jg		4f
1941
1942	// offset==2
1943	// TODO
1944	jmp		7f
1945
19463:
1947	// offset==1
1948	// TODO
1949	jmp		7f
1950
19514:
1952	// offset==3
1953	// TODO
1954	jmp		7f
1955
19562:
1957	// 5 6 7
1958
	cmpl	$6, %r12d
1960	jl		5f
1961	jg		6f
1962
1963	// offset==6
1964	// TODO
1965	jmp		7f
1966
19675:
1968	// offset==5
1969	// TODO
1970	jmp		7f
1971
19726:
1973	// offset==7
1974	// TODO
1975	jmp		7f
1976
1977	// end
19787:
1979
1980
1981#if MACRO_LEVEL>=1
1982	.endm
1983#else
1984	ret
1985
1986	FUN_END(inner_scale_ab_8x8_gen_lib8)
1987#endif
1988
1989
1990
1991
1992
1993// common inner routine with file scope
1994//
// scale for alpha=1.0 and beta=1.0
1996//
1997// input arguments:
1998// r10   <- C
1999// ymm0 <- []
2000// ymm1 <- []
2001// ymm2 <- []
2002// ymm3 <- []
2003// ymm4 <- []
2004// ymm5 <- []
2005// ymm6 <- []
2006// ymm7 <- []
2007// ymm8  <- dirty
2008// ymm9  <- dirty
2009// ymm10 <- dirty
2010// ymm11 <- dirty
2011// ymm15 <- dirty
2012//
2013// output arguments:
2014// r10   <- C
2015// ymm0 <- []
2016// ymm1 <- []
2017// ymm2 <- []
2018// ymm3 <- []
2019// ymm4 <- []
2020// ymm5 <- []
2021// ymm6 <- []
2022// ymm7 <- []
2023// ymm8  <- dirty
2024// ymm9  <- dirty
2025// ymm10 <- dirty
2026// ymm11 <- dirty
2027// ymm15 <- dirty
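//
// computes acc += C (alpha = beta = 1.0), i.e. acc[ii][jj] += C[ii+8*jj] over
// the full 8x8 block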
2028
2029#if MACRO_LEVEL>=1
2030	.macro INNER_SCALE_11_8X8_LIB8
2031#else
2032	.p2align 4,,15
2033	FUN_START(inner_scale_11_8x8_lib8)
2034#endif
2035
2036	vmovaps		0(%r10), %ymm15
2037	vaddps		%ymm0, %ymm15, %ymm0
2038	vmovaps		32(%r10), %ymm15
2039	vaddps		%ymm1, %ymm15, %ymm1
2040	vmovaps		64(%r10), %ymm15
2041	vaddps		%ymm2, %ymm15, %ymm2
2042	vmovaps		96(%r10), %ymm15
2043	vaddps		%ymm3, %ymm15, %ymm3
2044	vmovaps		128(%r10), %ymm15
2045	vaddps		%ymm4, %ymm15, %ymm4
2046	vmovaps		160(%r10), %ymm15
2047	vaddps		%ymm5, %ymm15, %ymm5
2048	vmovaps		192(%r10), %ymm15
2049	vaddps		%ymm6, %ymm15, %ymm6
2050	vmovaps		224(%r10), %ymm15
2051	vaddps		%ymm7, %ymm15, %ymm7
2052
2053#if MACRO_LEVEL>=1
2054	.endm
2055#else
2056	ret
2057
2058	FUN_END(inner_scale_11_8x8_lib8)
2059#endif
2060
2061
2062
2063
2064
2065// common inner routine with file scope
2066//
// scale for alpha=1.0 and beta=1.0, generic C offset
2068//
2069// input arguments:
2070// r10  <- offset
2071// r11   <- C
// r12  <- 8*sdc*sizeof(float)
2073// ymm0 <- []
2074// ymm1 <- []
2075// ymm2 <- []
2076// ymm3 <- []
2077// ymm4 <- []
2078// ymm5 <- []
2079// ymm6 <- []
2080// ymm7 <- []
2081// ymm8  <- dirty
2082// ymm9  <- dirty
2083// ymm10 <- dirty
2084// ymm11 <- dirty
2085// ymm15 <- dirty
2086//
2087// output arguments:
2088// r10  <- offset
2089// r11   <- C
// r12  <- 8*sdc*sizeof(float)
2091// ymm0 <- []
2092// ymm1 <- []
2093// ymm2 <- []
2094// ymm3 <- []
2095// ymm4 <- []
2096// ymm5 <- []
2097// ymm6 <- []
2098// ymm7 <- []
2099// ymm8  <- dirty
2100// ymm9  <- dirty
2101// ymm10 <- dirty
2102// ymm11 <- dirty
2103// ymm15 <- dirty
2104
2105#if MACRO_LEVEL>=1
2106	.macro INNER_SCALE_11_8X8_GEN_LIB8
2107#else
2108	.p2align 4,,15
2109	FUN_START(inner_scale_11_8x8_gen_lib8)
2110#endif
2111
2112	cmpl	$0, %r10d
2113	jg		0f
2114
2115	// offset==0
2116
2117	vmovaps		0(%r11), %ymm12
2118	vaddps		%ymm0, %ymm12, %ymm0
2119	vmovaps		32(%r11), %ymm12
2120	vaddps		%ymm1, %ymm12, %ymm1
2121	vmovaps		64(%r11), %ymm12
2122	vaddps		%ymm2, %ymm12, %ymm2
2123	vmovaps		96(%r11), %ymm12
2124	vaddps		%ymm3, %ymm12, %ymm3
2125	vmovaps		128(%r11), %ymm12
2126	vaddps		%ymm4, %ymm12, %ymm4
2127	vmovaps		160(%r11), %ymm12
2128	vaddps		%ymm5, %ymm12, %ymm5
2129	vmovaps		192(%r11), %ymm12
2130	vaddps		%ymm6, %ymm12, %ymm6
2131	vmovaps		224(%r11), %ymm12
2132	vaddps		%ymm7, %ymm12, %ymm7
2133
2134	jmp		7f
2135
21360:
2137
2138	// offset > 0
2139	// 1 2 3 4 5 6 7
2140
2141	movq	%r11, %r15 // C0
2142	addq	%r12, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
2143
2144	cmpl	$4, %r10d
2145	jl		1f
2146	jg		2f
2147
2148	// offset==4
2149	// TODO
2150	jmp		7f
2151
21521:
2153	// 1 2 3
2154
2155	cmpl	$2, %r10d
2156	jl		3f
2157	jg		4f
2158
2159	// offset==2
2160	// TODO
2161	jmp		7f
2162
21633:
2164	// offset==1
2165	// TODO
2166	jmp		7f
2167
21684:
2169	// offset==3
2170	// TODO
2171	jmp		7f
2172
21732:
2174	// 5 6 7
2175
2176	cmpl	$6, %r10d
2177	jl		5f
2178	jg		6f
2179
2180	// offset==6
2181	// TODO
2182	jmp		7f
2183
21845:
2185	// offset==5
2186	// TODO
2187	jmp		7f
2188
21896:
2190	// offset==7
2191	// TODO
2192	jmp		7f
2193
2194	// end
21957:
2196
2197
2198#if MACRO_LEVEL>=1
2199	.endm
2200#else
2201	ret
2202
2203	FUN_END(inner_scale_11_8x8_gen_lib8)
2204#endif
2205
2206
2207
2208
2209
2210// common inner routine with file scope
2211//
2212// blend and scale for generic alpha and beta
2213//
2214// input arguments:
2215// r10   <- alpha
2216// r11   <- beta
2217// r12   <- C
2218// ymm0 <- []
2219// ymm1 <- []
2220// ymm2 <- []
2221// ymm3 <- []
2222// ymm4 <- []
2223// ymm5 <- []
2224// ymm6 <- []
2225// ymm7 <- []
2226// ymm8  <- dirty
2227// ymm9  <- dirty
2228// ymm10 <- dirty
2229// ymm11 <- dirty
2230// ymm15 <- dirty
2231//
2232// output arguments:
2233// r10   <- alpha
2234// r11   <- beta
2235// r12   <- C
2236// ymm0 <- []
2237// ymm1 <- []
2238// ymm2 <- []
2239// ymm3 <- []
2240// ymm4 <- []
2241// ymm5 <- []
2242// ymm6 <- []
2243// ymm7 <- []
2244// ymm8  <- dirty
2245// ymm9  <- dirty
2246// ymm10 <- dirty
2247// ymm11 <- dirty
2248// ymm15 <- dirty
2249
2250#if MACRO_LEVEL>=1
2251	.macro INNER_BLEND_SCALE_AB_8X8_LIB8
2252#else
2253	.p2align 4,,15
2254	FUN_START(inner_blend_scale_ab_8x8_lib8)
2255#endif
2256
2257	// alpha
2258	vbroadcastss	0(%r10), %ymm11
2259
2260	vblendps	$0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
2261	vblendps	$0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
2262	vblendps	$0xaa, %ymm3, %ymm2, %ymm14
2263	vblendps	$0x55, %ymm3, %ymm2, %ymm15
2264
2265	vblendps	$0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
2266	vblendps	$0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
2267	vblendps	$0xcc, %ymm14, %ymm13, %ymm1
2268	vblendps	$0x33, %ymm14, %ymm13, %ymm3
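	// the two blend passes above (even/odd lanes, then pairs of lanes) undo
	// the column interleaving left in ymm0-ymm3 by the nt accumulation kernel,
	// so that each register again holds one plain column before scaling by alpha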
2269
2270	vmulps		%ymm0, %ymm11, %ymm0
2271	vmulps		%ymm1, %ymm11, %ymm1
2272	vmulps		%ymm2, %ymm11, %ymm2
2273	vmulps		%ymm3, %ymm11, %ymm3
2274
2275	vblendps	$0xaa, %ymm5, %ymm4, %ymm12
2276	vblendps	$0x55, %ymm5, %ymm4, %ymm13
2277	vblendps	$0xaa, %ymm7, %ymm6, %ymm14
2278	vblendps	$0x55, %ymm7, %ymm6, %ymm15
2279
2280	vblendps	$0xcc, %ymm15, %ymm12, %ymm4
2281	vblendps	$0x33, %ymm15, %ymm12, %ymm6
2282	vblendps	$0xcc, %ymm14, %ymm13, %ymm5
2283	vblendps	$0x33, %ymm14, %ymm13, %ymm7
2284
2285	vmulps		%ymm4, %ymm11, %ymm4
2286	vmulps		%ymm5, %ymm11, %ymm5
2287	vmulps		%ymm6, %ymm11, %ymm6
2288	vmulps		%ymm7, %ymm11, %ymm7
2289
2290	// beta
2291	vbroadcastss	0(%r11), %ymm14
2292
2293	vxorps		%ymm15, %ymm15, %ymm15 // 0.0
2294
2295	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
2296	je			0f // end
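	// note: when beta compares equal to 0.0 the accumulation below is skipped
	// entirely, so C is never dereferenced in that case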
2297
2298	vmovaps		0(%r12), %ymm15
2299	vfmadd231ps	%ymm15, %ymm14, %ymm0
2300	vmovaps		32(%r12), %ymm15
2301	vfmadd231ps	%ymm15, %ymm14, %ymm1
2302	vmovaps		64(%r12), %ymm15
2303	vfmadd231ps	%ymm15, %ymm14, %ymm2
2304	vmovaps		96(%r12), %ymm15
2305	vfmadd231ps	%ymm15, %ymm14, %ymm3
2306	vmovaps		128(%r12), %ymm15
2307	vfmadd231ps	%ymm15, %ymm14, %ymm4
2308	vmovaps		160(%r12), %ymm15
2309	vfmadd231ps	%ymm15, %ymm14, %ymm5
2310	vmovaps		192(%r12), %ymm15
2311	vfmadd231ps	%ymm15, %ymm14, %ymm6
2312	vmovaps		224(%r12), %ymm15
2313	vfmadd231ps	%ymm15, %ymm14, %ymm7
2314
23150:
2316
2317#if MACRO_LEVEL>=1
2318	.endm
2319#else
2320	ret
2321
2322	FUN_END(inner_blend_scale_ab_8x8_lib8)
2323#endif
2324
2325
2326
2327
2328
2329// common inner routine with file scope
2330//
2331// blend scale for generic alpha and beta
2332//
2333// input arguments:
2334// r10   <- alpha
2335// r11   <- beta
2336// r12  <- offset
2337// r13   <- C
2338// r14  <- 8*sdc*sizeof(float)
2339// ymm0 <- []
2340// ymm1 <- []
2341// ymm2 <- []
2342// ymm3 <- []
2343// ymm4 <- []
2344// ymm5 <- []
2345// ymm6 <- []
2346// ymm7 <- []
2347// ymm8  <- dirty
2348// ymm9  <- dirty
2349// ymm10 <- dirty
2350// ymm11 <- dirty
2351// ymm15 <- dirty
2352//
2353// output arguments:
2354// r10   <- alpha
2355// r11   <- beta
2356// r12  <- offset
2357// r13   <- C
2358// r14  <- 8*sdc*sizeof(float)
2359// ymm0 <- []
2360// ymm1 <- []
2361// ymm2 <- []
2362// ymm3 <- []
2363// ymm4 <- []
2364// ymm5 <- []
2365// ymm6 <- []
2366// ymm7 <- []
2367// ymm8  <- dirty
2368// ymm9  <- dirty
2369// ymm10 <- dirty
2370// ymm11 <- dirty
2371// ymm15 <- dirty
2372
2373#if MACRO_LEVEL>=1
2374	.macro INNER_BLEND_SCALE_AB_8X8_GEN_LIB8
2375#else
2376	.p2align 4,,15
2377	FUN_START(inner_blend_scale_ab_8x8_gen_lib8)
2378#endif
2379
2380	// alpha
2381	vbroadcastss	0(%r10), %ymm11
2382
2383	vblendps	$0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
2384	vblendps	$0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
2385	vblendps	$0xaa, %ymm3, %ymm2, %ymm14
2386	vblendps	$0x55, %ymm3, %ymm2, %ymm15
2387
2388	vblendps	$0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
2389	vblendps	$0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
2390	vblendps	$0xcc, %ymm14, %ymm13, %ymm1
2391	vblendps	$0x33, %ymm14, %ymm13, %ymm3
2392
2393	vmulps		%ymm0, %ymm11, %ymm0
2394	vmulps		%ymm1, %ymm11, %ymm1
2395	vmulps		%ymm2, %ymm11, %ymm2
2396	vmulps		%ymm3, %ymm11, %ymm3
2397
2398	vblendps	$0xaa, %ymm5, %ymm4, %ymm12
2399	vblendps	$0x55, %ymm5, %ymm4, %ymm13
2400	vblendps	$0xaa, %ymm7, %ymm6, %ymm14
2401	vblendps	$0x55, %ymm7, %ymm6, %ymm15
2402
2403	vblendps	$0xcc, %ymm15, %ymm12, %ymm4
2404	vblendps	$0x33, %ymm15, %ymm12, %ymm6
2405	vblendps	$0xcc, %ymm14, %ymm13, %ymm5
2406	vblendps	$0x33, %ymm14, %ymm13, %ymm7
2407
2408	vmulps		%ymm4, %ymm11, %ymm4
2409	vmulps		%ymm5, %ymm11, %ymm5
2410	vmulps		%ymm6, %ymm11, %ymm6
2411	vmulps		%ymm7, %ymm11, %ymm7
2412
2413	// beta
2414	vbroadcastss	0(%r11), %ymm15
2415
2416	vxorps		%ymm14, %ymm14, %ymm14 // 0.0
2417
2418	vucomiss	%xmm15, %xmm14 // beta==0.0 ?
2419	je			3f // end
2420
2421	cmpl	$0, %r12d
2422	jg		0f
2423
2424	// offset==0
2425
2426	vmovaps		0(%r13), %ymm12
2427	vfmadd231ps	%ymm12, %ymm15, %ymm0
2428	vmovaps		32(%r13), %ymm12
2429	vfmadd231ps	%ymm12, %ymm15, %ymm1
2430	vmovaps		64(%r13), %ymm12
2431	vfmadd231ps	%ymm12, %ymm15, %ymm2
2432	vmovaps		96(%r13), %ymm12
2433	vfmadd231ps	%ymm12, %ymm15, %ymm3
2434	vmovaps		128(%r13), %ymm12
2435	vfmadd231ps	%ymm12, %ymm15, %ymm4
2436	vmovaps		160(%r13), %ymm12
2437	vfmadd231ps	%ymm12, %ymm15, %ymm5
2438	vmovaps		192(%r13), %ymm12
2439	vfmadd231ps	%ymm12, %ymm15, %ymm6
2440	vmovaps		224(%r13), %ymm12
2441	vfmadd231ps	%ymm12, %ymm15, %ymm7
2442
2443	jmp		7f
2444
24450:
2446
2447	// offset > 0
2448	// 1 2 3 4 5 6 7
2449
2450	movq	%r13, %r15 // C0
2451	addq	%r14, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
2452
2453	cmpl	$4, %r12d
2454	jl		1f
2455	jg		2f
2456
2457	// offset==4
2458	// TODO
2459	jmp		7f
2460
24611:
2462	// 1 2 3
2463
2464	cmpl	$2, %r12d
2465	jl		3f
2466	jg		4f
2467
2468	// offset==2
2469	// TODO
2470	jmp		7f
2471
24723:
2473	// offset==1
2474	// TODO
2475	jmp		7f
2476
24774:
2478	// offset==3
2479	// TODO
2480	jmp		7f
2481
24822:
2483	// 5 6 7
2484
2485	cmpl	$6, %r12d
2486	jl		5f
2487	jg		6f
2488
2489	// offset==6
2490	// TODO
2491	jmp		7f
2492
24935:
2494	// offset==5
2495	// TODO
2496	jmp		7f
2497
24986:
2499	// offset==7
2500	// TODO
2501	jmp		7f
2502
2503	// end
25047:
2505
2506
2507#if MACRO_LEVEL>=1
2508	.endm
2509#else
2510	ret
2511
2512	FUN_END(inner_blend_scale_ab_8x8_gen_lib8)
2513#endif
2514
2515
2516
2517
2518
2519// common inner routine with file scope
2520//
2521// blend and scale for alpha=1.0 and beta=1.0
2522//
2523// input arguments:
2524// r10   <- C
2525// ymm0 <- []
2526// ymm1 <- []
2527// ymm2 <- []
2528// ymm3 <- []
2529// ymm4 <- []
2530// ymm5 <- []
2531// ymm6 <- []
2532// ymm7 <- []
2533// ymm8  <- dirty
2534// ymm9  <- dirty
2535// ymm10 <- dirty
2536// ymm11 <- dirty
2537// ymm15 <- dirty
2538//
2539// output arguments:
2540// r10   <- C
2541// ymm0 <- []
2542// ymm1 <- []
2543// ymm2 <- []
2544// ymm3 <- []
2545// ymm4 <- []
2546// ymm5 <- []
2547// ymm6 <- []
2548// ymm7 <- []
2549// ymm8  <- dirty
2550// ymm9  <- dirty
2551// ymm10 <- dirty
2552// ymm11 <- dirty
2553// ymm15 <- dirty
2554
2555#if MACRO_LEVEL>=1
2556	.macro INNER_BLEND_SCALE_11_8X8_LIB8
2557#else
2558	.p2align 4,,15
2559	FUN_START(inner_blend_scale_11_8x8_lib8)
2560#endif
2561
2562	vblendps	$0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
2563	vblendps	$0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
2564	vblendps	$0xaa, %ymm3, %ymm2, %ymm14
2565	vblendps	$0x55, %ymm3, %ymm2, %ymm15
2566
2567	vblendps	$0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
2568	vblendps	$0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
2569	vblendps	$0xcc, %ymm14, %ymm13, %ymm1
2570	vblendps	$0x33, %ymm14, %ymm13, %ymm3
2571
2572	vblendps	$0xaa, %ymm5, %ymm4, %ymm12
2573	vblendps	$0x55, %ymm5, %ymm4, %ymm13
2574	vblendps	$0xaa, %ymm7, %ymm6, %ymm14
2575	vblendps	$0x55, %ymm7, %ymm6, %ymm15
2576
2577	vblendps	$0xcc, %ymm15, %ymm12, %ymm4
2578	vblendps	$0x33, %ymm15, %ymm12, %ymm6
2579	vblendps	$0xcc, %ymm14, %ymm13, %ymm5
2580	vblendps	$0x33, %ymm14, %ymm13, %ymm7
2581
2582	vmovaps		0(%r10), %ymm15
2583	vaddps		%ymm0, %ymm15, %ymm0
2584	vmovaps		32(%r10), %ymm15
2585	vaddps		%ymm1, %ymm15, %ymm1
2586	vmovaps		64(%r10), %ymm15
2587	vaddps		%ymm2, %ymm15, %ymm2
2588	vmovaps		96(%r10), %ymm15
2589	vaddps		%ymm3, %ymm15, %ymm3
2590	vmovaps		128(%r10), %ymm15
2591	vaddps		%ymm4, %ymm15, %ymm4
2592	vmovaps		160(%r10), %ymm15
2593	vaddps		%ymm5, %ymm15, %ymm5
2594	vmovaps		192(%r10), %ymm15
2595	vaddps		%ymm6, %ymm15, %ymm6
2596	vmovaps		224(%r10), %ymm15
2597	vaddps		%ymm7, %ymm15, %ymm7
2598
2599#if MACRO_LEVEL>=1
2600	.endm
2601#else
2602	ret
2603
2604	FUN_END(inner_blend_scale_11_8x8_lib8)
2605#endif
2606
2607
2608
2609
2610
2611// common inner routine with file scope
2612//
2613// blend and scale for alpha=1.0 and beta=1.0, generalized
2614//
2615// input arguments:
2616// r10  <- offset
2617// r11   <- C
2618// r12  <- 8*sdc*sizeof(float)
2619// ymm0 <- []
2620// ymm1 <- []
2621// ymm2 <- []
2622// ymm3 <- []
2623// ymm4 <- []
2624// ymm5 <- []
2625// ymm6 <- []
2626// ymm7 <- []
2627// ymm8  <- dirty
2628// ymm9  <- dirty
2629// ymm10 <- dirty
2630// ymm11 <- dirty
2631// ymm15 <- dirty
2632//
2633// output arguments:
2634// r10  <- offset
2635// r11   <- C
2636// r12  <- 8*sdc*sizeof(float)
2637// ymm0 <- []
2638// ymm1 <- []
2639// ymm2 <- []
2640// ymm3 <- []
2641// ymm4 <- []
2642// ymm5 <- []
2643// ymm6 <- []
2644// ymm7 <- []
2645// ymm8  <- dirty
2646// ymm9  <- dirty
2647// ymm10 <- dirty
2648// ymm11 <- dirty
2649// ymm15 <- dirty
2650
2651#if MACRO_LEVEL>=1
2652	.macro INNER_BLEND_SCALE_11_8X8_GEN_LIB8
2653#else
2654	.p2align 4,,15
2655	FUN_START(inner_blend_scale_11_8x8_gen_lib8)
2656#endif
2657
2658	vblendps	$0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010
2659	vblendps	$0x55, %ymm1, %ymm0, %ymm13 // 0101 0101
2660	vblendps	$0xaa, %ymm3, %ymm2, %ymm14
2661	vblendps	$0x55, %ymm3, %ymm2, %ymm15
2662
2663	vblendps	$0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100
2664	vblendps	$0x33, %ymm15, %ymm12, %ymm2 // 0011 0011
2665	vblendps	$0xcc, %ymm14, %ymm13, %ymm1
2666	vblendps	$0x33, %ymm14, %ymm13, %ymm3
2667
2668	vblendps	$0xaa, %ymm5, %ymm4, %ymm12
2669	vblendps	$0x55, %ymm5, %ymm4, %ymm13
2670	vblendps	$0xaa, %ymm7, %ymm6, %ymm14
2671	vblendps	$0x55, %ymm7, %ymm6, %ymm15
2672
2673	vblendps	$0xcc, %ymm15, %ymm12, %ymm4
2674	vblendps	$0x33, %ymm15, %ymm12, %ymm6
2675	vblendps	$0xcc, %ymm14, %ymm13, %ymm5
2676	vblendps	$0x33, %ymm14, %ymm13, %ymm7
2677
2678	cmpl	$0, %r10d
2679	jg		0f
2680
2681	// offset==0
2682
2683	vmovaps		0(%r11), %ymm12
2684	vaddps		%ymm0, %ymm12, %ymm0
2685	vmovaps		32(%r11), %ymm12
2686	vaddps		%ymm1, %ymm12, %ymm1
2687	vmovaps		64(%r11), %ymm12
2688	vaddps		%ymm2, %ymm12, %ymm2
2689	vmovaps		96(%r11), %ymm12
2690	vaddps		%ymm3, %ymm12, %ymm3
2691	vmovaps		128(%r11), %ymm12
2692	vaddps		%ymm4, %ymm12, %ymm4
2693	vmovaps		160(%r11), %ymm12
2694	vaddps		%ymm5, %ymm12, %ymm5
2695	vmovaps		192(%r11), %ymm12
2696	vaddps		%ymm6, %ymm12, %ymm6
2697	vmovaps		224(%r11), %ymm12
2698	vaddps		%ymm7, %ymm12, %ymm7
2699
2700	jmp		7f
2701
27020:
2703
2704	// offset > 0
2705	// 1 2 3 4 5 6 7
2706
2707	movq	%r11, %r15 // C0
2708	addq	%r12, %r15 // C1 <- C0 + 8*sdc*sizeof(float)
2709
2710	cmpl	$4, %r10d
2711	jl		1f
2712	jg		2f
2713
2714	// offset==4
2715	// TODO
2716	jmp		7f
2717
27181:
2719	// 1 2 3
2720
2721	cmpl	$2, %r10d
2722	jl		3f
2723	jg		4f
2724
2725	// offset==2
2726	// TODO
2727	jmp		7f
2728
27293:
2730	// offset==1
2731	// TODO
2732	jmp		7f
2733
27344:
2735	// offset==3
2736	// TODO
2737	jmp		7f
2738
27392:
2740	// 5 6 7
2741
2742	cmpl	$6, %r10d
2743	jl		5f
2744	jg		6f
2745
2746	// offset==6
2747	// TODO
2748	jmp		7f
2749
27505:
2751	// offset==5
2752	// TODO
2753	jmp		7f
2754
27556:
2756	// offset==7
2757	// TODO
2758	jmp		7f
2759
2760	// end
27617:
2762
2763
2764#if MACRO_LEVEL>=1
2765	.endm
2766#else
2767	ret
2768
2769	FUN_END(inner_blend_scale_11_8x8_gen_lib8)
2770#endif
2771
2772
2773
2774
2775
2776// common inner routine with file scope
2777//
2778// store n
2779//
2780// input arguments:
2781// r10  <- D
2782// ymm0-ymm7 <- []
2786//
2787// output arguments:
2788// r10  <- D
2789// ymm0-ymm7 <- []
2793
2794#if MACRO_LEVEL>=1
2795	.macro INNER_STORE_8X8_LIB8
2796#else
2797	.p2align 4,,15
2798	FUN_START(inner_store_8x8_lib8)
2799#endif
2800
2801	vmovaps 	%ymm0,  0(%r10)
2802	vmovaps 	%ymm1, 32(%r10)
2803	vmovaps 	%ymm2, 64(%r10)
2804	vmovaps 	%ymm3, 96(%r10)
2805	vmovaps 	%ymm4, 128(%r10)
2806	vmovaps 	%ymm5, 160(%r10)
2807	vmovaps 	%ymm6, 192(%r10)
2808	vmovaps 	%ymm7, 224(%r10)
2809
2810#if MACRO_LEVEL>=1
2811	.endm
2812#else
2813	ret
2814
2815	FUN_END(inner_store_8x8_lib8)
2816#endif
2817
2818
2819
2820
2821
2822// common inner routine with file scope
2823//
2824// store n vs
2825//
2826// input arguments:
2827// r10  <- D
2828// r11  <- km
2829// r12  <- kn
2830// ymm0 <- []
2831// ymm1 <- []
2832// ymm2 <- []
2833// ymm3 <- []
2834// ymm4 <- []
2835// ymm5 <- []
2836// ymm6 <- []
2837// ymm7 <- []
2838//
2839// output arguments:
2840// r10  <- D
2841// r11  <- km
2842// r12  <- kn
2843// ymm0 <- []
2844// ymm1 <- []
2845// ymm2 <- []
2846// ymm3 <- []
2847// ymm4 <- []
2848// ymm5 <- []
2849// ymm6 <- []
2850// ymm7 <- []
2851
2852#if MACRO_LEVEL>=1
2853	.macro INNER_STORE_8X8_VS_LIB8
2854#else
2855	.p2align 4,,15
2856	FUN_START(inner_store_8x8_vs_lib8)
2857#endif
2858
2859	// compute mask for rows
2860	vcvtsi2ss	%r11d, %xmm15, %xmm15
2861#if defined(OS_LINUX) | defined(OS_WINDOWS)
2862	vmovups		.LC00(%rip), %ymm12
2863#elif defined(OS_MAC)
2864	vmovups		LC00(%rip), %ymm12
2865#endif
2866	vshufps		$0x00, %xmm15, %xmm15, %xmm15
2867	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
2868	vsubps		%ymm15, %ymm12, %ymm15
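	// mask trick: LC00 holds { 0.5, 1.5, ..., 7.5 }; after broadcasting km,
	// LC00 - km is negative exactly in lanes i < km, and vmaskmovps stores only
	// the lanes whose sign bit is set, i.e. rows 0 .. km-1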
2869
2870	vmaskmovps	%ymm0, %ymm15,  0(%r10)
2871	vmaskmovps	%ymm1, %ymm15,  32(%r10)
2872	vmaskmovps	%ymm2, %ymm15,  64(%r10)
2873	vmaskmovps	%ymm3, %ymm15,  96(%r10)
2874	vmaskmovps	%ymm4, %ymm15,  128(%r10)
2875	cmpl		$6, %r12d
2876	jl			0f // end
2877	vmaskmovps	%ymm5, %ymm15, 160(%r10)
2878	cmpl		$7, %r12d
2879	jl			0f // end
2880	vmaskmovps	%ymm6, %ymm15, 192(%r10)
2881	je			0f // end
2882	vmaskmovps	%ymm7, %ymm15, 224(%r10)
2883
28840:
2885
2886#if MACRO_LEVEL>=1
2887	.endm
2888#else
2889	ret
2890
2891	FUN_END(inner_store_8x8_vs_lib8)
2892#endif
2893
2894
2895
2896
2897
2898// common inner routine with file scope
2899//
2900// store n generalized
2901//
2902// input arguments:
2903// r10  <- offset
2904// r11  <- D
2905// r12  <- 8*sdd*sizeof(float)
2906// r13  <- m0 // row index: start from (inc)
2907// r14  <- m1 // row index: up to (exc)
2908// r15  <- n0 // col index: start from (inc)
2909// rax  <- n1 // col index: up to (exc)
2910// rbx  <- dirty
2911// ymm0 <- []
2912// ymm1 <- []
2913// ymm2 <- []
2914// ymm3 <- []
2915// ymm4 <- []
2916// ymm5 <- []
2917// ymm6 <- []
2918// ymm7 <- []
2919//
2920// output arguments:
2921// r10  <- offset
2922// r11  <- D
2923// r12  <- 8*sdd*sizeof(float)
2924// r13  <- m0 // row index: start from (inc)
2925// r14  <- m1 // row index: up to (exc)
2926// r15  <- n1-n0
2927// rax  <- n1-n0
2928// rbx  <- dirty
2929// ymm0 <- []
2930// ymm1 <- []
2931// ymm2 <- []
2932// ymm3 <- []
2933// ymm4 <- []
2934// ymm5 <- []
2935// ymm6 <- []
2936// ymm7 <- []
2937
2938#if MACRO_LEVEL>=1
2939	.macro INNER_STORE_8X8_GEN_LIB8
2940#else
2941	.p2align 4,,15
2942	FUN_START(inner_store_8x8_gen_lib8)
2943#endif
2944
2945	// compute mask for rows
2946	vcvtsi2ss	%r13d, %xmm14, %xmm14
2947	vcvtsi2ss	%r14d, %xmm15, %xmm15
2948#if defined(OS_LINUX) | defined(OS_WINDOWS)
2949	vmovups		.LC00(%rip), %ymm12
2950#elif defined(OS_MAC)
2951	vmovups		LC00(%rip), %ymm12
2952#endif
2953	vshufps		$0x00, %xmm14, %xmm14, %xmm14
2954	vshufps		$0x00, %xmm15, %xmm15, %xmm15
2955	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
2956	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
2957	vsubps		%ymm12, %ymm14, %ymm14
2958	vsubps		%ymm15, %ymm12, %ymm15
2959	vandps		%ymm14, %ymm15, %ymm15
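	// row mask: m0 - LC00 has the sign bit set for rows i >= m0, and
	// LC00 - m1 for rows i < m1; their bitwise and keeps exactly the rows
	// m0 <= i < m1 for the masked stores below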
2960
2961	// shift D and sol for cols
2962	cmpl	$0, %r15d
2963	jle		0f
2964
2965	vmovaps		%ymm1, %ymm0
2966	vmovaps		%ymm2, %ymm1
2967	vmovaps		%ymm3, %ymm2
2968	vmovaps		%ymm4, %ymm3
2969	vmovaps		%ymm5, %ymm4
2970	vmovaps		%ymm6, %ymm5
2971	vmovaps		%ymm7, %ymm6
2972	addq		$32, %r11
2973
2974	cmpl	$1, %r15d
2975	jle		0f
2976
2977	vmovaps		%ymm1, %ymm0
2978	vmovaps		%ymm2, %ymm1
2979	vmovaps		%ymm3, %ymm2
2980	vmovaps		%ymm4, %ymm3
2981	vmovaps		%ymm5, %ymm4
2982	vmovaps		%ymm6, %ymm5
2983	addq		$32, %r11
2984
2985	cmpl	$2, %r15d
2986	jle		0f
2987
2988	vmovaps		%ymm1, %ymm0
	vmovaps		%ymm2, %ymm1
2989	vmovaps		%ymm3, %ymm2
2990	vmovaps		%ymm4, %ymm3
2991	vmovaps		%ymm5, %ymm4
2992	addq		$32, %r11
2993
29940:
2995
2996	// compute number of cols
2997	cmpl	$8, %eax
2998	jle		0f
2999	movl	$8, %eax
30000:
3001	subl	%r15d, %eax
3002	movl	%eax, %r15d
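	// at this point the leading n0 columns have been dropped (accumulators slid
	// down, D advanced by 32 bytes per skipped column) and %r15d holds
	// n1 - n0 (with n1 clipped to 8), the number of columns still to be stored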
3003
3004	cmpl	$0, %r10d
3005	jg		0f
3006
3007	// offset==0
3008	vmaskmovps	%ymm0, %ymm15,  0(%r11)
3009	vmaskmovps	%ymm1, %ymm15,  32(%r11)
3010	vmaskmovps	%ymm2, %ymm15,  64(%r11)
3011	vmaskmovps	%ymm3, %ymm15,  96(%r11)
3012	vmaskmovps	%ymm4, %ymm15,  128(%r11)
3013	cmpl		$6, %r15d
3014	jl			7f // end
3015	vmaskmovps	%ymm5, %ymm15, 160(%r11)
3016	cmpl		$7, %r15d
3017	jl			7f // end
3018	vmaskmovps	%ymm6, %ymm15, 192(%r11)
3019	je			7f // end
3020	vmaskmovps	%ymm7, %ymm15, 224(%r11)
3021	//
3022	jmp		7f
3023
30240:
3025	// offset > 0
3026	// 1 2 3 4 5 6 7
3027
3028	movq	%r11, %rbx // D0
3029	addq	%r12, %rbx // D1 <- D0 + 8*sdd*sizeof(float)
3030
3031	cmpl	$4, %r10d
3032	jl		1f
3033	jg		2f
3034
3035	// offset==4
3036	// TODO
3037	jmp		7f
3038
30391:
3040	// 1 2 3
3041
3042	cmpl	$2, %r10d
3043	jl		3f
3044	jg		4f
3045
3046	// offset==2
3047	// TODO
3048	jmp		7f
3049
30503:
3051	// offset==1
3052	// TODO
3053	jmp		7f
3054
30554:
3056	// offset==3
3057	// TODO
3058	jmp		7f
3059
30602:
3061	// 5 6 7
3062
3063	cmpl	$6, %r10d
3064	jl		5f
3065	jg		6f
3066
3067	// offset==6
3068	// TODO
3069	jmp		7f
3070
30715:
3072	// offset==5
3073	// TODO
3074	jmp		7f
3075
30766:
3077	// offset==7
3078	// TODO
3079	jmp		7f
3080
3081	// end
30827:
3083
3084#if MACRO_LEVEL>=1
3085	.endm
3086#else
3087	ret
3088
3089	FUN_END(inner_store_8x8_gen_lib8)
3090#endif
3091
3092
3093
3094
3095
3096// common inner routine with file scope
3097//
3098// store lower
3099//
3100// input arguments:
3101// r10  <- D
3102// ymm0-ymm7 <- []
3106//
3107// output arguments:
3108// r10  <- D
3109// ymm0-ymm7 <- []
3113
3114#if MACRO_LEVEL>=1
3115	.macro INNER_STORE_L_8X8_LIB8
3116#else
3117	.p2align 4,,15
3118	FUN_START(inner_store_l_8x8_lib8)
3119#endif
3120
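	// lower-triangular store: for column j the blend takes the first j elements
	// back from memory, so only the diagonal and the part of the column below it
	// are actually overwritten in D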
3121	vmovaps 	%ymm0,  0(%r10)
3122	vmovaps		32(%r10), %ymm14
3123	vblendps	$0x01, %ymm14, %ymm1, %ymm1
3124	vmovaps 	%ymm1, 32(%r10)
3125	vmovaps		64(%r10), %ymm14
3126	vblendps	$0x03, %ymm14, %ymm2, %ymm2
3127	vmovaps 	%ymm2, 64(%r10)
3128	vmovaps		96(%r10), %ymm14
3129	vblendps	$0x07, %ymm14, %ymm3, %ymm3
3130	vmovaps 	%ymm3, 96(%r10)
3131	vmovaps		128(%r10), %ymm14
3132	vblendps	$0x0f, %ymm14, %ymm4, %ymm4
3133	vmovaps 	%ymm4, 128(%r10)
3134	vmovaps		160(%r10), %ymm14
3135	vblendps	$0x1f, %ymm14, %ymm5, %ymm5
3136	vmovaps 	%ymm5, 160(%r10)
3137	vmovaps		192(%r10), %ymm14
3138	vblendps	$0x3f, %ymm14, %ymm6, %ymm6
3139	vmovaps 	%ymm6, 192(%r10)
3140	vmovaps		224(%r10), %ymm14
3141	vblendps	$0x7f, %ymm14, %ymm7, %ymm7
3142	vmovaps 	%ymm7, 224(%r10)
3143
3144#if MACRO_LEVEL>=1
3145	.endm
3146#else
3147	ret
3148
3149	FUN_END(inner_store_l_8x8_lib8)
3150#endif
3151
3152
3153
3154
3155
3156// common inner routine with file scope
3157//
3158// store lower vs
3159//
3160// input arguments:
3161// r10  <- D
3162// r11  <- km
3163// r12  <- kn
3164// ymm0 <- []
3165// ymm1 <- []
3166// ymm2 <- []
3167// ymm3 <- []
3168// ymm4 <- []
3169// ymm5 <- []
3170// ymm6 <- []
3171// ymm7 <- []
3172//
3173// output arguments:
3174// r10  <- D
3175// r11  <- km
3176// r12  <- kn
3177// ymm0 <- []
3178// ymm1 <- []
3179// ymm2 <- []
3180// ymm3 <- []
3181// ymm4 <- []
3182// ymm5 <- []
3183// ymm6 <- []
3184// ymm7 <- []
3185
3186#if MACRO_LEVEL>=1
3187	.macro INNER_STORE_L_8X8_VS_LIB8
3188#else
3189	.p2align 4,,15
3190	FUN_START(inner_store_l_8x8_vs_lib8)
3191#endif
3192
3193	// compute mask for rows
3194	vcvtsi2ss	%r11d, %xmm15, %xmm15
3195#if defined(OS_LINUX) | defined(OS_WINDOWS)
3196	vmovups		.LC00(%rip), %ymm12
3197#elif defined(OS_MAC)
3198	vmovups		LC00(%rip), %ymm12
3199#endif
3200	vshufps		$0x00, %xmm15, %xmm15, %xmm15
3201	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
3202	vsubps		%ymm15, %ymm12, %ymm15
3203
3204	// offset==0
3205	vmaskmovps	%ymm0, %ymm15,  0(%r10)
3206	vmovaps 	32(%r10), %ymm12
3207	vblendps	$0x01, %ymm12, %ymm1, %ymm1
3208	vmaskmovps	%ymm1, %ymm15,  32(%r10)
3209	vmovaps 	64(%r10), %ymm12
3210	vblendps	$0x03, %ymm12, %ymm2, %ymm2
3211	vmaskmovps	%ymm2, %ymm15,  64(%r10)
3212	vmovaps 	96(%r10), %ymm12
3213	vblendps	$0x07, %ymm12, %ymm3, %ymm3
3214	vmaskmovps	%ymm3, %ymm15,  96(%r10)
3215	vmovaps 	128(%r10), %ymm12
3216	vblendps	$0x0f, %ymm12, %ymm4, %ymm4
3217	vmaskmovps	%ymm4, %ymm15,  128(%r10)
3218	cmpl		$6, %r12d
3219	jl			0f // end
3220	vmovaps 	160(%r10), %ymm12
3221	vblendps	$0x1f, %ymm12, %ymm5, %ymm5
3222	vmaskmovps	%ymm5, %ymm15, 160(%r10)
3223	cmpl		$7, %r12d
3224	jl			0f // end
3225	vmovaps 	192(%r10), %ymm12
3226	vblendps	$0x3f, %ymm12, %ymm6, %ymm6
3227	vmaskmovps	%ymm6, %ymm15, 192(%r10)
3228	je			0f // end
3229	vmovaps 	224(%r10), %ymm12
3230	vblendps	$0x7f, %ymm12, %ymm7, %ymm7
3231	vmaskmovps	%ymm7, %ymm15, 224(%r10)
3232
32330:
3234
3235#if MACRO_LEVEL>=1
3236	.endm
3237#else
3238	ret
3239
3240	FUN_END(inner_store_l_8x8_vs_lib8)
3241#endif
3242
3243
3244
3245
3246
3247// common inner routine with file scope
3248//
3249// store lower generalized
3250//
3251// input arguments:
3252// r10  <- offset
3253// r11  <- D
3254// r12  <- 8*sdd*sizeof(float)
3255// r13  <- m0 // row index: start from (inc)
3256// r14  <- m1 // row index: up to (exc)
3257// r15  <- n0 // col index: start from (inc)
3258// rax  <- n1 // col index: up to (exc)
3259// rbx  <- dirty
3260// ymm0 <- []
3261// ymm1 <- []
3262// ymm2 <- []
3263// ymm3 <- []
3264// ymm4 <- []
3265// ymm5 <- []
3266// ymm6 <- []
3267// ymm7 <- []
3268//
3269// output arguments:
3270// r10  <- offset
3271// r11  <- D
3272// r12  <- 8*sdd*sizeof(float)
3273// r13  <- m0 // row index: start from (inc)
3274// r14  <- m1 // row index: up to (exc)
3275// r15  <- n1-n0
3276// rax  <- n1-n0
3277// rbx  <- dirty
3278// ymm0 <- []
3279// ymm1 <- []
3280// ymm2 <- []
3281// ymm3 <- []
3282// ymm4 <- []
3283// ymm5 <- []
3284// ymm6 <- []
3285// ymm7 <- []
3286
3287#if MACRO_LEVEL>=1
3288	.macro INNER_STORE_L_8X8_GEN_LIB8
3289#else
3290	.p2align 4,,15
3291	FUN_START(inner_store_l_8x8_gen_lib8)
3292#endif
3293
3294	// compute mask for rows
3295	vcvtsi2ss	%r13d, %xmm14, %xmm14
3296	vcvtsi2ss	%r14d, %xmm15, %xmm15
3297#if defined(OS_LINUX) | defined(OS_WINDOWS)
3298	vmovups		.LC00(%rip), %ymm12
3299#elif defined(OS_MAC)
3300	vmovups		LC00(%rip), %ymm12
3301#endif
3302	vshufps		$0x00, %xmm14, %xmm14, %xmm14
3303	vshufps		$0x00, %xmm15, %xmm15, %xmm15
3304	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
3305	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
3306	vsubps		%ymm12, %ymm14, %ymm14
3307	vsubps		%ymm15, %ymm12, %ymm15
3308	vandps		%ymm14, %ymm15, %ymm15
3309
3310	// shift D and sol for cols
3311	cmpl	$0, %r15d
3312	jle		0f
3313
3314	vmovaps		%ymm1, %ymm0
3315	vmovaps		%ymm2, %ymm1
3316	vmovaps		%ymm3, %ymm2
3317	vmovaps		%ymm4, %ymm3
3318	vmovaps		%ymm5, %ymm4
3319	vmovaps		%ymm6, %ymm5
3320	vmovaps		%ymm7, %ymm6
3321	addq		$32, %r11
3322
3323	cmpl	$1, %r15d
3324	jle		0f
3325
3326	vmovaps		%ymm1, %ymm0
3327	vmovaps		%ymm2, %ymm1
3328	vmovaps		%ymm3, %ymm2
3329	vmovaps		%ymm4, %ymm3
3330	vmovaps		%ymm5, %ymm4
3331	vmovaps		%ymm6, %ymm5
3332	addq		$32, %r11
3333
3334	cmpl	$2, %r15d
3335	jle		0f
3336
3337	vmovaps		%ymm1, %ymm0
	vmovaps		%ymm2, %ymm1
3338	vmovaps		%ymm3, %ymm2
3339	vmovaps		%ymm4, %ymm3
3340	vmovaps		%ymm5, %ymm4
3341	addq		$32, %r11
3342
33430:
3344
3345	// compute number of cols
3346	cmpl	$8, %eax
3347	jle		0f
3348	movl	$8, %eax
33490:
3350	subl	%r15d, %eax
3351	movl	%eax, %r15d
3352
3353	cmpl	$0, %r10d
3354	jg		0f
3355
3356	// offset==0
3357	vmaskmovps	%ymm0, %ymm15,  0(%r11)
3358	vmovaps 	32(%r11), %ymm12
3359	vblendps	$0x01, %ymm12, %ymm1, %ymm1
3360	vmaskmovps	%ymm1, %ymm15,  32(%r11)
3361	vmovaps 	64(%r11), %ymm12
3362	vblendps	$0x03, %ymm12, %ymm2, %ymm2
3363	vmaskmovps	%ymm2, %ymm15,  64(%r11)
3364	vmovaps 	96(%r11), %ymm12
3365	vblendps	$0x07, %ymm12, %ymm3, %ymm3
3366	vmaskmovps	%ymm3, %ymm15,  96(%r11)
3367	vmovaps 	128(%r11), %ymm12
3368	vblendps	$0x0f, %ymm12, %ymm4, %ymm4
3369	vmaskmovps	%ymm4, %ymm15,  128(%r11)
3370	cmpl		$6, %r15d
3371	jl			7f // end
3372	vmovaps 	160(%r11), %ymm12
3373	vblendps	$0x1f, %ymm12, %ymm5, %ymm5
3374	vmaskmovps	%ymm5, %ymm15, 160(%r11)
3375	cmpl		$7, %r15d
3376	jl			7f // end
3377	vmovaps 	192(%r11), %ymm12
3378	vblendps	$0x3f, %ymm12, %ymm6, %ymm6
3379	vmaskmovps	%ymm6, %ymm15, 192(%r11)
3380	je			7f // end
3381	vmovaps 	224(%r11), %ymm12
3382	vblendps	$0x7f, %ymm12, %ymm7, %ymm7
3383	vmaskmovps	%ymm7, %ymm15, 224(%r11)
3384	//
3385	jmp		7f
3386
33870:
3388	// offset > 0
3389	// 1 2 3 4 5 6 7
3390
3391	movq	%r11, %rbx // D0
3392	addq	%r12, %rbx // D1 <- D0 + 8*sdd*sizeof(float)
3393
3394	cmpl	$4, %r10d
3395	jl		1f
3396	jg		2f
3397
3398	// offset==4
3399	// TODO
3400	jmp		7f
3401
34021:
3403	// 1 2 3
3404
3405	cmpl	$2, %r10d
3406	jl		3f
3407	jg		4f
3408
3409	// offset==2
3410	// TODO
3411	jmp		7f
3412
34133:
3414	// offset==1
3415	// TODO
3416	jmp		7f
3417
34184:
3419	// offset==3
3420	// TODO
3421	jmp		7f
3422
34232:
3424	// 5 6 7
3425
3426	cmpl	$6, %r10d
3427	jl		5f
3428	jg		6f
3429
3430	// offset==6
3431	// TODO
3432	jmp		7f
3433
34345:
3435	// offset==5
3436	// TODO
3437	jmp		7f
3438
34396:
3440	// offset==7
3441	// TODO
3442	jmp		7f
3443
3444	// end
34457:
3446
3447#if MACRO_LEVEL>=1
3448	.endm
3449#else
3450	ret
3451
3452	FUN_END(inner_store_l_8x8_gen_lib8)
3453#endif
3454
3455
3456
3457
3458
3459//                               1      2             3         4         5            6         7
3460// void kernel_sgemm_nt_8x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
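//
// computes the packed 8x8 block D = alpha * A * B^T + beta * C, with A and B
// pointing to 8 x k panel-major (lib8) buffers; a hypothetical call from C,
// purely as an illustration (Ap, Bp, Cp, Dp are assumed 32-byte aligned panel
// buffers packed elsewhere, as required by the vmovaps accesses):
//
//   float alpha = 1.0f, beta = 0.0f;
//   kernel_sgemm_nt_8x8_lib8(k, &alpha, Ap, Bp, &beta, Cp, Dp);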
3461
3462	.p2align 4,,15
3463	GLOB_FUN_START(kernel_sgemm_nt_8x8_lib8)
3464
3465	PROLOGUE
3466
3467	// zero accumulation registers
3468
3469	vxorpd	%ymm0, %ymm0, %ymm0
3470	vmovaps	%ymm0, %ymm1
3471	vmovaps	%ymm0, %ymm2
3472	vmovaps	%ymm0, %ymm3
3473	vmovaps	%ymm0, %ymm4
3474	vmovaps	%ymm0, %ymm5
3475	vmovaps	%ymm0, %ymm6
3476	vmovaps	%ymm0, %ymm7
3477
3478
3479	// call inner dgemm kernel nt
3480
3481	movq	ARG1, %r10 // k
3482	movq	ARG3, %r11  // A
3483	movq	ARG4, %r12  // B
3484
3485#if MACRO_LEVEL>=2
3486	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
3487#else
3488	CALL(inner_kernel_gemm_add_nt_8x8_lib8)
3489#endif
3490
3491
3492	// call inner scale
3493
3494	movq	ARG2, %r10 // alpha
3495	movq	ARG5, %r11 // beta
3496	movq	ARG6, %r12   // C
3497
3498#if MACRO_LEVEL>=1
3499	INNER_SCALE_AB_8X8_LIB8
3500#else
3501	CALL(inner_scale_ab_8x8_lib8)
3502#endif
3503
3504
3505	// store n
3506
3507	movq	ARG7, %r10 // D
3508
3509#if MACRO_LEVEL>=1
3510	INNER_STORE_8X8_LIB8
3511#else
3512	CALL(inner_store_8x8_lib8)
3513#endif
3514
3515
3516	EPILOGUE
3517
3518	ret
3519
3520	FUN_END(kernel_sgemm_nt_8x8_lib8)
3521
3522
3523
3524
3525
3526//                                  1      2             3         4         5            6         7         8       9
3527// void kernel_sgemm_nt_8x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
3528
3529	.p2align 4,,15
3530	GLOB_FUN_START(kernel_sgemm_nt_8x8_vs_lib8)
3531
3532	PROLOGUE
3533
3534	// zero accumulation registers
3535
3536	vxorpd	%ymm0, %ymm0, %ymm0
3537	vmovaps	%ymm0, %ymm1
3538	vmovaps	%ymm0, %ymm2
3539	vmovaps	%ymm0, %ymm3
3540	vmovaps	%ymm0, %ymm4
3541	vmovaps	%ymm0, %ymm5
3542	vmovaps	%ymm0, %ymm6
3543	vmovaps	%ymm0, %ymm7
3544
3545
3546	// call inner dgemm kernel nt
3547
3548	movq	ARG1, %r10 // k
3549	movq	ARG3, %r11  // A
3550	movq	ARG4, %r12  // B
3551
3552#if MACRO_LEVEL>=2
3553	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
3554#else
3555	CALL(inner_kernel_gemm_add_nt_8x8_lib8)
3556#endif
3557
3558
3559	// call inner scale
3560
3561	movq	ARG2, %r10 // alpha
3562	movq	ARG5, %r11 // beta
3563	movq	ARG6, %r12   // C
3564
3565#if MACRO_LEVEL>=1
3566	INNER_SCALE_AB_8X8_LIB8
3567#else
3568	CALL(inner_scale_ab_8x8_lib8)
3569#endif
3570
3571
3572	// store n
3573
3574	movq	ARG7, %r10 // D
3575	movq	ARG8, %r11 // km
3576	movq	ARG9, %r12 // kn
3577
3578#if MACRO_LEVEL>=1
3579	INNER_STORE_8X8_VS_LIB8
3580#else
3581	CALL(inner_store_8x8_vs_lib8)
3582#endif
3583
3584
3585	EPILOGUE
3586
3587	ret
3588
3589	FUN_END(kernel_sgemm_nt_8x8_vs_lib8)
3590
3591
3592
3593
3594
3595//                                   rdi    rsi           rdx       rcx       r8           r9           rsp+8     rsp+16   rsp+24       rsp+32    rsp+40   rsp+48  rsp+56  rsp+64  rsp+72
3596// void kernel_sgemm_nt_8x8_gen_lib8(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);
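//
// generalized variant: offsetC and offsetD give the row offset of the 8x8
// block inside its panel (sdc and sdd are the panel strides), while m0/m1 and
// n0/n1 select the rows and columns actually written; note that several
// offset>0 paths in the inner scale/store routines above are still marked TODO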
3597
3598	.p2align 4,,15
3599	GLOB_FUN_START(kernel_sgemm_nt_8x8_gen_lib8)
3600
3601	PROLOGUE
3602
3603	// zero accumulation registers
3604
3605	vxorpd	%ymm0, %ymm0, %ymm0
3606	vmovaps	%ymm0, %ymm1
3607	vmovaps	%ymm0, %ymm2
3608	vmovaps	%ymm0, %ymm3
3609	vmovaps	%ymm0, %ymm4
3610	vmovaps	%ymm0, %ymm5
3611	vmovaps	%ymm0, %ymm6
3612	vmovaps	%ymm0, %ymm7
3613
3614
3615	// call inner dgemm kernel nt
3616
3617	movq	ARG1, %r10 // k
3618	movq	ARG3, %r11  // A
3619	movq	ARG4, %r12  // B
3620
3621#if MACRO_LEVEL>=2
3622	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
3623#else
3624	CALL(inner_kernel_gemm_add_nt_8x8_lib8)
3625#endif
3626
3627
3628	// call inner blend scale
3629
3630	movq	ARG2, %r10 // alpha
3631	movq	ARG5, %r11 // beta
3632	movq	ARG6, %r12 // offsetC
3633	movq	ARG7, %r13 // C
3634	movq	ARG8, %r14 // sdc
3635	sall	$5, %r14d // 8*sdc*sizeof(float)
3636
3637#if MACRO_LEVEL>=1
3638	INNER_SCALE_AB_8X8_GEN_LIB8
3639#else
3640	CALL(inner_scale_ab_8x8_gen_lib8)
3641#endif
3642
3643
3644	// store n gen
3645
3646	movq	ARG9, %r10 // offsetD
3647	movq	ARG10, %r11 // D
3648	movq	ARG11, %r12 // sdd
3649	sall	$5, %r12d // 8*sdd*sizeof(float)
3650	movq	ARG12, %r13 // m0
3651	movq	ARG13, %r14 // m1
3652	movq	ARG14, %r15 // n0
3653	movq	ARG15, %rax // n1
3654
3655#if MACRO_LEVEL>=1
3656	INNER_STORE_8X8_GEN_LIB8
3657#else
3658	CALL(inner_store_8x8_gen_lib8)
3659#endif
3660
3661
3662	EPILOGUE
3663
3664	ret
3665
3666	FUN_END(kernel_sgemm_nt_8x8_gen_lib8)
3667
3668
3669
3670
3671
3672//                               1      2             3         4            5         6        7            8         9
3673// void kernel_sgemm_nn_8x8_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D);
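//
// nn variant: B is traversed in non-transposed order, so the kernel also takes
// offsetB, the starting row inside the first 8-row panel of B, and sdb, the
// panel stride of B (scaled below to 8*sdb*sizeof(float) bytes); the inner
// edge routine handles the initial partial panel before the main nn loop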
3674
3675	.p2align 4,,15
3676	GLOB_FUN_START(kernel_sgemm_nn_8x8_lib8)
3677
3678	PROLOGUE
3679
3680	// zero accumulation registers
3681
3682	ZERO_ACC
3683
3684
3685	// call inner dgemm kernel nn
3686
3687	movq	ARG1, %r10 // k
3688	movq	ARG3, %r11  // A
3689	movq	ARG5, %r12  // B
3690	movq	ARG6, %r13 // sdb
3691	sall	$5, %r13d // 8*sdb*sizeof(float)
3692	movq	ARG4, %r14 // offsetB
3693
3694#if MACRO_LEVEL>=1
3695	INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
3696#else
3697	CALL(inner_edge_gemm_add_nn_8x8_lib8)
3698#endif
3699
3700#if MACRO_LEVEL>=2
3701	INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
3702#else
3703	CALL(inner_kernel_gemm_add_nn_8x8_lib8)
3704#endif
3705
3706
3707	// call inner blend
3708
3709	movq	ARG2, %r10 // alpha
3710	movq	ARG7, %r11 // beta
3711	movq	ARG8, %r12   // C
3712
3713#if MACRO_LEVEL>=1
3714	INNER_SCALE_AB_8X8_LIB8
3715#else
3716	CALL(inner_scale_ab_8x8_lib8)
3717#endif
3718
3719
3720	// store n
3721
3722	movq	ARG9, %r10 // D
3723
3724#if MACRO_LEVEL>=1
3725	INNER_STORE_8X8_LIB8
3726#else
3727	CALL(inner_store_8x8_lib8)
3728#endif
3729
3730
3731	EPILOGUE
3732
3733	ret
3734
3735	FUN_END(kernel_sgemm_nn_8x8_lib8)
3736
3737
3738
3739
3740
3741//                               1      2             3         4            5         6        7            8         9         10      11
3742// void kernel_sgemm_nn_8x8_vs_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D, int km, int kn);
3743
3744	.p2align 4,,15
3745	GLOB_FUN_START(kernel_sgemm_nn_8x8_vs_lib8)
3746
3747	PROLOGUE
3748
3749	// zero accumulation registers
3750
3751	ZERO_ACC
3752
3753
3754	// call inner dgemm kernel nn
3755
3756	movq	ARG1, %r10 // k
3757	movq	ARG3, %r11  // A
3758	movq	ARG5, %r12  // B
3759	movq	ARG6, %r13 // sdb
3760	sall	$5, %r13d // 8*sdb*sizeof(float)
3761	movq	ARG4, %r14 // offsetB
3762
3763#if MACRO_LEVEL>=1
3764	INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
3765#else
3766	CALL(inner_edge_gemm_add_nn_8x8_lib8)
3767#endif
3768
3769#if MACRO_LEVEL>=2
3770	INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
3771#else
3772	CALL(inner_kernel_gemm_add_nn_8x8_lib8)
3773#endif
3774
3775
3776	// call inner blend
3777
3778	movq	ARG2, %r10 // alpha
3779	movq	ARG7, %r11 // beta
3780	movq	ARG8, %r12   // C
3781
3782#if MACRO_LEVEL>=1
3783	INNER_SCALE_AB_8X8_LIB8
3784#else
3785	CALL(inner_scale_ab_8x8_lib8)
3786#endif
3787
3788
3789	// store n
3790
3791	movq	ARG9, %r10 // D
3792	movq	ARG10, %r11 // km
3793	movq	ARG11, %r12 // kn
3794
3795#if MACRO_LEVEL>=1
3796	INNER_STORE_8X8_VS_LIB8
3797#else
3798	CALL(inner_store_8x8_vs_lib8)
3799#endif
3800
3801
3802	EPILOGUE
3803
3804	ret
3805
3806	FUN_END(kernel_sgemm_nn_8x8_vs_lib8)
3807
3808
3809
3810
3811
3812//                                   rdi    rsi           rdx       rcx       r8        r9       rsp+8        rsp+16    rsp+24    rsp+32    rsp+40   rsp+48     rsp+56   rsp+64  rsp+72  rsp+80  rsp+88
3813// void kernel_sgemm_nn_8x8_gen_lib8(int k, float *alpha, float *A, int offB, float *B, int sdb, float *beta, int offC, float *C, int sdc, int offD, float *D, int sdd, int m0, int m1, int n0, int n1);
3814
3815	.p2align 4,,15
3816	GLOB_FUN_START(kernel_sgemm_nn_8x8_gen_lib8)
3817
3818	PROLOGUE
3819
3820	// zero accumulation registers
3821
3822	ZERO_ACC
3823
3824
3825	// call inner dgemm kernel nn
3826
3827	movq	ARG1, %r10 // k
3828	movq	ARG3, %r11  // A
3829	movq	ARG5, %r12  // B
3830	movq	ARG6, %r13 // sdb
3831	sall	$5, %r13d // 8*sdb*sizeof(float)
3832	movq	ARG4, %r14 // offsetB
3833
3834#if MACRO_LEVEL>=1
3835	INNER_EDGE_GEMM_ADD_NN_8X8_LIB8
3836#else
3837	CALL(inner_edge_gemm_add_nn_8x8_lib8)
3838#endif
3839
3840#if MACRO_LEVEL>=2
3841	INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
3842#else
3843	CALL(inner_kernel_gemm_add_nn_8x8_lib8)
3844#endif
3845
3846
3847	// call inner blend scale
3848
3849	movq	ARG2, %r10 // alpha
3850	movq	ARG7, %r11 // beta
3851	movq	ARG8, %r12 // offsetC
3852	movq	ARG9, %r13 // C
3853	movq	ARG10, %r14 // sdc
3854	sall	$5, %r14d // 8*sdc*sizeof(float)
3855
3856#if MACRO_LEVEL>=1
3857	INNER_SCALE_AB_8X8_GEN_LIB8
3858#else
3859	CALL(inner_scale_ab_8x8_gen_lib8)
3860#endif
3861
3862
3863	// store n gen
3864
3865	movq	ARG11, %r10 // offsetD
3866	movq	ARG12, %r11 // D
3867	movq	ARG13, %r12 // sdd
3868	sall	$5, %r12d // 8*sdd*sizeof(float)
3869	movq	ARG14, %r13 // m0
3870	movq	ARG15, %r14 // m1
3871	movq	ARG16, %r15 // n0
3872	movq	ARG17, %rax // n1
3873
3874#if MACRO_LEVEL>=1
3875	INNER_STORE_8X8_GEN_LIB8
3876#else
3877	CALL(inner_store_8x8_gen_lib8)
3878#endif
3879
3880
3881	EPILOGUE
3882
3883	ret
3884
3885	FUN_END(kernel_sgemm_nn_8x8_gen_lib8)
3886
3887
3888
3889
3890
3891//                                 rdi    rsi           rdx       rcx       r8           r9        rsp+8
3892// void kernel_ssyrk_nt_l_8x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);
3893
3894	.p2align 4,,15
3895	GLOB_FUN_START(kernel_ssyrk_nt_l_8x8_lib8)
3896
3897	PROLOGUE
3898
3899	// zero accumulation registers
3900
3901	vxorpd	%ymm0, %ymm0, %ymm0
3902	vmovaps	%ymm0, %ymm1
3903	vmovaps	%ymm0, %ymm2
3904	vmovaps	%ymm0, %ymm3
3905	vmovaps	%ymm0, %ymm4
3906	vmovaps	%ymm0, %ymm5
3907	vmovaps	%ymm0, %ymm6
3908	vmovaps	%ymm0, %ymm7
3909
3910
3911	// call inner dgemm kernel nt
3912
3913	movq	ARG1, %r10 // k
3914	movq	ARG3, %r11  // A
3915	movq	ARG4, %r12  // B
3916
3917#if MACRO_LEVEL>=2
3918	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
3919#else
3920	CALL(inner_kernel_gemm_add_nt_8x8_lib8)
3921#endif
3922
3923
3924	// call inner scale
3925
3926	movq	ARG2, %r10 // alpha
3927	movq	ARG5, %r11 // beta
3928	movq	ARG6, %r12   // C
3929
3930#if MACRO_LEVEL>=1
3931	INNER_SCALE_AB_8X8_LIB8
3932#else
3933	CALL(inner_scale_ab_8x8_lib8)
3934#endif
3935
3936
3937	// store n
3938
3939	movq	ARG7, %r10 // D
3940
3941#if MACRO_LEVEL>=1
3942	INNER_STORE_L_8X8_LIB8
3943#else
3944	CALL(inner_store_l_8x8_lib8)
3945#endif
3946
3947
3948	EPILOGUE
3949
3950	ret
3951
3952	FUN_END(kernel_ssyrk_nt_l_8x8_lib8)
3953
3954
3955
3956
3957
3958//                                    1      2             3         4         5            6         7         8       9
3959// void kernel_ssyrk_nt_l_8x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);
3960
3961	.p2align 4,,15
3962	GLOB_FUN_START(kernel_ssyrk_nt_l_8x8_vs_lib8)
3963
3964	PROLOGUE
3965
3966	// zero accumulation registers
3967
3968	vxorpd	%ymm0, %ymm0, %ymm0
3969	vmovaps	%ymm0, %ymm1
3970	vmovaps	%ymm0, %ymm2
3971	vmovaps	%ymm0, %ymm3
3972	vmovaps	%ymm0, %ymm4
3973	vmovaps	%ymm0, %ymm5
3974	vmovaps	%ymm0, %ymm6
3975	vmovaps	%ymm0, %ymm7
3976
3977
3978	// call inner dgemm kernel nt
3979
3980	movq	ARG1, %r10 // k
3981	movq	ARG3, %r11  // A
3982	movq	ARG4, %r12  // B
3983
3984#if MACRO_LEVEL>=2
3985	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
3986#else
3987	CALL(inner_kernel_gemm_add_nt_8x8_lib8)
3988#endif
3989
3990
3991	// call inner scale
3992
3993	movq	ARG2, %r10 // alpha
3994	movq	ARG5, %r11 // beta
3995	movq	ARG6, %r12   // C
3996
3997#if MACRO_LEVEL>=1
3998	INNER_SCALE_AB_8X8_LIB8
3999#else
4000	CALL(inner_scale_ab_8x8_lib8)
4001#endif
4002
4003
4004	// store n
4005
4006	movq	ARG7, %r10 // D
4007	movq	ARG8, %r11 // km
4008	movq	ARG9, %r12 // kn
4009
4010#if MACRO_LEVEL>=1
4011	INNER_STORE_L_8X8_VS_LIB8
4012#else
4013	CALL(inner_store_l_8x8_vs_lib8)
4014#endif
4015
4016
4017	EPILOGUE
4018
4019	ret
4020
4021	FUN_END(kernel_ssyrk_nt_l_8x8_vs_lib8)
4022
4023
4024
4025
4026
4027//                                      edi    rsi       rdx       ecx       r8        r9        rsp+8
4028// void kernel_strsm_nt_rl_inv_8x8_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E);
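//
// as the inner routine names suggest, this kernel forms C - A*B^T and then
// solves the result from the right against the transposed lower-triangular
// factor E (D * E^T = C - A*B^T), using the reciprocal diagonal entries
// passed in inv_diag_E instead of divisions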
4029
4030	.p2align 4,,15
4031	GLOB_FUN_START(kernel_strsm_nt_rl_inv_8x8_lib8)
4032
4033	PROLOGUE
4034
4035	// zero accumulation registers
4036
4037	vxorpd	%ymm0, %ymm0, %ymm0
4038	vmovaps	%ymm0, %ymm1
4039	vmovaps	%ymm0, %ymm2
4040	vmovaps	%ymm0, %ymm3
4041	vmovaps	%ymm0, %ymm4
4042	vmovaps	%ymm0, %ymm5
4043	vmovaps	%ymm0, %ymm6
4044	vmovaps	%ymm0, %ymm7
4045
4046
4047	// call inner dgemm kernel nt
4048
4049	movq	ARG1, %r10
4050	movq	ARG2, %r11
4051	movq	ARG3, %r12
4052
4053#if MACRO_LEVEL>=2
4054	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
4055#else
4056	CALL(inner_kernel_gemm_sub_nt_8x8_lib8)
4057#endif
4058
4059
4060	// call inner blender_loader nn
4061
4062	movq	ARG4, %r10
4063
4064#if MACRO_LEVEL>=1
4065	INNER_SCALE_11_8X8_LIB8
4066#else
4067	CALL(inner_scale_11_8x8_lib8)
4068#endif
4069
4070
4071	// solve
4072
4073	movq	ARG6, %r10  // E
4074	movq	ARG7, %r11  // inv_diag_E
4075	movq	$8, %r12 // n1
4076
4077#if MACRO_LEVEL>=1
4078	INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
4079#else
4080	CALL(inner_edge_trsm_rlt_inv_8x8_vs_lib8)
4081#endif
4082
4083
4084	// store
4085
4086	movq	ARG5, %r10 // D
4087
4088#if MACRO_LEVEL>=1
4089	INNER_STORE_8X8_LIB8
4090#else
4091	CALL(inner_store_8x8_lib8)
4092#endif
4093
4094
4095	EPILOGUE
4096
4097	ret
4098
4099	FUN_END(kernel_strsm_nt_rl_inv_8x8_lib8)
4100
4101
4102
4103
4104
4105//                                         edi    rsi       rdx       ecx       r8        r9        rsp+8               rsp+16  rsp+24
4106// void kernel_strsm_nt_rl_inv_8x8_vs_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
4107
4108	.p2align 4,,15
4109	GLOB_FUN_START(kernel_strsm_nt_rl_inv_8x8_vs_lib8)
4110
4111	PROLOGUE
4112
4113	// zero accumulation registers
4114
4115	vxorpd	%ymm0, %ymm0, %ymm0
4116	vmovaps	%ymm0, %ymm1
4117	vmovaps	%ymm0, %ymm2
4118	vmovaps	%ymm0, %ymm3
4119	vmovaps	%ymm0, %ymm4
4120	vmovaps	%ymm0, %ymm5
4121	vmovaps	%ymm0, %ymm6
4122	vmovaps	%ymm0, %ymm7
4123
4124
4125	// call inner dgemm kernel nt
4126
4127	movq	ARG1, %r10
4128	movq	ARG2, %r11
4129	movq	ARG3, %r12
4130
4131#if MACRO_LEVEL>=2
4132	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
4133#else
4134	CALL(inner_kernel_gemm_sub_nt_8x8_lib8)
4135#endif
4136
4137
4138	// call inner blender_loader nn // TODO scale gen
4139
4140	movq	ARG4, %r10 // C
4141
4142#if MACRO_LEVEL>=1
4143	INNER_SCALE_11_8X8_LIB8
4144#else
4145	CALL(inner_scale_11_8x8_lib8)
4146#endif
4147
4148
4149	// solve
4150
4151	movq	ARG6, %r10  // E
4152	movq	ARG7, %r11  // inv_diag_E
4153	movq	ARG9, %r12 // kn
4154
4155#if MACRO_LEVEL>=1
4156	INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
4157#else
4158	CALL(inner_edge_trsm_rlt_inv_8x8_vs_lib8)
4159#endif
4160
4161
4162	// store
4163
4164	movq	ARG5, %r10 // D
4165	movq	ARG8, %r11 // m1
4166	movq	ARG9, %r12 // n1
4167
4168#if MACRO_LEVEL>=1
4169	INNER_STORE_8X8_VS_LIB8
4170#else
4171	CALL(inner_store_8x8_vs_lib8)
4172#endif
4173
4174
4175	EPILOGUE
4176
4177	ret
4178
4179	FUN_END(kernel_strsm_nt_rl_inv_8x8_vs_lib8)
4180
4181
4182
4183
4184
4185//                                            1       2          3          4       5          6          7         8         9         10
4186// void kernel_sgemm_strsm_nt_rl_inv_8x8_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E);
4187
4188	.p2align 4,,15
4189	GLOB_FUN_START(kernel_sgemm_strsm_nt_rl_inv_8x8_lib8)
4190
4191	PROLOGUE
4192
4193	// zero accumulation registers
4194
4195	vxorpd	%ymm0, %ymm0, %ymm0
4196	vmovaps	%ymm0, %ymm1
4197	vmovaps	%ymm0, %ymm2
4198	vmovaps	%ymm0, %ymm3
4199	vmovaps	%ymm0, %ymm4
4200	vmovaps	%ymm0, %ymm5
4201	vmovaps	%ymm0, %ymm6
4202	vmovaps	%ymm0, %ymm7
4203
4204
4205	// call inner dgemm kernel nt add
4206
4207	movq	ARG1, %r10 // kp
4208	movq	ARG2, %r11  // Ap
4209	movq	ARG3, %r12  // Bp
4210
4211#if MACRO_LEVEL>=2
4212	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
4213#else
4214	CALL(inner_kernel_gemm_add_nt_8x8_lib8)
4215#endif
4216
4217
4218	// call inner dgemm kernel nt sub
4219
4220	movq	ARG4, %r10 // km
4221	movq	ARG5, %r11   // Am
4222	movq	ARG6, %r12   // Bm
4223
4224#if MACRO_LEVEL>=2
4225	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
4226#else
4227	CALL(inner_kernel_gemm_sub_nt_8x8_lib8)
4228#endif
4229
4230
4231	// call inner blender_loader nn
4232
4233	movq	ARG7, %r10   // C
4234
4235#if MACRO_LEVEL>=1
4236	INNER_SCALE_11_8X8_LIB8
4237#else
4238	CALL(inner_scale_11_8x8_lib8)
4239#endif
4240
4241
4242	// solve
4243
4244	movq	ARG9, %r10  // E
4245	movq	ARG10, %r11  // inv_diag_E
4246	movq	$8, %r12 // n1
4247
4248#if MACRO_LEVEL>=1
4249	INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
4250#else
4251	CALL(inner_edge_trsm_rlt_inv_8x8_vs_lib8)
4252#endif
4253
4254
4255	// store
4256
4257	movq	ARG8, %r10   // D
4258
4259#if MACRO_LEVEL>=1
4260	INNER_STORE_8X8_LIB8
4261#else
4262	CALL(inner_store_8x8_lib8)
4263#endif
4264
4265
4266	EPILOGUE
4267
4268	ret
4269
4270	FUN_END(kernel_sgemm_strsm_nt_rl_inv_8x8_lib8)
4271
4272
4273
4274
4275
4276//                                               1       2          3          4       5          6          7         8         9         10                 11      12
4277// void kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E, int km, int kn);
4278
4279	.p2align 4,,15
4280	GLOB_FUN_START(kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8)
4281
4282	PROLOGUE
4283
4284	// zero accumulation registers
4285
4286	vxorpd	%ymm0, %ymm0, %ymm0
4287	vmovaps	%ymm0, %ymm1
4288	vmovaps	%ymm0, %ymm2
4289	vmovaps	%ymm0, %ymm3
4290	vmovaps	%ymm0, %ymm4
4291	vmovaps	%ymm0, %ymm5
4292	vmovaps	%ymm0, %ymm6
4293	vmovaps	%ymm0, %ymm7
4294
4295
4296	// call inner dgemm kernel nt add
4297
4298	movq	ARG1, %r10 // kp
4299	movq	ARG2, %r11  // Ap
4300	movq	ARG3, %r12  // Bp
4301
4302#if MACRO_LEVEL>=2
4303	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
4304#else
4305	CALL(inner_kernel_gemm_add_nt_8x8_lib8)
4306#endif
4307
4308
4309	// call inner dgemm kernel nt sub
4310
4311	movq	ARG4, %r10 // km
4312	movq	ARG5, %r11   // Am
4313	movq	ARG6, %r12   // Bm
4314
4315#if MACRO_LEVEL>=2
4316	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
4317#else
4318	CALL(inner_kernel_gemm_sub_nt_8x8_lib8)
4319#endif
4320
4321
4322	// call inner blender_loader nn
4323
4324	movq	ARG7, %r10  // C
4325
4326#if MACRO_LEVEL>=1
4327	INNER_SCALE_11_8X8_LIB8
4328#else
4329	CALL(inner_scale_11_8x8_lib8)
4330#endif
4331
4332
4333	// solve
4334
4335	movq	ARG9, %r10  // E
4336	movq	ARG10, %r11  // inv_diag_E
4337	movq	ARG12, %r12 // kn
4338
4339#if MACRO_LEVEL>=1
4340	INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8
4341#else
4342	CALL(inner_edge_trsm_rlt_inv_8x8_vs_lib8)
4343#endif
4344
4345
4346	// store
4347
4348	movq	ARG8, %r10 // D
4349	movq	ARG11, %r11 // km
4350	movq	ARG12, %r12 // kn
4351
4352#if MACRO_LEVEL>=1
4353	INNER_STORE_8X8_VS_LIB8
4354#else
4355	CALL(inner_store_8x8_vs_lib8)
4356#endif
4357
4358
4359	EPILOGUE
4360
4361	ret
4362
4363	FUN_END(kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8)
4364
4365
4366
4367
4368
4369//                                  edi    rsi       rdx       rcx       r8        r9
4370// void kernel_spotrf_nt_l_8x8_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D);
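//
// as the inner routine names suggest, this kernel forms C - A*B^T, computes
// its lower Cholesky factor, stores the factor into D (lower triangle only,
// via inner_store_l) and writes the reciprocals of its diagonal to inv_diag_D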
4371
4372	.p2align 4,,15
4373	GLOB_FUN_START(kernel_spotrf_nt_l_8x8_lib8)
4374
4375	PROLOGUE
4376
4377	// zero accumulation registers
4378
4379	vxorpd	%ymm0, %ymm0, %ymm0
4380	vmovaps	%ymm0, %ymm1
4381	vmovaps	%ymm0, %ymm2
4382	vmovaps	%ymm0, %ymm3
4383	vmovaps	%ymm0, %ymm4
4384	vmovaps	%ymm0, %ymm5
4385	vmovaps	%ymm0, %ymm6
4386	vmovaps	%ymm0, %ymm7
4387
4388
4389	// call inner dgemm kernel nt
4390
4391	movq	ARG1, %r10
4392	movq	ARG2, %r11
4393	movq	ARG3, %r12
4394
4395#if MACRO_LEVEL>=2
4396	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
4397#else
4398	CALL(inner_kernel_gemm_sub_nt_8x8_lib8)
4399#endif
4400
4401
4402	// call inner blender_loader nn
4403
4404	movq	ARG4, %r10 // C
4405
4406#if MACRO_LEVEL>=1
4407	INNER_SCALE_11_8X8_LIB8
4408#else
4409	CALL(inner_scale_11_8x8_lib8)
4410#endif
4411
4412
4413	// factorization
4414
4415	movq	ARG6, %r10  // inv_diag_D
4416	movl	$8, %r11d // n1
4417
4418#if MACRO_LEVEL>=1
4419	INNER_EDGE_POTRF_8X8_VS_LIB8
4420#else
4421	CALL(inner_edge_potrf_8x8_vs_lib8)
4422#endif
4423
4424
4425	// store
4426
4427	movq	ARG5, %r10 // D
4428
4429#if MACRO_LEVEL>=1
4430	INNER_STORE_L_8X8_LIB8
4431#else
4432	CALL(inner_store_l_8x8_lib8)
4433#endif
4434
4435
4436	EPILOGUE
4437
4438	ret
4439
4440	FUN_END(kernel_spotrf_nt_l_8x8_lib8)
4441
4442
4443
4444
4445
4446//                                     edi    rsi       rdx       rcx       r8        r9                  rsp+8   rsp+16
4447// void kernel_spotrf_nt_l_8x8_vs_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D, int km, int kn);
4448
4449	.p2align 4,,15
4450	GLOB_FUN_START(kernel_spotrf_nt_l_8x8_vs_lib8)
4451
4452	PROLOGUE
4453
4454	// zero accumulation registers
4455
4456	vxorpd	%ymm0, %ymm0, %ymm0
4457	vmovaps	%ymm0, %ymm1
4458	vmovaps	%ymm0, %ymm2
4459	vmovaps	%ymm0, %ymm3
4460	vmovaps	%ymm0, %ymm4
4461	vmovaps	%ymm0, %ymm5
4462	vmovaps	%ymm0, %ymm6
4463	vmovaps	%ymm0, %ymm7
4464
4465
4466	// call inner dgemm kernel nt
4467
4468	movq	ARG1, %r10
4469	movq	ARG2, %r11
4470	movq	ARG3, %r12
4471
4472#if MACRO_LEVEL>=2
4473	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
4474#else
4475	CALL(inner_kernel_gemm_sub_nt_8x8_lib8)
4476#endif
4477
4478
4479	// call inner blender_loader nn
4480
4481	movq	ARG4, %r10 // C
4482
4483#if MACRO_LEVEL>=1
4484	INNER_SCALE_11_8X8_LIB8
4485#else
4486	CALL(inner_scale_11_8x8_lib8)
4487#endif
4488
4489
4490	// factorization
4491
4492	movq	ARG6, %r10  // inv_diag_D
4493	movq	ARG8, %r11 // kn
4494
4495#if MACRO_LEVEL>=1
4496	INNER_EDGE_POTRF_8X8_VS_LIB8
4497#else
4498	CALL(inner_edge_potrf_8x8_vs_lib8)
4499#endif
4500
4501
4502	// store
4503
4504	movq	ARG5, %r10 // D
4505	movq	ARG7, %r11 // m1
4506	movq	ARG8, %r12 // n1
4507
4508#if MACRO_LEVEL>=1
4509	INNER_STORE_L_8X8_VS_LIB8
4510#else
4511	CALL(inner_store_l_8x8_vs_lib8)
4512#endif
4513
4514
4515	EPILOGUE
4516
4517	ret
4518
4519	FUN_END(kernel_spotrf_nt_l_8x8_vs_lib8)
4520
4521
4522
4523
4524
4525//                                        1       2          3          4       5          6          7         8         9
4526// void kernel_ssyrk_spotrf_nt_l_8x8_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D);
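//
// fused syrk + Cholesky: adds Ap*Bp^T over kp iterations, subtracts Am*Bm^T
// over km iterations, adds C, and then factorizes the result exactly as in
// kernel_spotrf_nt_l_8x8_lib8 above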
4527
4528	.p2align 4,,15
4529	GLOB_FUN_START(kernel_ssyrk_spotrf_nt_l_8x8_lib8)
4530
4531	PROLOGUE
4532
4533	// zero accumulation registers
4534
4535	ZERO_ACC
4536
4537
4538	// call inner dgemm kernel nt add
4539
4540	movq	ARG1, %r10 // kp
4541	movq	ARG2, %r11  // Ap
4542	movq	ARG3, %r12  // Bp
4543
4544#if MACRO_LEVEL>=2
4545	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
4546#else
4547	CALL(inner_kernel_gemm_add_nt_8x8_lib8)
4548#endif
4549
4550
4551	// call inner dgemm kernel nt sub
4552
4553	movq	ARG4, %r10 // km
4554	movq	ARG5, %r11   // Am
4555	movq	ARG6, %r12   // Bm
4556
4557#if MACRO_LEVEL>=2
4558	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
4559#else
4560	CALL(inner_kernel_gemm_sub_nt_8x8_lib8)
4561#endif
4562
4563
4564	// call inner blender_loader nn
4565
4566	movq	ARG7, %r10   // C
4567
4568#if MACRO_LEVEL>=1
4569	INNER_SCALE_11_8X8_LIB8
4570#else
4571	CALL(inner_scale_11_8x8_lib8)
4572#endif
4573
4574
4575	// factorization
4576
4577	movq	ARG9, %r10  // inv_diag_D
4578	movl	$8, %r11d
4579
4580#if MACRO_LEVEL>=1
4581	INNER_EDGE_POTRF_8X8_VS_LIB8
4582#else
4583	CALL(inner_edge_potrf_8x8_vs_lib8)
4584#endif
4585
4586
4587	// store
4588
4589	movq	ARG8, %r10  // D
4590
4591#if MACRO_LEVEL>=1
4592	INNER_STORE_L_8X8_LIB8
4593#else
4594	CALL(inner_store_l_8x8_lib8)
4595#endif
4596
4597
4598	EPILOGUE
4599
4600	ret
4601
4602	FUN_END(kernel_ssyrk_spotrf_nt_l_8x8_lib8)
4603
4604
4605
4606
4607
4608//                                           1       2          3          4       5          6          7         8         9                  10      11
4609// void kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D, int km, int kn);
4610
4611	.p2align 4,,15
4612	GLOB_FUN_START(kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8)
4613
4614	PROLOGUE
4615
4616	// zero accumulation registers
4617
4618	vxorpd	%ymm0, %ymm0, %ymm0
4619	vmovaps	%ymm0, %ymm1
4620	vmovaps	%ymm0, %ymm2
4621	vmovaps	%ymm0, %ymm3
4622	vmovaps	%ymm0, %ymm4
4623	vmovaps	%ymm0, %ymm5
4624	vmovaps	%ymm0, %ymm6
4625	vmovaps	%ymm0, %ymm7
4626
4627
4628	// call inner dgemm kernel nt add
4629
4630	movq	ARG1, %r10 // kp
4631	movq	ARG2, %r11  // Ap
4632	movq	ARG3, %r12  // Bp
4633
4634#if MACRO_LEVEL>=2
4635	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
4636#else
4637	CALL(inner_kernel_gemm_add_nt_8x8_lib8)
4638#endif
4639
4640
4641	// call inner dgemm kernel nt sub
4642
4643	movq	ARG4, %r10 // km
4644	movq	ARG5, %r11   // Am
4645	movq	ARG6, %r12   // Bm
4646
4647#if MACRO_LEVEL>=2
4648	INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
4649#else
4650	CALL(inner_kernel_gemm_sub_nt_8x8_lib8)
4651#endif
4652
4653
4654	// call inner blender_loader nn
4655
4656	movq	ARG7, %r10   // C
4657
4658#if MACRO_LEVEL>=1
4659	INNER_SCALE_11_8X8_LIB8
4660#else
4661	CALL(inner_scale_11_8x8_lib8)
4662#endif
4663
4664
4665	// factorization
4666
4667	movq	ARG9, %r10  // inv_diag_D
4668	movq	ARG11, %r11 // kn
4669
4670#if MACRO_LEVEL>=1
4671	INNER_EDGE_POTRF_8X8_VS_LIB8
4672#else
4673	CALL(inner_edge_potrf_8x8_vs_lib8)
4674#endif
4675
4676
4677	// store
4678
4679	movq	ARG8, %r10  // D
4680	movq	ARG10, %r11 // km
4681	movq	ARG11, %r12 // kn
4682
4683#if MACRO_LEVEL>=1
4684	INNER_STORE_L_8X8_VS_LIB8
4685#else
4686	CALL(inner_store_l_8x8_vs_lib8)
4687#endif
4688
4689
4690	EPILOGUE
4691
4692	ret
4693
4694	FUN_END(kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8)
4695
4696
4697
4698
4699
4700#if defined(BLAS_API)
4701
4702#include "kernel_sgemm_8x8_lib.S"
4703
4704#endif
4705
4706
4707
4708
4709
4710	// read-only data
4711#if defined(OS_LINUX)
4712	.section	.rodata.cst32,"aM",@progbits,32
4713#elif defined(OS_MAC)
4714	.section	__TEXT,__const
4715#elif defined(OS_WINDOWS)
4716	.section .rdata,"dr"
4717#endif
4718
4719#if defined(OS_LINUX) | defined(OS_WINDOWS)
4720	.align 32
4721.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
4722#elif defined(OS_MAC)
4723	.align 5
4724LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
4725#endif
4726	.long	1056964608
4727	.long	1069547520
4728	.long	1075838976
4729	.long	1080033280
4730	.long	1083179008
4731	.long	1085276160
4732	.long	1087373312
4733	.long	1089470464
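// the .long values above are the IEEE-754 single-precision bit patterns of
// 0.5, 1.5, ..., 7.5 (listed low lane first); the store routines subtract a
// broadcast row index/count from them and use the resulting sign bits as a
// per-lane store mask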
4734
4735#if defined(OS_LINUX) | defined(OS_WINDOWS)
4736	.align 32
4737.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
4738#elif defined(OS_MAC)
4739	.align 5
4740LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 }
4741#endif
4742	.long	1091043328
4743	.long	1092091904
4744	.long	1093140480
4745	.long	1094189056
4746	.long	1095237632
4747	.long	1096286208
4748	.long	1097334784
4749	.long	1098383360
4750
4751#if defined(OS_LINUX) | defined(OS_WINDOWS)
4752	.align 32
4753.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
4754#elif defined(OS_MAC)
4755	.align 5
4756LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 }
4757#endif
4758	.long	1099169792
4759	.long	1099694080
4760	.long	1100218368
4761	.long	1100742656
4762	.long	1101266944
4763	.long	1101791232
4764	.long	1102315520
4765	.long	1102839808
4766
4767#if defined(OS_LINUX) | defined(OS_WINDOWS)
4768	.align 32
4769.LC03: // { 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
4770#elif defined(OS_MAC)
4771	.align 5
4772LC03: // { 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
4773#endif
4774	.long	1065353216
4775	.long	1065353216
4776	.long	1065353216
4777	.long	1065353216
4778	.long	1065353216
4779	.long	1065353216
4780	.long	1065353216
4781	.long	1065353216
4782
4783#if defined(OS_LINUX) | defined(OS_WINDOWS)
4784	.align 32
4785.LC09: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
4786#elif defined(OS_MAC)
4787	.align 5
4788LC09: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 }
4789#endif
4790	.long	1065353216
4791	.long	1065353216
4792	.long	1065353216
4793	.long	1065353216
4794	.long	1065353216
4795	.long	1065353216
4796	.long	3212836864
4797	.long	3212836864
4798
4799
4800
4801#if defined(OS_LINUX)
4802	.section	.note.GNU-stack,"",@progbits
4803#elif defined(OS_MAC)
4804	.subsections_via_symbols
4805#endif
4806
4807