1/**************************************************************************************************
2*                                                                                                 *
3* This file is part of BLASFEO.                                                                   *
4*                                                                                                 *
5* BLASFEO -- BLAS For Embedded Optimization.                                                      *
6* Copyright (C) 2019 by Gianluca Frison.                                                          *
7* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
8* All rights reserved.                                                                            *
9*                                                                                                 *
10* The 2-Clause BSD License                                                                        *
11*                                                                                                 *
12* Redistribution and use in source and binary forms, with or without                              *
13* modification, are permitted provided that the following conditions are met:                     *
14*                                                                                                 *
15* 1. Redistributions of source code must retain the above copyright notice, this                  *
16*    list of conditions and the following disclaimer.                                             *
17* 2. Redistributions in binary form must reproduce the above copyright notice,                    *
18*    this list of conditions and the following disclaimer in the documentation                    *
19*    and/or other materials provided with the distribution.                                       *
20*                                                                                                 *
21* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND                 *
22* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED                   *
23* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE                          *
24* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR                 *
25* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES                  *
26* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;                    *
27* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND                     *
28* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT                      *
29* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS                   *
30* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                                    *
31*                                                                                                 *
32* Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de                             *
33*                                                                                                 *
34**************************************************************************************************/
35
36#if defined(OS_LINUX) | defined(OS_MAC)
37
38//#define STACKSIZE 96
39#define STACKSIZE 64
40#define ARG1  %rdi
41#define ARG2  %rsi
42#define ARG3  %rdx
43#define ARG4  %rcx
44#define ARG5  %r8
45#define ARG6  %r9
46#define ARG7  STACKSIZE +  8(%rsp)
47#define ARG8  STACKSIZE + 16(%rsp)
48#define ARG9  STACKSIZE + 24(%rsp)
49#define ARG10 STACKSIZE + 32(%rsp)
50#define ARG11 STACKSIZE + 40(%rsp)
51#define ARG12 STACKSIZE + 48(%rsp)
52#define ARG13 STACKSIZE + 56(%rsp)
53#define ARG14 STACKSIZE + 64(%rsp)
54#define ARG15 STACKSIZE + 72(%rsp)
55#define ARG16 STACKSIZE + 80(%rsp)
56#define ARG17 STACKSIZE + 88(%rsp)
57#define ARG18 STACKSIZE + 96(%rsp)
58#define PROLOGUE \
59	subq	$STACKSIZE, %rsp; \
60	movq	%rbx,   (%rsp); \
61	movq	%rbp,  8(%rsp); \
62	movq	%r12, 16(%rsp); \
63	movq	%r13, 24(%rsp); \
64	movq	%r14, 32(%rsp); \
65	movq	%r15, 40(%rsp); \
66	vzeroupper;
67#define EPILOGUE \
68	vzeroupper; \
69	movq	  (%rsp), %rbx; \
70	movq	 8(%rsp), %rbp; \
71	movq	16(%rsp), %r12; \
72	movq	24(%rsp), %r13; \
73	movq	32(%rsp), %r14; \
74	movq	40(%rsp), %r15; \
75	addq	$STACKSIZE, %rsp;
76
77#if defined(OS_LINUX)
78
79#define GLOB_FUN_START(NAME) \
80	.globl NAME; \
81	.type NAME, @function; \
82NAME:
83#define FUN_START(NAME) \
84	.type NAME, @function; \
85NAME:
86#define FUN_END(NAME) \
87	.size	NAME, .-NAME
88#define CALL(NAME) \
89	call NAME
90//#define ZERO_ACC \
91//	vxorpd	%ymm0, %ymm0, %ymm0; \
92//	vmovapd	%ymm0, %ymm1; \
93//	vmovapd	%ymm0, %ymm2; \
94//	vmovapd	%ymm0, %ymm3
95//#define NEG_ACC \
96//	vmovapd		.LC11(%rip), %ymm15; \
97//	vxorpd		%ymm15, %ymm0, %ymm0; \
98//	vxorpd		%ymm15, %ymm1, %ymm1; \
99//	vxorpd		%ymm15, %ymm2, %ymm2; \
100//	vxorpd		%ymm15, %ymm3, %ymm3
101
102#else // defined(OS_MAC)
103
104#define GLOB_FUN_START(NAME) \
105	.globl _ ## NAME; \
106_ ## NAME:
107#define FUN_START(NAME) \
108_ ## NAME:
109#define FUN_END(NAME)
110#define CALL(NAME) \
111	callq _ ## NAME
112//#define ZERO_ACC \
113//	vxorpd	%ymm0, %ymm0, %ymm0; \
114//	vmovapd	%ymm0, %ymm1; \
115//	vmovapd	%ymm0, %ymm2; \
116//	vmovapd	%ymm0, %ymm3
117//#define NEG_ACC \
118//	vmovapd		LC11(%rip), %ymm15; \
119//	vxorpd		%ymm15, %ymm0, %ymm0; \
120//	vxorpd		%ymm15, %ymm1, %ymm1; \
121//	vxorpd		%ymm15, %ymm2, %ymm2; \
122//	vxorpd		%ymm15, %ymm3, %ymm3
123
124#endif
125
126#elif defined(OS_WINDOWS)
127
128#define STACKSIZE 256
129#define ARG1  %rcx
130#define ARG2  %rdx
131#define ARG3  %r8
132#define ARG4  %r9
133#define ARG5  STACKSIZE + 40(%rsp)
134#define ARG6  STACKSIZE + 48(%rsp)
135#define ARG7  STACKSIZE + 56(%rsp)
136#define ARG8  STACKSIZE + 64(%rsp)
137#define ARG9  STACKSIZE + 72(%rsp)
138#define ARG10 STACKSIZE + 80(%rsp)
139#define ARG11 STACKSIZE + 88(%rsp)
140#define ARG12 STACKSIZE + 96(%rsp)
141#define ARG13 STACKSIZE + 104(%rsp)
142#define ARG14 STACKSIZE + 112(%rsp)
143#define ARG15 STACKSIZE + 120(%rsp)
144#define ARG16 STACKSIZE + 128(%rsp)
145#define ARG17 STACKSIZE + 136(%rsp)
146#define ARG18 STACKSIZE + 144(%rsp)
147#define PROLOGUE \
148	subq	$STACKSIZE, %rsp; \
149	movq	%rbx,   (%rsp); \
150	movq	%rbp,  8(%rsp); \
151	movq	%r12, 16(%rsp); \
152	movq	%r13, 24(%rsp); \
153	movq	%r14, 32(%rsp); \
154	movq	%r15, 40(%rsp); \
155	movq	%rdi, 48(%rsp); \
156	movq	%rsi, 56(%rsp); \
157	vmovups	%xmm6, 64(%rsp); \
158	vmovups	%xmm7, 80(%rsp); \
159	vmovups	%xmm8, 96(%rsp); \
160	vmovups	%xmm9, 112(%rsp); \
161	vmovups	%xmm10, 128(%rsp); \
162	vmovups	%xmm11, 144(%rsp); \
163	vmovups	%xmm12, 160(%rsp); \
164	vmovups	%xmm13, 176(%rsp); \
165	vmovups	%xmm14, 192(%rsp); \
166	vmovups	%xmm15, 208(%rsp); \
167	vzeroupper;
168#define EPILOGUE \
169	vzeroupper; \
170	movq	  (%rsp), %rbx; \
171	movq	 8(%rsp), %rbp; \
172	movq	16(%rsp), %r12; \
173	movq	24(%rsp), %r13; \
174	movq	32(%rsp), %r14; \
175	movq	40(%rsp), %r15; \
176	movq	48(%rsp), %rdi; \
177	movq	56(%rsp), %rsi; \
178	vmovups	64(%rsp), %xmm6; \
179	vmovups	80(%rsp), %xmm7; \
180	vmovups	96(%rsp), %xmm8; \
181	vmovups	112(%rsp), %xmm9; \
182	vmovups	128(%rsp), %xmm10; \
183	vmovups	144(%rsp), %xmm11; \
184	vmovups	160(%rsp), %xmm12; \
185	vmovups	176(%rsp), %xmm13; \
186	vmovups	192(%rsp), %xmm14; \
187	vmovups	208(%rsp), %xmm15; \
188	addq	$STACKSIZE, %rsp;
189
190#define GLOB_FUN_START(NAME) \
191	.globl NAME; \
192	.def NAME; .scl 2; .type 32; .endef; \
193NAME:
194#define FUN_START(NAME) \
195	.def NAME; .scl 2; .type 32; .endef; \
196NAME:
197#define FUN_END(NAME)
198#define CALL(NAME) \
199	call NAME
200//#define ZERO_ACC \
201//	vxorpd	%ymm0, %ymm0, %ymm0; \
202//	vmovapd	%ymm0, %ymm1; \
203//	vmovapd	%ymm0, %ymm2; \
204//	vmovapd	%ymm0, %ymm3
205//#define NEG_ACC \
206//	vmovapd		.LC11(%rip), %ymm15; \
207//	vxorpd		%ymm15, %ymm0, %ymm0; \
208//	vxorpd		%ymm15, %ymm1, %ymm1; \
209//	vxorpd		%ymm15, %ymm2, %ymm2; \
210//	vxorpd		%ymm15, %ymm3, %ymm3
211
212#else
213
214#error wrong OS
215
216#endif
217
218
219
220#if defined(OS_LINUX) | defined(OS_WINDOWS)
221	.text
222#elif defined(OS_MAC)
223	.section	__TEXT,__text,regular,pure_instructions
224#endif
225
226
227
228
229
230// common inner routine with file scope
231//
232// input arguments:
233// r10d  <- k
234// r11   <- A
235// r12   <- x
236// ymm0  <- [z0 z1 z2 z3]_a
237// ymm1  <- [z0 z1 z2 z3]_b
238// ymm2  <- [z0 z1 z2 z3]_c
239// ymm3  <- [z0 z1 z2 z3]_d
240// ymm8  <- dirty
241// ymm9  <- dirty
242// ymm10 <- dirty
243// ymm11 <- dirty
244// ymm12 <- dirty
245// ymm13 <- dirty
246// ymm14 <- dirty
247// ymm15 <- dirty
248
249//
250// output arguments:
251// r10d  <- 0
// r11   <- A+8*k*sizeof(float)
// r12   <- x+k*sizeof(float)
254// ymm0  <- [z0 z1 z2 z3]_a
255// ymm1  <- [z0 z1 z2 z3]_b
256// ymm2  <- [z0 z1 z2 z3]_c
257// ymm3  <- [z0 z1 z2 z3]_d
258// ymm8  <- dirty
259// ymm9  <- dirty
260// ymm10 <- dirty
261// ymm11 <- dirty
262// ymm12 <- dirty
263// ymm13 <- dirty
264// ymm14 <- dirty
265// ymm15 <- dirty
266
267#if MACRO_LEVEL>=2
268	.macro INNER_KERNEL_GEMV_ADD_N_8_LIB8
269#else
270	.p2align 4,,15
271	FUN_START(inner_kernel_gemv_add_n_8_lib8)
272#endif
273
274	cmpl	$0, %r10d
275	jle		2f // return
276
277	cmpl	$4, %r10d
278	jl		0f // clean-up loop
279
280	// main loop
281	.p2align 3
2821: // main loop
283
284	vmovaps			0(%r11), %ymm8
285	vbroadcastss	0(%r12), %ymm12
286	vmulps			%ymm8, %ymm12, %ymm15
287	vaddps			%ymm0, %ymm15, %ymm0
288
289	subl	$4, %r10d
290
291	vmovaps			32(%r11), %ymm8
292	vbroadcastss	4(%r12), %ymm12
293	vmulps			%ymm8, %ymm12, %ymm15
294	vaddps			%ymm1, %ymm15, %ymm1
295
296	vmovaps			64(%r11), %ymm8
297	vbroadcastss	8(%r12), %ymm12
298	vmulps			%ymm8, %ymm12, %ymm15
299	vaddps			%ymm2, %ymm15, %ymm2
300
301	vmovaps			96(%r11), %ymm8
302	vbroadcastss	12(%r12), %ymm12
303	vmulps			%ymm8, %ymm12, %ymm15
304	vaddps			%ymm3, %ymm15, %ymm3
305
306	addq	$128, %r11
307	addq	$16, %r12
308
309	cmpl	$3, %r10d
310
311	jg		1b // main loop
312
313
314	// consider clean-up
315	cmpl	$0, %r10d
316	jle		2f // return
317
3180: // clean-up
319
320	vmovaps			0(%r11), %ymm8
321	vbroadcastss	0(%r12), %ymm12
322	vmulps			%ymm8, %ymm12, %ymm15
323	vaddps			%ymm0, %ymm15, %ymm0
324
325	addq	$32, %r11
326	addq	$4, %r12
327
328	subl	$1, %r10d
329	cmpl	$0, %r10d
330
331	jg		0b // clean
332
3332: // return
334
335#if MACRO_LEVEL>=2
336	.endm
337#else
338	ret
339
340	FUN_END(inner_kernel_gemv_add_n_8_lib8)
341#endif
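
// A scalar C sketch (kept out of the build with #if 0) of what the routine
// above accumulates.  The helper name and the layout are assumptions: BLASFEO
// panel-major storage with bs = 8, i.e. element (i, j) of the 8-row panel at
// A[j*8 + i].  The asm spreads the sum over four accumulators (ymm0..ymm3);
// a single accumulator per row gives the same result.
#if 0
static void inner_gemv_add_n_8_ref(int k, const float *A, const float *x, float z[8])
	{
	for (int j = 0; j < k; j++)      // one panel column per x entry
		for (int i = 0; i < 8; i++)  // the 8 rows held in one ymm register
			z[i] += A[j*8 + i] * x[j];
	}
#endif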
342
343
344
345
346
347// common inner routine with file scope
348//
349// input arguments:
350// r10d  <- k
351// r11   <- A
// r12   <- bs*sda*sizeof(float) = 32*sda
// r13   <- x
// ymm0  <- [z0a z0b z0c z0d]
// ymm1  <- [z1a z1b z1c z1d]
// ymm2  <- [z2a z2b z2c z2d]
// ymm3  <- [z3a z3b z3c z3d]
// ymm4  <- [z4a z4b z4c z4d]
// ymm5  <- [z5a z5b z5c z5d]
// ymm6  <- [z6a z6b z6c z6d]
// ymm7  <- [z7a z7b z7c z7d]
358// ymm8  <- dirty
359// ymm9  <- dirty
360// ymm10 <- dirty
361// ymm11 <- dirty
362// ymm12 <- dirty
363// ymm13 <- dirty
364// ymm14 <- dirty
365// ymm15 <- dirty
366
367//
368// output arguments:
369// r10d  <- 0
// r11   <- A+(k/bs)*bs*sda*sizeof(float)
// r12   <- bs*sda*sizeof(float) = 32*sda
// r13   <- x+k*sizeof(float)
// ymm0  <- [z0a z0b z0c z0d]
// ymm1  <- [z1a z1b z1c z1d]
// ymm2  <- [z2a z2b z2c z2d]
// ymm3  <- [z3a z3b z3c z3d]
// ymm4  <- [z4a z4b z4c z4d]
// ymm5  <- [z5a z5b z5c z5d]
// ymm6  <- [z6a z6b z6c z6d]
// ymm7  <- [z7a z7b z7c z7d]
377// ymm8  <- dirty
378// ymm9  <- dirty
379// ymm10 <- dirty
380// ymm11 <- dirty
381// ymm12 <- dirty
382// ymm13 <- dirty
383// ymm14 <- dirty
384// ymm15 <- dirty
385
386#if MACRO_LEVEL>=2
387	.macro INNER_KERNEL_GEMV_ADD_T_8_LIB8
388#else
389	.p2align 4,,15
390	FUN_START(inner_kernel_gemv_add_t_8_lib8)
391#endif
392
393	cmpl	$0, %r10d
394	jle		2f // return
395
396	cmpl	$8, %r10d
397	jl		0f // clean-up loop
398
399	// main loop
400	.p2align 3
4011: // main loop
402
403	vmovups		0(%r13), %ymm12
404
405	vmovaps		0(%r11), %ymm8
406	vmulps		%ymm8, %ymm12, %ymm15
407	vaddps		%ymm0, %ymm15, %ymm0
408
409	subl	$8, %r10d
410
411	vmovaps		32(%r11), %ymm8
412	vmulps		%ymm8, %ymm12, %ymm15
413	vaddps		%ymm1, %ymm15, %ymm1
414
415	vmovaps		64(%r11), %ymm8
416	vmulps		%ymm8, %ymm12, %ymm15
417	vaddps		%ymm2, %ymm15, %ymm2
418
419	vmovaps		96(%r11), %ymm8
420	vmulps		%ymm8, %ymm12, %ymm15
421	vaddps		%ymm3, %ymm15, %ymm3
422
423	vmovaps		128(%r11), %ymm8
424	vmulps		%ymm8, %ymm12, %ymm15
425	vaddps		%ymm4, %ymm15, %ymm4
426
427	vmovaps		160(%r11), %ymm8
428	vmulps		%ymm8, %ymm12, %ymm15
429	vaddps		%ymm5, %ymm15, %ymm5
430
431	vmovaps		192(%r11), %ymm8
432	vmulps		%ymm8, %ymm12, %ymm15
433	vaddps		%ymm6, %ymm15, %ymm6
434
435	vmovaps		224(%r11), %ymm8
436	vmulps		%ymm8, %ymm12, %ymm15
437	vaddps		%ymm7, %ymm15, %ymm7
438
439	addq	%r12, %r11
440	addq	$32, %r13
441
442	cmpl	$7, %r10d
443
444	jg		1b // main loop
445
446
447	// consider clean-up
448	cmpl	$0, %r10d
449	jle		2f // return
450
4510: // clean-up
452
453	vcvtsi2ss	%r10d, %xmm14, %xmm14
454#if defined(OS_LINUX) | defined(OS_WINDOWS)
455	vmovups		.LC00(%rip), %ymm13
456#elif defined(OS_MAC)
457	vmovups		LC00(%rip), %ymm13
458#endif
459	vshufps		$0x00, %xmm14, %xmm14, %xmm14
460	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
461	vsubps		%ymm14, %ymm13, %ymm14
462
463	vmaskmovps	0(%r13), %ymm14, %ymm12
464
465	vmaskmovps	0(%r11), %ymm14, %ymm8
466	vmulps		%ymm8, %ymm12, %ymm15
467	vaddps		%ymm0, %ymm15, %ymm0
468
469	vmaskmovps	32(%r11), %ymm14, %ymm8
470	vmulps		%ymm8, %ymm12, %ymm15
471	vaddps		%ymm1, %ymm15, %ymm1
472
473	vmaskmovps	64(%r11), %ymm14, %ymm8
474	vmulps		%ymm8, %ymm12, %ymm15
475	vaddps		%ymm2, %ymm15, %ymm2
476
477	vmaskmovps	96(%r11), %ymm14, %ymm8
478	vmulps		%ymm8, %ymm12, %ymm15
479	vaddps		%ymm3, %ymm15, %ymm3
480
481	vmaskmovps	128(%r11), %ymm14, %ymm8
482	vmulps		%ymm8, %ymm12, %ymm15
483	vaddps		%ymm4, %ymm15, %ymm4
484
485	vmaskmovps	160(%r11), %ymm14, %ymm8
486	vmulps		%ymm8, %ymm12, %ymm15
487	vaddps		%ymm5, %ymm15, %ymm5
488
489	vmaskmovps	192(%r11), %ymm14, %ymm8
490	vmulps		%ymm8, %ymm12, %ymm15
491	vaddps		%ymm6, %ymm15, %ymm6
492
493	vmaskmovps	224(%r11), %ymm14, %ymm8
494	vmulps		%ymm8, %ymm12, %ymm15
495	vaddps		%ymm7, %ymm15, %ymm7
496
497	sall	$2, %r10d
498	addq	%r10, %r11
499	addq	%r10, %r13
500	xorl	%r10d, %r10d
501
502
5032: // return
504
505#if MACRO_LEVEL>=2
506	.endm
507#else
508	ret
509
510	FUN_END(inner_kernel_gemv_add_t_8_lib8)
511#endif
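
// A scalar C sketch (not assembled) of the routine above.  Assumptions: the
// hypothetical helper name, and the panel-major bs = 8 layout where r12 holds
// the byte distance 8*sda*sizeof(float) between consecutive 8-row panels, so
// element (i, j) sits at A[(i/8)*8*sda + j*8 + i%8].
#if 0
static void inner_gemv_add_t_8_ref(int k, const float *A, int sda, const float *x, float z[8])
	{
	for (int i = 0; i < k; i++)
		for (int j = 0; j < 8; j++)  // ymm0..ymm7 hold the 8 partial dot products
			z[j] += A[(i/8)*8*sda + j*8 + i%8] * x[i];
	}
#endif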
512
513
514
515
516
517// common inner routine with file scope
518//
519// input arguments:
520// r10d  <- k
521// r11   <- A
// r12   <- bs*sda*sizeof(float) = 32*sda
// r13   <- x
// r14d  <- offA
// ymm0  <- [z0a z0b z0c z0d]
// ymm1  <- [z1a z1b z1c z1d]
// ymm2  <- [z2a z2b z2c z2d]
// ymm3  <- [z3a z3b z3c z3d]
// ymm4  <- [z4a z4b z4c z4d]
// ymm5  <- [z5a z5b z5c z5d]
// ymm6  <- [z6a z6b z6c z6d]
// ymm7  <- [z7a z7b z7c z7d]
529// ymm8  <- dirty
530// ymm9  <- dirty
531// ymm10 <- dirty
532// ymm11 <- dirty
533// ymm12 <- dirty
534// ymm13 <- dirty
535// ymm14 <- dirty
536// ymm15 <- dirty
537
538//
539// output arguments:
540// r10d  <-
541// r11   <-
542// r12   <-
543// r13   <-
544// r14d  <- offA
// ymm0  <- [z0a z0b z0c z0d]
// ymm1  <- [z1a z1b z1c z1d]
// ymm2  <- [z2a z2b z2c z2d]
// ymm3  <- [z3a z3b z3c z3d]
// ymm4  <- [z4a z4b z4c z4d]
// ymm5  <- [z5a z5b z5c z5d]
// ymm6  <- [z6a z6b z6c z6d]
// ymm7  <- [z7a z7b z7c z7d]
549// ymm8  <- dirty
550// ymm9  <- dirty
551// ymm10 <- dirty
552// ymm11 <- dirty
553// ymm12 <- dirty
554// ymm13 <- dirty
555// ymm14 <- dirty
556// ymm15 <- dirty
557
558#if MACRO_LEVEL>=2
559	.macro INNER_EDGE_GEMV_ADD_T_8_LIB8
560#else
561	.p2align 4,,15
562	FUN_START(inner_edge_gemv_add_t_8_lib8)
563#endif
564
565	cmpl	$0, %r14d
566	jle		0f // return
567
568	movl	%r14d, %r15d
569	sall	$2, %r15d // offA*sizeof(float)
570
571	subq	%r15, %r11 // A - offA
572	subq	%r15, %r13 // x - offA
573
574	movl	%r10d, %r15d // kmax
575	addl	%r14d, %r15d // kmax + offA
576
577	vcvtsi2ss	%r14d, %xmm14, %xmm14 // offA
578	vcvtsi2ss	%r15d, %xmm15, %xmm15 // offA + kmax
579#if defined(OS_LINUX) | defined(OS_WINDOWS)
580	vmovups		.LC00(%rip), %ymm13
581#elif defined(OS_MAC)
582	vmovups		LC00(%rip), %ymm13
583#endif
584	vshufps		$0x00, %xmm14, %xmm14, %xmm14
585	vshufps		$0x00, %xmm15, %xmm15, %xmm15
586	vinsertf128	$1, %xmm14, %ymm14, %ymm14
587	vinsertf128	$1, %xmm15, %ymm15, %ymm15
588	vsubps		%ymm13, %ymm14, %ymm14
589	vsubps		%ymm15, %ymm13, %ymm15
590	vandps		%ymm15, %ymm14, %ymm14
591
592	vmaskmovps	0(%r13), %ymm14, %ymm12
593
594	vmovaps		0(%r11), %ymm8
595	vmulps		%ymm8, %ymm12, %ymm15
596	vaddps		%ymm0, %ymm15, %ymm0
597
598	vmovaps		32(%r11), %ymm8
599	vmulps		%ymm8, %ymm12, %ymm15
600	vaddps		%ymm1, %ymm15, %ymm1
601
602	vmovaps		64(%r11), %ymm8
603	vmulps		%ymm8, %ymm12, %ymm15
604	vaddps		%ymm2, %ymm15, %ymm2
605
606	vmovaps		96(%r11), %ymm8
607	vmulps		%ymm8, %ymm12, %ymm15
608	vaddps		%ymm3, %ymm15, %ymm3
609
610	vmovaps		128(%r11), %ymm8
611	vmulps		%ymm8, %ymm12, %ymm15
612	vaddps		%ymm4, %ymm15, %ymm4
613
614	vmovaps		160(%r11), %ymm8
615	vmulps		%ymm8, %ymm12, %ymm15
616	vaddps		%ymm5, %ymm15, %ymm5
617
618	vmovaps		192(%r11), %ymm8
619	vmulps		%ymm8, %ymm12, %ymm15
620	vaddps		%ymm6, %ymm15, %ymm6
621
622	vmovaps		224(%r11), %ymm8
623	vmulps		%ymm8, %ymm12, %ymm15
624	vaddps		%ymm7, %ymm15, %ymm7
625
626	addq	$32, %r13 // x + 4
627	addq	%r12, %r11 // A + bs*sda
628
629	addl	%r14d, %r10d
630	subl	$8, %r10d // kmax - (8-offA)
631
6320: // return
633
634#if MACRO_LEVEL>=2
635	.endm
636#else
637	ret
638
639	FUN_END(inner_edge_gemv_add_t_8_lib8)
640#endif
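
// Sketch (not assembled) of the lane mask built above from the 0.5-offset
// table LC00 = { 0.5, 1.5, ..., 7.5 }: the asm forms offA - (l + 0.5) and
// (l + 0.5) - (offA + kmax), ANDs them, and lets vmaskmovps read the sign
// bits, so lane l is active exactly when offA <= l < offA + kmax.  The helper
// name is hypothetical.
#if 0
static void edge_mask_t_8(int offA, int kmax, int active[8])
	{
	for (int l = 0; l < 8; l++)
		active[l] = (l >= offA) && (l < offA + kmax);
	}
#endif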
641
642
643
644
645
646// common inner routine with file scope
647//
648// triangular substitution with vector RHS
649//
650// input arguments:
651// r10  <- E
652// r11  <- inv_diag_E
653// ymm0 <- [z0 z1 z2 z3]
654// ymm12 <- dirty
655// ymm13 <- dirty
656//
657// output arguments:
658// r10  <- E
659// r11  <- inv_diag_E
660// ymm0 <- [z0 z1 z2 z3]
661// ymm12 <- dirty
662// ymm13 <- dirty
663
664#if MACRO_LEVEL>=1
665	.macro INNER_EDGE_TRSV_LN_INV_8_LIB8
666#else
667	.p2align 4,,15
668	FUN_START(inner_edge_trsv_ln_inv_8_lib8)
669#endif
670
671	vxorps			%ymm14, %ymm14, %ymm14
672
673	vbroadcastss	0(%r11), %ymm12
674	vmulps			%ymm0, %ymm12, %ymm1
675	vblendps		$0x01, %ymm1, %ymm0, %ymm0
676
677	vmovaps			0(%r10), %ymm13
678	vblendps		$0x01, %ymm14, %ymm13, %ymm13
679	vpermilps		$0x00, %ymm0, %ymm12
680	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
681	vmulps			%ymm13, %ymm12, %ymm15
682	vsubps			%ymm15, %ymm0, %ymm0
683	vbroadcastss	4(%r11), %ymm12
684	vmulps			%ymm0, %ymm12, %ymm1
685	vblendps		$0x02, %ymm1, %ymm0, %ymm0
686
687	vmovaps			32(%r10), %ymm13
688	vblendps		$0x03, %ymm14, %ymm13, %ymm13
689	vpermilps		$0x55, %ymm0, %ymm12
690	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
691	vmulps			%ymm13, %ymm12, %ymm15
692	vsubps			%ymm15, %ymm0, %ymm0
693	vbroadcastss	8(%r11), %ymm12
694	vmulps			%ymm0, %ymm12, %ymm1
695	vblendps		$0x04, %ymm1, %ymm0, %ymm0
696
697	vmovaps			64(%r10), %ymm13
698	vblendps		$0x07, %ymm14, %ymm13, %ymm13
699	vpermilps		$0xaa, %ymm0, %ymm12
700	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
701	vmulps			%ymm13, %ymm12, %ymm15
702	vsubps			%ymm15, %ymm0, %ymm0
703	vbroadcastss	12(%r11), %ymm12
704	vmulps			%ymm0, %ymm12, %ymm1
705	vblendps		$0x08, %ymm1, %ymm0, %ymm0
706
707	vmovaps			96(%r10), %ymm13
708	vblendps		$0x0f, %ymm14, %ymm13, %ymm13
709	vpermilps		$0xff, %ymm0, %ymm12
710	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
711	vmulps			%ymm13, %ymm12, %ymm15
712	vsubps			%ymm15, %ymm0, %ymm0
713	vbroadcastss	16(%r11), %ymm12
714	vmulps			%ymm0, %ymm12, %ymm1
715	vblendps		$0x10, %ymm1, %ymm0, %ymm0
716
717	vmovaps			128(%r10), %ymm13
718	vblendps		$0x1f, %ymm14, %ymm13, %ymm13
719	vpermilps		$0x00, %ymm0, %ymm12
720	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
721	vmulps			%ymm13, %ymm12, %ymm15
722	vsubps			%ymm15, %ymm0, %ymm0
723	vbroadcastss	20(%r11), %ymm12
724	vmulps			%ymm0, %ymm12, %ymm1
725	vblendps		$0x20, %ymm1, %ymm0, %ymm0
726
727	vmovaps			160(%r10), %ymm13
728	vblendps		$0x3f, %ymm14, %ymm13, %ymm13
729	vpermilps		$0x55, %ymm0, %ymm12
730	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
731	vmulps			%ymm13, %ymm12, %ymm15
732	vsubps			%ymm15, %ymm0, %ymm0
733	vbroadcastss	24(%r11), %ymm12
734	vmulps			%ymm0, %ymm12, %ymm1
735	vblendps		$0x40, %ymm1, %ymm0, %ymm0
736
737	vmovaps			192(%r10), %ymm13
738	vblendps		$0x7f, %ymm14, %ymm13, %ymm13
739	vpermilps		$0xaa, %ymm0, %ymm12
740	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
741	vmulps			%ymm13, %ymm12, %ymm15
742	vsubps			%ymm15, %ymm0, %ymm0
743	vbroadcastss	28(%r11), %ymm12
744	vmulps			%ymm0, %ymm12, %ymm1
745	vblendps		$0x80, %ymm1, %ymm0, %ymm0
746
747#if MACRO_LEVEL>=1
748	.endm
749#else
750	ret
751
752	FUN_END(inner_edge_trsv_ln_inv_8_lib8)
753#endif
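
// Scalar C sketch (not assembled) of the forward substitution above: solve
// L*z = z in place for the 8x8 lower-triangular block E, using the
// precomputed reciprocals inv_diag_E instead of divisions.  Assumptions: the
// helper name and the panel-major layout E[j*8 + i].
#if 0
static void trsv_ln_inv_8_ref(const float *E, const float *inv_diag_E, float z[8])
	{
	for (int j = 0; j < 8; j++)
		{
		z[j] *= inv_diag_E[j];           // multiply by 1/E(j,j)
		for (int i = j+1; i < 8; i++)    // eliminate column j below the diagonal
			z[i] -= E[j*8 + i] * z[j];
		}
	}
#endif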
754
755
756
757
758
759// common inner routine with file scope
760//
761// triangular substitution with vector RHS
762//
763// input arguments:
764// r10  <- E
765// r11  <- inv_diag_E
766// r12d <- kn
767// ymm0 <- [z0 z1 z2 z3]
768// ymm12 <- dirty
769// ymm13 <- dirty
770//
771// output arguments:
772// r10  <- E
773// r11  <- inv_diag_E
774// r12d <- kn
775// ymm0 <- [z0 z1 z2 z3]
776// ymm12 <- dirty
777// ymm13 <- dirty
778
779#if MACRO_LEVEL>=1
780	.macro INNER_EDGE_TRSV_LN_INV_8_VS_LIB8
781#else
782	.p2align 4,,15
783	FUN_START(inner_edge_trsv_ln_inv_8_vs_lib8)
784#endif
785
786	vxorps			%ymm14, %ymm14, %ymm14
787
788	vbroadcastss	0(%r11), %ymm12
789	vmulps			%ymm0, %ymm12, %ymm1
790	vblendps		$0x01, %ymm1, %ymm0, %ymm0
791	vmovaps			0(%r10), %ymm13
792	vblendps		$0x01, %ymm14, %ymm13, %ymm13
793	vpermilps		$0x00, %ymm0, %ymm12
794	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
795	vmulps			%ymm13, %ymm12, %ymm15
796	vsubps			%ymm15, %ymm0, %ymm0
797
798	cmpl			$2, %r12d
799	jl				0f // ret
800
801	vbroadcastss	4(%r11), %ymm12
802	vmulps			%ymm0, %ymm12, %ymm1
803	vblendps		$0x02, %ymm1, %ymm0, %ymm0
804	vmovaps			32(%r10), %ymm13
805	vblendps		$0x03, %ymm14, %ymm13, %ymm13
806	vpermilps		$0x55, %ymm0, %ymm12
807	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
808	vmulps			%ymm13, %ymm12, %ymm15
809	vsubps			%ymm15, %ymm0, %ymm0
810
811	cmpl			$3, %r12d
812	jl				0f // ret
813
814	vbroadcastss	8(%r11), %ymm12
815	vmulps			%ymm0, %ymm12, %ymm1
816	vblendps		$0x04, %ymm1, %ymm0, %ymm0
817	vmovaps			64(%r10), %ymm13
818	vblendps		$0x07, %ymm14, %ymm13, %ymm13
819	vpermilps		$0xaa, %ymm0, %ymm12
820	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
821	vmulps			%ymm13, %ymm12, %ymm15
822	vsubps			%ymm15, %ymm0, %ymm0
823
824	cmpl			$4, %r12d
825	jl				0f // ret
826
827	vbroadcastss	12(%r11), %ymm12
828	vmulps			%ymm0, %ymm12, %ymm1
829	vblendps		$0x08, %ymm1, %ymm0, %ymm0
830	vmovaps			96(%r10), %ymm13
831	vblendps		$0x0f, %ymm14, %ymm13, %ymm13
832	vpermilps		$0xff, %ymm0, %ymm12
833	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
834	vmulps			%ymm13, %ymm12, %ymm15
835	vsubps			%ymm15, %ymm0, %ymm0
836
837	cmpl			$5, %r12d
838	jl				0f // ret
839
840	vbroadcastss	16(%r11), %ymm12
841	vmulps			%ymm0, %ymm12, %ymm1
842	vblendps		$0x10, %ymm1, %ymm0, %ymm0
843	vmovaps			128(%r10), %ymm13
844	vblendps		$0x1f, %ymm14, %ymm13, %ymm13
845	vpermilps		$0x00, %ymm0, %ymm12
846	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
847	vmulps			%ymm13, %ymm12, %ymm15
848	vsubps			%ymm15, %ymm0, %ymm0
849
850	cmpl			$6, %r12d
851	jl				0f // ret
852
853	vbroadcastss	20(%r11), %ymm12
854	vmulps			%ymm0, %ymm12, %ymm1
855	vblendps		$0x20, %ymm1, %ymm0, %ymm0
856	vmovaps			160(%r10), %ymm13
857	vblendps		$0x3f, %ymm14, %ymm13, %ymm13
858	vpermilps		$0x55, %ymm0, %ymm12
859	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
860	vmulps			%ymm13, %ymm12, %ymm15
861	vsubps			%ymm15, %ymm0, %ymm0
862
863	cmpl			$7, %r12d
864	jl				0f // ret
865
866	vbroadcastss	24(%r11), %ymm12
867	vmulps			%ymm0, %ymm12, %ymm1
868	vblendps		$0x40, %ymm1, %ymm0, %ymm0
869	vmovaps			192(%r10), %ymm13
870	vblendps		$0x7f, %ymm14, %ymm13, %ymm13
871	vpermilps		$0xaa, %ymm0, %ymm12
872	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
873	vmulps			%ymm13, %ymm12, %ymm15
874	vsubps			%ymm15, %ymm0, %ymm0
875
876	cmpl			$8, %r12d
877	jl				0f // ret
878
879	vbroadcastss	28(%r11), %ymm12
880	vmulps			%ymm0, %ymm12, %ymm1
881	vblendps		$0x80, %ymm1, %ymm0, %ymm0
882
8830:
884
885#if MACRO_LEVEL>=1
886	.endm
887#else
888	ret
889
890	FUN_END(inner_edge_trsv_ln_inv_8_vs_lib8)
891#endif
892
893
894
895
896
897// common inner routine with file scope
898//
899// triangular substitution with vector RHS
900//
901// input arguments:
902// r10  <- E
903// r11  <- inv_diag_E
904// ymm0 <- [z0 z1 z2 z3]
905// ymm12 <- dirty
906// ymm13 <- dirty
907//
908// output arguments:
909// r10  <- E
910// r11  <- inv_diag_E
911// ymm0 <- [z0 z1 z2 z3]
912// ymm12 <- dirty
913// ymm13 <- dirty
914
915#if MACRO_LEVEL>=1
916	.macro INNER_EDGE_TRSV_LT_INV_8_LIB8
917#else
918	.p2align 4,,15
919	FUN_START(inner_edge_trsv_lt_inv_8_lib8)
920#endif
921
922	vxorps			%ymm14, %ymm14, %ymm14
923
924	vmovaps			0(%r10), %ymm12
925	vblendps		$0x01, %ymm14, %ymm12, %ymm12
926	vmovaps			32(%r10), %ymm13
927	vblendps		$0x03, %ymm14, %ymm13, %ymm13
928	vunpcklps		%ymm13, %ymm12, %ymm8
929	vunpckhps		%ymm13, %ymm12, %ymm9
930
931	vmovaps			64(%r10), %ymm12
932	vblendps		$0x07, %ymm14, %ymm12, %ymm12
933	vmovaps			96(%r10), %ymm13
934	vblendps		$0x0f, %ymm14, %ymm13, %ymm13
935	vunpcklps		%ymm13, %ymm12, %ymm10
936	vunpckhps		%ymm13, %ymm12, %ymm11
937
938	vshufps			$0x44, %ymm10, %ymm8, %ymm7
939	vshufps			$0xee, %ymm10, %ymm8, %ymm4
940	vshufps			$0x44, %ymm11, %ymm9, %ymm5
941	vshufps			$0xee, %ymm11, %ymm9, %ymm6
942	vextractf128	$0x1, %ymm7, %xmm7
943	vextractf128	$0x1, %ymm4, %xmm8
944	vextractf128	$0x1, %ymm5, %xmm9
945	vextractf128	$0x1, %ymm6, %xmm10
946
947	vmovaps			144(%r10), %xmm12
948	vblendps		$0x01, %xmm14, %xmm12, %xmm12
949	vmovaps			176(%r10), %xmm13
950	vblendps		$0x03, %xmm14, %xmm13, %xmm13
951	vunpcklps		%xmm13, %xmm12, %xmm1
952	vunpckhps		%xmm13, %xmm12, %xmm2
953
954	vmovaps			208(%r10), %xmm12
955	vblendps		$0x07, %xmm14, %xmm12, %xmm12
956	vmovaps			240(%r10), %xmm13
957	vblendps		$0x0f, %xmm14, %xmm13, %xmm13
958	vunpcklps		%xmm13, %xmm12, %xmm3
959	vunpckhps		%xmm13, %xmm12, %xmm15
960
961	vshufps			$0xee, %xmm3, %xmm1, %xmm11
962	vshufps			$0x44, %xmm15, %xmm2, %xmm12
963	vshufps			$0xee, %xmm15, %xmm2, %xmm13
964
965
966	vxorps			%ymm14, %ymm14, %ymm14
967
968	vextractf128	$0x1, %ymm0, %xmm1
969
970	vshufps			$0xff, %xmm1, %xmm1, %xmm2
971	vbroadcastss	28(%r11), %xmm15
972	vmulps			%xmm2, %xmm15, %xmm2
973	vblendps		$0x08, %xmm2, %xmm1, %xmm1
974	vmulps			%xmm10, %xmm2, %xmm15
975	vsubps			%xmm15, %xmm0, %xmm0
976	vmulps			%xmm13, %xmm2, %xmm15
977	vsubps			%xmm15, %xmm1, %xmm1
978
979	vshufps			$0xaa, %xmm1, %xmm1, %xmm2
980	vbroadcastss	24(%r11), %xmm15
981	vmulps			%xmm2, %xmm15, %xmm2
982	vblendps		$0x04, %xmm2, %xmm1, %xmm1
983	vmulps			%xmm9, %xmm2, %xmm15
984	vsubps			%xmm15, %xmm0, %xmm0
985	vmulps			%xmm12, %xmm2, %xmm15
986	vsubps			%xmm15, %xmm1, %xmm1
987
988	vshufps			$0x55, %xmm1, %xmm1, %xmm2
989	vbroadcastss	20(%r11), %xmm15
990	vmulps			%xmm2, %xmm15, %xmm2
991	vblendps		$0x02, %xmm2, %xmm1, %xmm1
992	vmulps			%xmm8, %xmm2, %xmm15
993	vsubps			%xmm15, %xmm0, %xmm0
994	vmulps			%xmm11, %xmm2, %xmm15
995	vsubps			%xmm15, %xmm1, %xmm1
996
997	vshufps			$0x00, %xmm1, %xmm1, %xmm2
998	vbroadcastss	16(%r11), %xmm15
999	vmulps			%xmm2, %xmm15, %xmm2
1000	vblendps		$0x01, %xmm2, %xmm1, %xmm1
1001	vmulps			%xmm7, %xmm2, %xmm15
1002	vsubps			%xmm15, %xmm0, %xmm0
1003
1004	vshufps			$0xff, %xmm0, %xmm0, %xmm2
1005	vbroadcastss	12(%r11), %xmm15
1006	vmulps			%xmm2, %xmm15, %xmm2
1007	vblendps		$0x08, %xmm2, %xmm0, %xmm0
1008	vmulps			%xmm6, %xmm2, %xmm15
1009	vsubps			%xmm15, %xmm0, %xmm0
1010
1011	vshufps			$0xaa, %xmm0, %xmm0, %xmm2
1012	vbroadcastss	8(%r11), %xmm15
1013	vmulps			%xmm2, %xmm15, %xmm2
1014	vblendps		$0x04, %xmm2, %xmm0, %xmm0
1015	vmulps			%xmm5, %xmm2, %xmm15
1016	vsubps			%xmm15, %xmm0, %xmm0
1017
1018	vshufps			$0x55, %xmm0, %xmm0, %xmm2
1019	vbroadcastss	4(%r11), %xmm15
1020	vmulps			%xmm2, %xmm15, %xmm2
1021	vblendps		$0x02, %xmm2, %xmm0, %xmm0
1022	vmulps			%xmm4, %xmm2, %xmm15
1023	vsubps			%xmm15, %xmm0, %xmm0
1024
1025	vshufps			$0x00, %xmm0, %xmm0, %xmm2
1026	vbroadcastss	0(%r11), %xmm15
1027	vmulps			%xmm2, %xmm15, %xmm2
1028	vblendps		$0x01, %xmm2, %xmm0, %xmm0
1029
1030	vinsertf128		$0x1, %xmm1, %ymm0, %ymm0
1031
1032#if MACRO_LEVEL>=1
1033	.endm
1034#else
1035	ret
1036
1037	FUN_END(inner_edge_trsv_lt_inv_8_lib8)
1038#endif
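
// Scalar C sketch (not assembled) of the backward substitution above: solve
// L^T*z = z in place, where L is the 8x8 lower-triangular block E stored
// panel-major (E[j*8 + i]) and inv_diag_E holds the reciprocal diagonal.
// The helper name is hypothetical.
#if 0
static void trsv_lt_inv_8_ref(const float *E, const float *inv_diag_E, float z[8])
	{
	for (int j = 7; j >= 0; j--)
		{
		z[j] *= inv_diag_E[j];
		for (int i = 0; i < j; i++)      // E(j,i) is entry (i,j) of L^T
			z[i] -= E[i*8 + j] * z[j];
		}
	}
#endif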
1039
1040
1041
1042
1043
1044// common inner routine with file scope
1045//
1046// triangular substitution with vector RHS
1047//
1048// input arguments:
1049// r10  <- E
1050// r11  <- inv_diag_E
1051// r12  <- km
1052// r13  <- kn
1053// r14  <- x
1054// ymm0 <- [z0 z1 z2 z3]
1055// ymm12 <- dirty
1056// ymm13 <- dirty
1057//
1058// output arguments:
1059// r10  <- E
1060// r11  <- inv_diag_E
1061// r12  <- km
1062// r13  <- kn
1063// r14  <- x
1064// ymm0 <- [z0 z1 z2 z3]
1065// ymm12 <- dirty
1066// ymm13 <- dirty
1067
1068#if MACRO_LEVEL>=1
1069	.macro INNER_EDGE_TRSV_LT_INV_8_VS_LIB8
1070#else
1071	.p2align 4,,15
1072	FUN_START(inner_edge_trsv_lt_inv_8_vs_lib8)
1073#endif
1074
1075	vcvtsi2ss	%r13d, %xmm14, %xmm14
1076#if defined(OS_LINUX) | defined(OS_WINDOWS)
1077	vmovups		.LC00(%rip), %ymm13
1078#elif defined(OS_MAC)
1079	vmovups		LC00(%rip), %ymm13
1080#endif
1081	vshufps		$0x00, %xmm14, %xmm14, %xmm14
1082	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
1083	vsubps		%ymm14, %ymm13, %ymm14
1084
1085	vmovups		0(%r14), %ymm15
1086	vblendvps	%ymm14, %ymm0, %ymm15, %ymm0
1087
1088
1089
1090	vxorps			%ymm14, %ymm14, %ymm14
1091
1092	vmovaps			0(%r10), %ymm12
1093	vblendps		$0x01, %ymm14, %ymm12, %ymm12
1094	cmpl	$2, %r13d
1095	jl		1f
1096	vmovaps			32(%r10), %ymm13
1097	vblendps		$0x03, %ymm14, %ymm13, %ymm13
1098	vunpcklps		%ymm13, %ymm12, %ymm8
1099	vunpckhps		%ymm13, %ymm12, %ymm9
1100
1101	cmpl	$3, %r13d
1102	jl		2f
1103	vmovaps			64(%r10), %ymm12
1104	vblendps		$0x07, %ymm14, %ymm12, %ymm12
1105	cmpl	$4, %r13d
1106	jl		3f
1107	vmovaps			96(%r10), %ymm13
1108	vblendps		$0x0f, %ymm14, %ymm13, %ymm13
1109	vunpcklps		%ymm13, %ymm12, %ymm10
1110	vunpckhps		%ymm13, %ymm12, %ymm11
1111
1112	vshufps			$0x44, %ymm10, %ymm8, %ymm7
1113	vshufps			$0xee, %ymm10, %ymm8, %ymm4
1114	vshufps			$0x44, %ymm11, %ymm9, %ymm5
1115	vshufps			$0xee, %ymm11, %ymm9, %ymm6
1116	vextractf128	$0x1, %ymm7, %xmm7
1117	vextractf128	$0x1, %ymm4, %xmm8
1118	vextractf128	$0x1, %ymm5, %xmm9
1119	vextractf128	$0x1, %ymm6, %xmm10
1120
1121	cmpl	$5, %r13d
1122	jl		4f
1123	vmovaps			144(%r10), %xmm12
1124	vblendps		$0x01, %xmm14, %xmm12, %xmm12
1125	cmpl	$6, %r13d
1126	jl		5f
1127	vmovaps			176(%r10), %xmm13
1128	vblendps		$0x03, %xmm14, %xmm13, %xmm13
1129	vunpcklps		%xmm13, %xmm12, %xmm1
1130	vunpckhps		%xmm13, %xmm12, %xmm2
1131
1132	cmpl	$7, %r13d
1133	jl		6f
1134	vmovaps			208(%r10), %xmm12
1135	vblendps		$0x07, %xmm14, %xmm12, %xmm12
1136	cmpl	$8, %r13d
1137	jl		7f
1138	vmovaps			240(%r10), %xmm13
1139	vblendps		$0x0f, %xmm14, %xmm13, %xmm13
1140	vunpcklps		%xmm13, %xmm12, %xmm3
1141	vunpckhps		%xmm13, %xmm12, %xmm15
1142
1143	vshufps			$0xee, %xmm3, %xmm1, %xmm11
1144	vshufps			$0x44, %xmm15, %xmm2, %xmm12
1145	vshufps			$0xee, %xmm15, %xmm2, %xmm13
1146
1147	jmp		0f
1148
1149
1150
1151	vmovaps			%ymm14, %ymm12
11521:
1153	vmovaps			%ymm14, %ymm13
1154	vunpcklps		%ymm13, %ymm12, %ymm8
1155	vunpckhps		%ymm13, %ymm12, %ymm9
1156
11572:
1158	vmovaps			%ymm14, %ymm12
11593:
1160	vmovaps			%ymm14, %ymm13
1161	vunpcklps		%ymm13, %ymm12, %ymm10
1162	vunpckhps		%ymm13, %ymm12, %ymm11
1163
1164	vshufps			$0x44, %ymm10, %ymm8, %ymm7
1165	vshufps			$0xee, %ymm10, %ymm8, %ymm4
1166	vshufps			$0x44, %ymm11, %ymm9, %ymm5
1167	vshufps			$0xee, %ymm11, %ymm9, %ymm6
1168	vextractf128	$0x1, %ymm7, %xmm7
1169	vextractf128	$0x1, %ymm4, %xmm8
1170	vextractf128	$0x1, %ymm5, %xmm9
1171	vextractf128	$0x1, %ymm6, %xmm10
1172
1173	jmp		8f
1174
11754:
1176	vmovaps			%xmm14, %xmm12
11775:
1178	vmovaps			%xmm14, %xmm13
1179	vunpcklps		%xmm13, %xmm12, %xmm1
1180	vunpckhps		%xmm13, %xmm12, %xmm2
1181
11826:
1183	vmovaps			%xmm14, %xmm12
11847:
1185	vmovaps			%xmm14, %xmm13
1186	vunpcklps		%xmm13, %xmm12, %xmm3
1187	vunpckhps		%xmm13, %xmm12, %xmm15
1188
1189	vshufps			$0xee, %xmm3, %xmm1, %xmm11
1190	vshufps			$0x44, %xmm15, %xmm2, %xmm12
1191	vshufps			$0xee, %xmm15, %xmm2, %xmm13
1192
11938:
1194
1195	vmovaps			%xmm14, %xmm11
1196	vmovaps			%xmm14, %xmm12
1197	vmovaps			%xmm14, %xmm13
1198
11990:
1200	vxorps			%ymm14, %ymm14, %ymm14
1201
1202	vextractf128	$0x1, %ymm0, %xmm1
1203
1204	cmpl	$8, %r12d
1205	jl		0f
1206
1207	vshufps			$0xff, %xmm1, %xmm1, %xmm2
1208	cmpl	$8, %r13d
1209	jl		1f
1210	vbroadcastss	28(%r11), %xmm15
1211	vmulps			%xmm2, %xmm15, %xmm2
1212	vblendps		$0x08, %xmm2, %xmm1, %xmm1
12131:
1214	vmulps			%xmm10, %xmm2, %xmm15
1215	vsubps			%xmm15, %xmm0, %xmm0
1216	vmulps			%xmm13, %xmm2, %xmm15
1217	vsubps			%xmm15, %xmm1, %xmm1
1218
12190:
1220	cmpl	$7, %r12d
1221	jl		0f
1222
1223	vshufps			$0xaa, %xmm1, %xmm1, %xmm2
1224	cmpl	$7, %r13d
1225	jl		1f
1226	vbroadcastss	24(%r11), %xmm15
1227	vmulps			%xmm2, %xmm15, %xmm2
1228	vblendps		$0x04, %xmm2, %xmm1, %xmm1
12291:
1230	vmulps			%xmm9, %xmm2, %xmm15
1231	vsubps			%xmm15, %xmm0, %xmm0
1232	vmulps			%xmm12, %xmm2, %xmm15
1233	vsubps			%xmm15, %xmm1, %xmm1
1234
12350:
1236	cmpl	$6, %r12d
1237	jl		0f
1238
1239	vshufps			$0x55, %xmm1, %xmm1, %xmm2
1240	cmpl	$6, %r13d
1241	jl		1f
1242	vbroadcastss	20(%r11), %xmm15
1243	vmulps			%xmm2, %xmm15, %xmm2
1244	vblendps		$0x02, %xmm2, %xmm1, %xmm1
12451:
1246	vmulps			%xmm8, %xmm2, %xmm15
1247	vsubps			%xmm15, %xmm0, %xmm0
1248	vmulps			%xmm11, %xmm2, %xmm15
1249	vsubps			%xmm15, %xmm1, %xmm1
1250
12510:
1252	cmpl	$5, %r12d
1253	jl		0f
1254
1255	vshufps			$0x00, %xmm1, %xmm1, %xmm2
1256	cmpl	$5, %r13d
1257	jl		1f
1258	vbroadcastss	16(%r11), %xmm15
1259	vmulps			%xmm2, %xmm15, %xmm2
1260	vblendps		$0x01, %xmm2, %xmm1, %xmm1
12611:
1262	vmulps			%xmm7, %xmm2, %xmm15
1263	vsubps			%xmm15, %xmm0, %xmm0
1264
12650:
1266	cmpl	$4, %r12d
1267	jl		0f
1268
1269	vshufps			$0xff, %xmm0, %xmm0, %xmm2
1270	cmpl	$4, %r13d
1271	jl		1f
1272	vbroadcastss	12(%r11), %xmm15
1273	vmulps			%xmm2, %xmm15, %xmm2
1274	vblendps		$0x08, %xmm2, %xmm0, %xmm0
12751:
1276	vmulps			%xmm6, %xmm2, %xmm15
1277	vsubps			%xmm15, %xmm0, %xmm0
1278
12790:
1280	cmpl	$3, %r12d
1281	jl		0f
1282
1283	vshufps			$0xaa, %xmm0, %xmm0, %xmm2
1284	cmpl	$3, %r13d
1285	jl		1f
1286	vbroadcastss	8(%r11), %xmm15
1287	vmulps			%xmm2, %xmm15, %xmm2
1288	vblendps		$0x04, %xmm2, %xmm0, %xmm0
12891:
1290	vmulps			%xmm5, %xmm2, %xmm15
1291	vsubps			%xmm15, %xmm0, %xmm0
1292
12930:
1294	cmpl	$2, %r12d
1295	jl		0f
1296
1297	vshufps			$0x55, %xmm0, %xmm0, %xmm2
1298	cmpl	$2, %r13d
1299	jl		1f
1300	vbroadcastss	4(%r11), %xmm15
1301	vmulps			%xmm2, %xmm15, %xmm2
1302	vblendps		$0x02, %xmm2, %xmm0, %xmm0
13031:
1304	vmulps			%xmm4, %xmm2, %xmm15
1305	vsubps			%xmm15, %xmm0, %xmm0
1306
13070:
1308	cmpl	$1, %r12d
1309	jl		0f
1310
1311	vshufps			$0x00, %xmm0, %xmm0, %xmm2
1312	cmpl	$1, %r13d
1313	jl		1f
1314	vbroadcastss	0(%r11), %xmm15
1315	vmulps			%xmm2, %xmm15, %xmm2
1316	vblendps		$0x01, %xmm2, %xmm0, %xmm0
13171:
1318
13190:
1320
1321	vinsertf128		$0x1, %xmm1, %ymm0, %ymm0
1322
1323#if MACRO_LEVEL>=1
1324	.endm
1325#else
1326	ret
1327
1328	FUN_END(inner_edge_trsv_lt_inv_8_vs_lib8)
1329#endif
1330
1331
1332
1333
1334
1335// common inner routine with file scope
1336//
1337// blend for ta==n, scale for generic alpha and beta
1338//
1339// input arguments:
1340// r10  <- alpha
1341// r11  <- beta
1342// r12  <- y
1343// ymm0 <- [z0 z1 z2 z3]_a
1344// ymm1 <- [z0 z1 z2 z3]_b
1345// ymm2 <- [z0 z1 z2 z3]_c
1346// ymm3 <- [z0 z1 z2 z3]_d
1347// ymm8  <- dirty
1348// ymm9  <- dirty
1349// ymm10 <- dirty
1350// ymm11 <- dirty
1351// ymm15 <- dirty
1352//
1353// output arguments:
1354// r10  <- alpha
1355// r11  <- beta
1356// r12  <- y
1357// ymm0 <- [z0 z1 z2 z3]
1358// ymm1 <- dirty
1359// ymm2 <- dirty
1360// ymm3 <- dirty
1361// ymm8  <- dirty
1362// ymm9  <- dirty
1363// ymm10 <- dirty
1364// ymm11 <- dirty
1365// ymm15 <- dirty
1366
1367#if MACRO_LEVEL>=1
1368	.macro INNER_BLEND_N_SCALE_AB_8_LIB8
1369#else
1370	.p2align 4,,15
1371	FUN_START(inner_blend_n_scale_ab_8_lib8)
1372#endif
1373
1374	// reduction
1375	vaddps			%ymm0, %ymm1, %ymm0
1376	vaddps			%ymm2, %ymm3, %ymm2
1377	vaddps			%ymm0, %ymm2, %ymm0
1378
1379	// alpha
1380	vbroadcastss	0(%r10), %ymm15
1381	vmulps			%ymm0, %ymm15, %ymm0
1382
1383	// beta
1384	vbroadcastss	0(%r11), %ymm15
1385	vmovups			0(%r12), %ymm14
1386	vmulps			%ymm15, %ymm14, %ymm14
1387	vaddps			%ymm0, %ymm14, %ymm0
1388
1389#if MACRO_LEVEL>=1
1390	.endm
1391#else
1392	ret
1393
1394	FUN_END(inner_blend_n_scale_ab_8_lib8)
1395#endif
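
// Scalar C sketch (not assembled) of the blend/scale above: fold the four
// partial accumulators produced by the n kernel and apply z = alpha*sum +
// beta*y.  The helper name and the acc[4][8] packing are illustrative only.
#if 0
static void blend_n_scale_ab_8_ref(const float *alpha, const float *beta, const float *y,
		const float acc[4][8], float z[8])
	{
	for (int i = 0; i < 8; i++)
		z[i] = alpha[0]*(acc[0][i] + acc[1][i] + acc[2][i] + acc[3][i]) + beta[0]*y[i];
	}
#endif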
1396
1397
1398
1399
1400
1401// common inner routine with file scope
1402//
1403// blend for ta==n, scale for alpha=-1.0 and beta=1.0
1404//
1405// input arguments:
1406// r10  <- y
1407// ymm0 <- [z0 z1 z2 z3]_a
1408// ymm1 <- [z0 z1 z2 z3]_b
1409// ymm2 <- [z0 z1 z2 z3]_c
1410// ymm3 <- [z0 z1 z2 z3]_d
1411// ymm8  <- dirty
1412// ymm9  <- dirty
1413// ymm10 <- dirty
1414// ymm11 <- dirty
1415// ymm15 <- dirty
1416//
1417// output arguments:
1418// r10  <- y
1419// ymm0 <- [z0 z1 z2 z3]
1420// ymm1 <- dirty
1421// ymm2 <- dirty
1422// ymm3 <- dirty
1423// ymm8  <- dirty
1424// ymm9  <- dirty
1425// ymm10 <- dirty
1426// ymm11 <- dirty
1427// ymm15 <- dirty
1428
1429#if MACRO_LEVEL>=1
1430	.macro INNER_BLEND_N_SCALE_M11_8_LIB8
1431#else
1432	.p2align 4,,15
1433	FUN_START(inner_blend_n_scale_m11_8_lib8)
1434#endif
1435
1436	// reduction
1437	vaddps	%ymm0, %ymm1, %ymm0
1438	vaddps	%ymm2, %ymm3, %ymm2
1439	vaddps	%ymm0, %ymm2, %ymm0
1440
1441	// beta
1442	vmovups		0(%r10), %ymm14
1443	vsubps		%ymm0, %ymm14, %ymm0
1444
1445#if MACRO_LEVEL>=1
1446	.endm
1447#else
1448	ret
1449
1450	FUN_END(inner_blend_n_scale_m11_8_lib8)
1451#endif
1452
1453
1454
1455
1456
1457// common inner routine with file scope
1458//
1459// blend for ta==t, scale for generic alpha and beta
1460//
1461// input arguments:
1462// r10  <- alpha
1463// r11  <- beta
1464// r12  <- y
// ymm0 <- [z0a z0b z0c z0d]
// ymm1 <- [z1a z1b z1c z1d]
// ymm2 <- [z2a z2b z2c z2d]
// ymm3 <- [z3a z3b z3c z3d]
// ymm4 <- [z4a z4b z4c z4d]
// ymm5 <- [z5a z5b z5c z5d]
// ymm6 <- [z6a z6b z6c z6d]
// ymm7 <- [z7a z7b z7c z7d]
1469// ymm8  <- dirty
1470// ymm9  <- dirty
1471// ymm10 <- dirty
1472// ymm11 <- dirty
1473// ymm15 <- dirty
1474//
1475// output arguments:
1476// r10  <- alpha
1477// r11  <- beta
1478// r12  <- y
1479// ymm0 <- [z0 z1 z2 z3]
1480// ymm1 <- dirty
1481// ymm2 <- dirty
1482// ymm3 <- dirty
1483// ymm8  <- dirty
1484// ymm9  <- dirty
1485// ymm10 <- dirty
1486// ymm11 <- dirty
1487// ymm15 <- dirty
1488
1489#if MACRO_LEVEL>=1
1490	.macro INNER_BLEND_T_SCALE_AB_8_LIB8
1491#else
1492	.p2align 4,,15
1493	FUN_START(inner_blend_t_scale_ab_8_lib8)
1494#endif
1495
1496	// reduction
1497	vhaddps			%ymm1, %ymm0, %ymm0
1498	vhaddps			%ymm3, %ymm2, %ymm2
1499	vhaddps			%ymm5, %ymm4, %ymm4
1500	vhaddps			%ymm7, %ymm6, %ymm6
1501
1502	vhaddps			%ymm2, %ymm0, %ymm0
1503	vhaddps			%ymm6, %ymm4, %ymm4
1504
1505	vperm2f128		$0x20, %ymm4, %ymm0, %ymm1
1506	vperm2f128		$0x13, %ymm0, %ymm4, %ymm0
1507
1508	vaddps			%ymm0, %ymm1, %ymm0
1509
1510	// alpha
1511	vbroadcastss	0(%r10), %ymm15
1512	vmulps			%ymm0, %ymm15, %ymm0
1513
1514	// beta
1515	vbroadcastss	0(%r11), %ymm15
1516	vmovups			0(%r12), %ymm14
1517	vmulps			%ymm15, %ymm14, %ymm14
1518	vaddps			%ymm0, %ymm14, %ymm0
1519
1520#if MACRO_LEVEL>=1
1521	.endm
1522#else
1523	ret
1524
1525	FUN_END(inner_blend_t_scale_ab_8_lib8)
1526#endif
1527
1528
1529
1530
1531
1532// common inner routine with file scope
1533//
1534// blend for ta==t, scale for alpha=-1.0 and beta=1.0
1535//
1536// input arguments:
1537// r10  <- y
// ymm0 <- [z0a z0b z0c z0d]
// ymm1 <- [z1a z1b z1c z1d]
// ymm2 <- [z2a z2b z2c z2d]
// ymm3 <- [z3a z3b z3c z3d]
// ymm4 <- [z4a z4b z4c z4d]
// ymm5 <- [z5a z5b z5c z5d]
// ymm6 <- [z6a z6b z6c z6d]
// ymm7 <- [z7a z7b z7c z7d]
1542// ymm8  <- dirty
1543// ymm9  <- dirty
1544// ymm10 <- dirty
1545// ymm11 <- dirty
1546// ymm15 <- dirty
1547//
1548// output arguments:
1549// r10  <- y
1550// ymm0 <- [z0 z1 z2 z3]
1551// ymm1 <- dirty
1552// ymm2 <- dirty
1553// ymm3 <- dirty
1554// ymm8  <- dirty
1555// ymm9  <- dirty
1556// ymm10 <- dirty
1557// ymm11 <- dirty
1558// ymm15 <- dirty
1559
1560#if MACRO_LEVEL>=1
1561	.macro INNER_BLEND_T_SCALE_M11_8_LIB8
1562#else
1563	.p2align 4,,15
1564	FUN_START(inner_blend_t_scale_m11_8_lib8)
1565#endif
1566
1567	// reduction
1568	vhaddps			%ymm1, %ymm0, %ymm0
1569	vhaddps			%ymm3, %ymm2, %ymm2
1570	vhaddps			%ymm5, %ymm4, %ymm4
1571	vhaddps			%ymm7, %ymm6, %ymm6
1572
1573	vhaddps			%ymm2, %ymm0, %ymm0
1574	vhaddps			%ymm6, %ymm4, %ymm4
1575
1576	vperm2f128		$0x20, %ymm4, %ymm0, %ymm1
1577	vperm2f128		$0x13, %ymm0, %ymm4, %ymm0
1578
1579	vaddps			%ymm0, %ymm1, %ymm0
1580
1581	// beta
1582	vmovups			0(%r10), %ymm14
1583	vsubps			%ymm0, %ymm14, %ymm0
1584
1585#if MACRO_LEVEL>=1
1586	.endm
1587#else
1588	ret
1589
1590	FUN_END(inner_blend_t_scale_m11_8_lib8)
1591#endif
1592
1593
1594
1595
1596
1597// common inner routine with file scope
1598//
1599// store
1600//
1601// input arguments:
1602// r10  <- z
1603// ymm0 <- [z0 z1 z2 z3]
1604//
1605// output arguments:
1606// r10  <- z
1607// ymm0 <- [z0 z1 z2 z3]
1608
1609#if MACRO_LEVEL>=1
1610	.macro INNER_STORE_8_LIB8
1611#else
1612	.p2align 4,,15
1613	FUN_START(inner_store_8_lib8)
1614#endif
1615
1616	vmovups %ymm0,  0(%r10)
1617
1618#if MACRO_LEVEL>=1
1619	.endm
1620#else
1621	ret
1622
1623	FUN_END(inner_store_8_lib8)
1624#endif
1625
1626
1627
1628
1629
1630// common inner routine with file scope
1631//
1632// store vs
1633//
1634// input arguments:
1635// r10   <- D
1636// r11d   <- km
1637// ymm0  <- [z0 z1 z2 z3]
1638// ymm14 <- dirty
1639// ymm15 <- dirty
1640//
1641// output arguments:
1642// r10   <- D
1643// r11d   <- km
1644// ymm0  <- [z0 z1 z2 z3]
1645// ymm14 <- dirty
1646// ymm15 <- dirty
1647
1648#if MACRO_LEVEL>=1
1649	.macro INNER_STORE_8_VS_LIB8
1650#else
1651	.p2align 4,,15
1652	FUN_START(inner_store_8_vs_lib8)
1653#endif
1654
1655	vcvtsi2ss	%r11d, %xmm15, %xmm15
1656#if defined(OS_LINUX) | defined(OS_WINDOWS)
1657	vmovups		.LC00(%rip), %ymm14
1658#elif defined(OS_MAC)
1659	vmovups		LC00(%rip), %ymm14
1660#endif
1661	vshufps		$0x00, %xmm15, %xmm15, %xmm15
1662	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
1663	vsubps		%ymm15, %ymm14, %ymm15
1664
1665	vmaskmovps	%ymm0, %ymm15,  0(%r10)
1666
1667#if MACRO_LEVEL>=1
1668	.endm
1669#else
1670	ret
1671
1672	FUN_END(inner_store_8_vs_lib8)
1673#endif
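
// Sketch (not assembled) of the partial store above: LC00 minus the
// broadcast float km goes negative exactly for lanes l < km, and vmaskmovps
// stores the lanes whose sign bit is set.  The helper name is hypothetical.
#if 0
static void store_8_vs_ref(float *z, int km, const float v[8])
	{
	for (int l = 0; l < 8; l++)
		if (l < km)                      // (l + 0.5f) - (float)km < 0.0f
			z[l] = v[l];
	}
#endif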
1674
1675
1676
1677
1678
1679// common inner routine with file scope
1680//
1681// store gen
1682//
1683// input arguments:
1684// r10   <- D
// r11d  <- k0 : start from (inc)
1686// r12d  <- k1 : up to (exc)
1687// ymm0  <- [z0 z1 z2 z3]
1688// ymm14 <- dirty
1689// ymm15 <- dirty
1690//
1691// output arguments:
1692// r10   <- D
// r11d  <- k0 : start from (inc)
1694// r12d  <- k1 : up to (exc)
1695// ymm0  <- [z0 z1 z2 z3]
1696// ymm14 <- dirty
1697// ymm15 <- dirty
1698
1699#if MACRO_LEVEL>=1
1700	.macro INNER_STORE_8_GEN_LIB8
1701#else
1702	.p2align 4,,15
1703	FUN_START(inner_store_8_gen_lib8)
1704#endif
1705
1706	// compute mask for rows
1707	vcvtsi2ss	%r11d, %xmm14, %xmm14
1708	vcvtsi2ss	%r12d, %xmm15, %xmm15
1709#if defined(OS_LINUX) | defined(OS_WINDOWS)
1710	vmovups		.LC00(%rip), %ymm12
1711#elif defined(OS_MAC)
1712	vmovups		LC00(%rip), %ymm12
1713#endif
1714	vshufps		$0x00, %xmm14, %xmm14, %xmm14
1715	vshufps		$0x00, %xmm15, %xmm15, %xmm15
1716	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
1717	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
1718	vsubps		%ymm12, %ymm14, %ymm14
1719	vsubps		%ymm15, %ymm12, %ymm15
1720	vandps		%ymm14, %ymm15, %ymm15
1721
1722	vmaskmovps	%ymm0, %ymm15,  0(%r10)
1723
1724#if MACRO_LEVEL>=1
1725	.endm
1726#else
1727	ret
1728
1729	FUN_END(inner_store_8_gen_lib8)
1730#endif
1731
1732
1733
1734
1735
//                            1      2             3         4         5            6         7
// void kernel_sgemv_n_8_lib8(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z);
1738
1739	.p2align 4,,15
1740	GLOB_FUN_START(kernel_sgemv_n_8_lib8)
1741
1742	PROLOGUE
1743
1744	// zero accumulation registers
1745
1746	vxorps	%ymm0, %ymm0, %ymm0
1747	vmovaps	%ymm0, %ymm1
1748	vmovaps	%ymm0, %ymm2
1749	vmovaps	%ymm0, %ymm3
1750
1751
1752	// call inner sgemv kernel n
1753
1754	movq	ARG1, %r10 // k
1755	movq	ARG3, %r11  // A
1756	movq	ARG4, %r12  // x
1757
1758#if MACRO_LEVEL>=2
1759	INNER_KERNEL_GEMV_ADD_N_8_LIB8
1760#else
1761	CALL(inner_kernel_gemv_add_n_8_lib8)
1762#endif
1763
1764
1765	// call inner blend n scale ab
1766
1767	movq	ARG2, %r10 // alpha
1768	movq	ARG5, %r11   // beta
1769	movq	ARG6, %r12   // y
1770
1771#if MACRO_LEVEL>=1
1772	INNER_BLEND_N_SCALE_AB_8_LIB8
1773#else
1774	CALL(inner_blend_n_scale_ab_8_lib8)
1775#endif
1776
1777
1778	// store
1779
1780	movq	ARG7, %r10 // z
1781
1782#if MACRO_LEVEL>=1
1783	INNER_STORE_8_LIB8
1784#else
1785	CALL(inner_store_8_lib8)
1786#endif
1787
1788
1789	EPILOGUE
1790
1791	ret
1792
1793	FUN_END(kernel_sgemv_n_8_lib8)
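
// Usage sketch (not assembled): pack a column-major 8 x k block into the
// assumed panel-major bs = 8 layout and call the kernel above, computing
// z = alpha*A*x + beta*y.  The declaration mirrors the prototype comment;
// pack_panel_8 and example_sgemv_n are hypothetical helpers, and vmovaps in
// the inner kernel requires the packed A to be 32-byte aligned.
#if 0
void kernel_sgemv_n_8_lib8(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z);

static void pack_panel_8(int k, const float *B, int ldb, float *Ap)
	{
	for (int j = 0; j < k; j++)
		for (int i = 0; i < 8; i++)
			Ap[j*8 + i] = B[i + j*ldb];  // element (i, j) of the 8-row panel
	}

static void example_sgemv_n(int k, const float *B, int ldb, float *x, float *z)
	{
	static float Ap[8*1024] __attribute__((aligned(32)));   // assumes k <= 1024
	float alpha = 1.0f, beta = 0.0f;
	float y[8] = {0.0f};
	pack_panel_8(k, B, ldb, Ap);
	kernel_sgemv_n_8_lib8(k, &alpha, Ap, x, &beta, y, z);
	}
#endif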
1794
1795
1796
1797
1798
//                               1      2             3         4         5            6         7         8
// void kernel_sgemv_n_8_vs_lib8(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k1);
1801
1802	.p2align 4,,15
1803	GLOB_FUN_START(kernel_sgemv_n_8_vs_lib8)
1804
1805	PROLOGUE
1806
1807	// zero accumulation registers
1808
1809	vxorps	%ymm0, %ymm0, %ymm0
1810	vmovaps	%ymm0, %ymm1
1811	vmovaps	%ymm0, %ymm2
1812	vmovaps	%ymm0, %ymm3
1813
1814
1815	// call inner sgemv kernel n
1816
1817	movq	ARG1, %r10 // k
1818	movq	ARG3, %r11  // A
1819	movq	ARG4, %r12  // x
1820
1821#if MACRO_LEVEL>=2
1822	INNER_KERNEL_GEMV_ADD_N_8_LIB8
1823#else
1824	CALL(inner_kernel_gemv_add_n_8_lib8)
1825#endif
1826
1827
1828	// call inner blend n scale ab
1829
1830	movq	ARG2, %r10 // alpha
1831	movq	ARG5, %r11   // beta
1832	movq	ARG6, %r12   // y
1833
1834#if MACRO_LEVEL>=1
1835	INNER_BLEND_N_SCALE_AB_8_LIB8
1836#else
1837	CALL(inner_blend_n_scale_ab_8_lib8)
1838#endif
1839
1840
1841	// store
1842
1843	movq	ARG7, %r10 // z
1844	movq	ARG8, %r11 // k1
1845
1846#if MACRO_LEVEL>=1
1847	INNER_STORE_8_VS_LIB8
1848#else
1849	CALL(inner_store_8_vs_lib8)
1850#endif
1851
1852
1853	EPILOGUE
1854
1855	ret
1856
1857	FUN_END(kernel_sgemv_n_8_vs_lib8)
1858
1859
1860
1861
1862
//                                1      2             3         4         5            6         7         8       9
// void kernel_sgemv_n_8_gen_lib8(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k0, int k1);
1865
1866	.p2align 4,,15
1867	GLOB_FUN_START(kernel_sgemv_n_8_gen_lib8)
1868
1869	PROLOGUE
1870
1871	// zero accumulation registers
1872
1873	vxorps	%ymm0, %ymm0, %ymm0
1874	vmovaps	%ymm0, %ymm1
1875	vmovaps	%ymm0, %ymm2
1876	vmovaps	%ymm0, %ymm3
1877
1878
1879	// call inner sgemv kernel n
1880
1881	movq	ARG1, %r10 // k
1882	movq	ARG3, %r11  // A
1883	movq	ARG4, %r12  // x
1884
1885#if MACRO_LEVEL>=2
1886	INNER_KERNEL_GEMV_ADD_N_8_LIB8
1887#else
1888	CALL(inner_kernel_gemv_add_n_8_lib8)
1889#endif
1890
1891
1892	// call inner blend n scale ab
1893
1894	movq	ARG2, %r10 // alpha
1895	movq	ARG5, %r11   // beta
1896	movq	ARG6, %r12   // y
1897
1898#if MACRO_LEVEL>=1
1899	INNER_BLEND_N_SCALE_AB_8_LIB8
1900#else
1901	CALL(inner_blend_n_scale_ab_8_lib8)
1902#endif
1903
1904
1905	// store
1906
1907	movq	ARG7, %r10 // z
	movq	ARG8, %r11 // k0
	movq	ARG9, %r12 // k1
1910
1911#if MACRO_LEVEL>=1
1912	INNER_STORE_8_GEN_LIB8
1913#else
1914	CALL(inner_store_8_gen_lib8)
1915#endif
1916
1917
1918	EPILOGUE
1919
1920	ret
1921
1922	FUN_END(kernel_sgemv_n_8_gen_lib8)
1923
1924
1925
1926
1927
//                            1      2             3         4         5        6         7            8         9
// void kernel_sgemv_t_8_lib8(int k, float *alpha, int offA, float *A, int sda, float *x, float *beta, float *y, float *z);
1930
1931	.p2align 4,,15
1932	GLOB_FUN_START(kernel_sgemv_t_8_lib8)
1933
1934	PROLOGUE
1935
1936	// zero accumulation registers
1937
1938	vxorps	%ymm0, %ymm0, %ymm0
1939	vmovaps	%ymm0, %ymm1
1940	vmovaps	%ymm0, %ymm2
1941	vmovaps	%ymm0, %ymm3
1942	vmovaps	%ymm0, %ymm4
1943	vmovaps	%ymm0, %ymm5
1944	vmovaps	%ymm0, %ymm6
1945	vmovaps	%ymm0, %ymm7
1946
1947
	// call inner sgemv kernel t
1949
1950	movq	ARG1, %r10 // k
1951	movq	ARG4, %r11  // A
1952	movq	ARG5, %r12 // sda
1953	sall	$5, %r12d // 8*sda*sizeof(float)
1954	movq	ARG6, %r13  // x
1955	movq	ARG3, %r14 // offA
1956
1957#if MACRO_LEVEL>=2
1958	INNER_EDGE_GEMV_ADD_T_8_LIB8
1959#else
1960	CALL(inner_edge_gemv_add_t_8_lib8)
1961#endif
1962
1963#if MACRO_LEVEL>=2
1964	INNER_KERNEL_GEMV_ADD_T_8_LIB8
1965#else
1966	CALL(inner_kernel_gemv_add_t_8_lib8)
1967#endif
1968
1969
1970	// call inner blender t
1971
1972	movq	ARG2, %r10 // alpha
1973	movq	ARG7, %r11   // beta
1974	movq	ARG8, %r12 // y
1975
1976#if MACRO_LEVEL>=1
1977	INNER_BLEND_T_SCALE_AB_8_LIB8
1978#else
1979	CALL(inner_blend_t_scale_ab_8_lib8)
1980#endif
1981
1982
1983	// store
1984
1985	movq	ARG9, %r10 // z
1986
1987#if MACRO_LEVEL>=1
1988	INNER_STORE_8_LIB8
1989#else
1990	CALL(inner_store_8_lib8)
1991#endif
1992
1993
1994	EPILOGUE
1995
1996	ret
1997
1998	FUN_END(kernel_sgemv_t_8_lib8)
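
// Reference semantics of the kernel above as a scalar C sketch (not
// assembled): z[0:8] = alpha*A^T*x + beta*y, where A points offA rows into
// its first 8-row panel and sda is the panel stride.  Assumptions: the
// hypothetical helper name and the panel-major bs = 8 layout.
#if 0
static void kernel_sgemv_t_8_ref(int k, const float *alpha, int offA, const float *A, int sda,
		const float *x, const float *beta, const float *y, float *z)
	{
	const float *Ap = A - offA;              // top of the first panel
	for (int j = 0; j < 8; j++)
		{
		float s = 0.0f;
		for (int i = 0; i < k; i++)
			{
			int r = i + offA;                // row index in the panel stack
			s += Ap[(r/8)*8*sda + j*8 + r%8] * x[i];
			}
		z[j] = alpha[0]*s + beta[0]*y[j];
		}
	}
#endif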
1999
2000
2001
2002
2003
//                               1      2             3         4         5        6         7            8         9          10
// void kernel_sgemv_t_8_vs_lib8(int k, float *alpha, int offA, float *A, int sda, float *x, float *beta, float *y, float *z, int k1);
2006
2007	.p2align 4,,15
2008	GLOB_FUN_START(kernel_sgemv_t_8_vs_lib8)
2009
2010	PROLOGUE
2011
2012	// zero accumulation registers
2013
2014	vxorps	%ymm0, %ymm0, %ymm0
2015	vmovaps	%ymm0, %ymm1
2016	vmovaps	%ymm0, %ymm2
2017	vmovaps	%ymm0, %ymm3
2018	vmovaps	%ymm0, %ymm4
2019	vmovaps	%ymm0, %ymm5
2020	vmovaps	%ymm0, %ymm6
2021	vmovaps	%ymm0, %ymm7
2022
2023
	// call inner sgemv kernel t
2025
2026	movq	ARG1, %r10 // k
2027	movq	ARG4, %r11  // A
2028	movq	ARG5, %r12 // sda
2029	sall	$5, %r12d // 8*sda*sizeof(float)
2030	movq	ARG6, %r13  // x
2031	movq	ARG3, %r14 // offA
2032
2033#if MACRO_LEVEL>=2
2034	INNER_EDGE_GEMV_ADD_T_8_LIB8
2035#else
2036	CALL(inner_edge_gemv_add_t_8_lib8)
2037#endif
2038
2039#if MACRO_LEVEL>=2
2040	INNER_KERNEL_GEMV_ADD_T_8_LIB8
2041#else
2042	CALL(inner_kernel_gemv_add_t_8_lib8)
2043#endif
2044
2045
2046	// call inner blender t
2047
2048	movq	ARG2, %r10 // alpha
2049	movq	ARG7, %r11   // beta
2050	movq	ARG8, %r12 // y
2051
2052#if MACRO_LEVEL>=1
2053	INNER_BLEND_T_SCALE_AB_8_LIB8
2054#else
2055	CALL(inner_blend_t_scale_ab_8_lib8)
2056#endif
2057
2058
2059	// store
2060
2061	movq	ARG9, %r10 // z
2062	movq	ARG10, %r11 // km
2063
2064#if MACRO_LEVEL>=1
2065	INNER_STORE_8_VS_LIB8
2066#else
2067	CALL(inner_store_8_vs_lib8)
2068#endif
2069
2070
2071	EPILOGUE
2072
2073	ret
2074
2075	FUN_END(kernel_sgemv_t_8_vs_lib8)
2076
2077
2078
2079
2080
//                                 1      2         3                  4         5         6
// void kernel_strsv_ln_inv_8_lib8(int k, float *A, float *inv_diag_A, float *x, float *y, float *z);
2083
2084	.p2align 4,,15
2085	GLOB_FUN_START(kernel_strsv_ln_inv_8_lib8)
2086
2087	PROLOGUE
2088
2089	// zero accumulation registers
2090
2091	vxorps	%ymm0, %ymm0, %ymm0
2092	vmovaps	%ymm0, %ymm1
2093	vmovaps	%ymm0, %ymm2
2094	vmovaps	%ymm0, %ymm3
2095	vmovaps	%ymm0, %ymm4
2096	vmovaps	%ymm0, %ymm5
2097	vmovaps	%ymm0, %ymm6
2098	vmovaps	%ymm0, %ymm7
2099
2100
	// call inner sgemv kernel n
2102
2103	movq	ARG1, %r10 // k
2104	movq	ARG2, %r11  // A
2105	movq	ARG4, %r12  // x
2106
2107#if MACRO_LEVEL>=2
2108	INNER_KERNEL_GEMV_ADD_N_8_LIB8
2109#else
2110	CALL(inner_kernel_gemv_add_n_8_lib8)
2111#endif
2112
	movq	%r11, %r13 // A+k*bs*sizeof(float)
2114
2115
2116	// call inner blender n
2117
2118	movq	ARG5, %r10   // y
2119
2120#if MACRO_LEVEL>=1
2121	INNER_BLEND_N_SCALE_M11_8_LIB8
2122#else
2123	CALL(inner_blend_n_scale_m11_8_lib8)
2124#endif
2125
2126
2127	// solution
2128
	movq	%r13, %r10 // A+k*bs*sizeof(float)
2130	movq	ARG3, %r11 // inv_diag_A
2131
2132#if MACRO_LEVEL>=1
2133	INNER_EDGE_TRSV_LN_INV_8_LIB8
2134#else
2135	CALL(inner_edge_trsv_ln_inv_8_lib8)
2136#endif
2137
2138
2139	// store
2140
2141	movq	ARG6, %r10 // z
2142
2143#if MACRO_LEVEL>=1
2144	INNER_STORE_8_LIB8
2145#else
2146	CALL(inner_store_8_lib8)
2147#endif
2148
2149
2150	EPILOGUE
2151
2152	ret
2153
2154	FUN_END(kernel_strsv_ln_inv_8_lib8)
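
// Reference semantics of the kernel above as a scalar C sketch (not
// assembled): first w = y - A[0:8,0:k]*x[0:k], then forward substitution with
// the 8x8 lower-triangular block that starts at panel column k, whose
// reciprocal diagonal is inv_diag_A.  Assumptions: the hypothetical helper
// name and the panel-major bs = 8 layout.
#if 0
static void kernel_strsv_ln_inv_8_ref(int k, const float *A, const float *inv_diag_A,
		const float *x, const float *y, float *z)
	{
	float w[8];
	for (int i = 0; i < 8; i++)
		{
		float s = 0.0f;
		for (int j = 0; j < k; j++)
			s += A[j*8 + i] * x[j];
		w[i] = y[i] - s;
		}
	const float *E = A + 8*k;                // the lower-triangular block
	for (int j = 0; j < 8; j++)
		{
		w[j] *= inv_diag_A[j];
		for (int i = j+1; i < 8; i++)
			w[i] -= E[j*8 + i] * w[j];
		}
	for (int i = 0; i < 8; i++)
		z[i] = w[i];
	}
#endif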
2155
2156
2157
2158
2159
//                                    1      2         3                  4         5         6         7       8
// void kernel_strsv_ln_inv_8_vs_lib8(int k, float *A, float *inv_diag_A, float *x, float *y, float *z, int km, int kn);
2162
2163	.p2align 4,,15
2164	GLOB_FUN_START(kernel_strsv_ln_inv_8_vs_lib8)
2165
2166	PROLOGUE
2167
2168	// zero accumulation registers
2169
2170	vxorps	%ymm0, %ymm0, %ymm0
2171	vmovaps	%ymm0, %ymm1
2172	vmovaps	%ymm0, %ymm2
2173	vmovaps	%ymm0, %ymm3
2174	vmovaps	%ymm0, %ymm4
2175	vmovaps	%ymm0, %ymm5
2176	vmovaps	%ymm0, %ymm6
2177	vmovaps	%ymm0, %ymm7
2178
2179
	// call inner sgemv kernel n
2181
2182	movq	ARG1, %r10 // k
2183	movq	ARG2, %r11  // A
2184	movq	ARG4, %r12  // x
2185
2186#if MACRO_LEVEL>=2
2187	INNER_KERNEL_GEMV_ADD_N_8_LIB8
2188#else
2189	CALL(inner_kernel_gemv_add_n_8_lib8)
2190#endif
2191
	movq	%r11, %r13 // A+k*bs*sizeof(float)
2193
2194
2195	// call inner blender n
2196
2197	movq	ARG5, %r10   // y
2198
2199#if MACRO_LEVEL>=1
2200	INNER_BLEND_N_SCALE_M11_8_LIB8
2201#else
2202	CALL(inner_blend_n_scale_m11_8_lib8)
2203#endif
2204
2205
2206	// solution
2207
	movq	%r13, %r10 // A+k*bs*sizeof(float)
2209	movq	ARG3, %r11 // inv_diag_A
2210	movq	ARG8, %r12 // kn
2211
2212#if MACRO_LEVEL>=1
2213	INNER_EDGE_TRSV_LN_INV_8_VS_LIB8
2214#else
2215	CALL(inner_edge_trsv_ln_inv_8_vs_lib8)
2216#endif
2217
2218
2219	// store
2220
2221	movq	ARG6, %r10 // z
2222	movq	ARG7, %r11 // km
2223
2224#if MACRO_LEVEL>=1
2225	INNER_STORE_8_VS_LIB8
2226#else
2227	CALL(inner_store_8_vs_lib8)
2228#endif
2229
2230
2231	EPILOGUE
2232
2233	ret
2234
2235	FUN_END(kernel_strsv_ln_inv_8_vs_lib8)
2236
2237
2238
2239
2240
//                                 1      2         3        4                  5         6         7
// void kernel_strsv_lt_inv_8_lib8(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z);
2243
2244	.p2align 4,,15
2245	GLOB_FUN_START(kernel_strsv_lt_inv_8_lib8)
2246
2247	PROLOGUE
2248
2249	// zero accumulation registers
2250
2251	vxorps	%ymm0, %ymm0, %ymm0
2252	vmovaps	%ymm0, %ymm1
2253	vmovaps	%ymm0, %ymm2
2254	vmovaps	%ymm0, %ymm3
2255	vmovaps	%ymm0, %ymm4
2256	vmovaps	%ymm0, %ymm5
2257	vmovaps	%ymm0, %ymm6
2258	vmovaps	%ymm0, %ymm7
2259
2260
	// call inner sgemv kernel t
2262
2263	movq	ARG1, %r10 // k
2264	subl	$8, %r10d
2265	movq	ARG2, %r11 // A
2266	movq	ARG3, %r12
2267	sall	$5, %r12d // 8*sda*sizeof(float)
2268	addq	%r12, %r11 // A+8*sda*sizeof(float)
2269	movq	ARG5, %r13 // x
2270	addq	$32, %r13 // x+8
2271
2272#if MACRO_LEVEL>=2
2273	INNER_KERNEL_GEMV_ADD_T_8_LIB8
2274#else
2275	CALL(inner_kernel_gemv_add_t_8_lib8)
2276#endif
2277
2278
2279	// call inner blender t
2280
2281	movq	ARG6, %r10 // y
2282
2283#if MACRO_LEVEL>=1
2284	INNER_BLEND_T_SCALE_M11_8_LIB8
2285#else
2286	CALL(inner_blend_t_scale_m11_8_lib8)
2287#endif
2288
2289
2290	// solution
2291
2292	movq	ARG2, %r10 // A
2293	movq	ARG4, %r11 // inv_diag_A
2294
2295#if MACRO_LEVEL>=1
2296	INNER_EDGE_TRSV_LT_INV_8_LIB8
2297#else
2298	CALL(inner_edge_trsv_lt_inv_8_lib8)
2299#endif
2300
2301
2302	// store
2303
2304	movq	ARG7, %r10 // z
2305
2306#if MACRO_LEVEL>=1
2307	INNER_STORE_8_LIB8
2308#else
2309	CALL(inner_store_8_lib8)
2310#endif
2311
2312
2313	EPILOGUE
2314
2315	ret
2316
2317	FUN_END(kernel_strsv_lt_inv_8_lib8)
2318
2319
2320
2321
2322
//                                    1      2         3        4                  5         6         7         8       9
// void kernel_strsv_lt_inv_8_vs_lib8(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z, int km, int kn);
2325
2326	.p2align 4,,15
2327	GLOB_FUN_START(kernel_strsv_lt_inv_8_vs_lib8)
2328
2329	PROLOGUE
2330
2331	// zero accumulation registers
2332
2333	vxorps	%ymm0, %ymm0, %ymm0
2334	vmovaps	%ymm0, %ymm1
2335	vmovaps	%ymm0, %ymm2
2336	vmovaps	%ymm0, %ymm3
2337	vmovaps	%ymm0, %ymm4
2338	vmovaps	%ymm0, %ymm5
2339	vmovaps	%ymm0, %ymm6
2340	vmovaps	%ymm0, %ymm7
2341
2342
	// call inner sgemv kernel t
2344
2345	movq	ARG1, %r10 // k
2346	subl	$8, %r10d
2347	movq	ARG2, %r11 // A
2348	movq	ARG3, %r12
2349	sall	$5, %r12d // 8*sda*sizeof(float)
2350	addq	%r12, %r11 // A+8*sda*sizeof(float)
2351	movq	ARG5, %r13 // x
2352	addq	$32, %r13 // x+8
2353
2354#if MACRO_LEVEL>=2
2355	INNER_KERNEL_GEMV_ADD_T_8_LIB8
2356#else
2357	CALL(inner_kernel_gemv_add_t_8_lib8)
2358#endif
2359
2360
2361	// call inner blender t
2362
2363	movq	ARG6, %r10 // y
2364
2365#if MACRO_LEVEL>=1
2366	INNER_BLEND_T_SCALE_M11_8_LIB8
2367#else
2368	CALL(inner_blend_t_scale_m11_8_lib8)
2369#endif
2370
2371
2372	// solution
2373
2374	movq	ARG2, %r10 // A
2375	movq	ARG4, %r11 // inv_diag_A
2376	movq	ARG8, %r12 // km
2377	movq	ARG9, %r13 // kn
2378	movq	ARG5, %r14 // x
2379
2380#if MACRO_LEVEL>=1
2381	INNER_EDGE_TRSV_LT_INV_8_VS_LIB8
2382#else
2383	CALL(inner_edge_trsv_lt_inv_8_vs_lib8)
2384#endif
2385
2386
2387	// store
2388
2389	movq	ARG7, %r10 // z
2390	movq	ARG9, %r11 // kn
2391
2392#if MACRO_LEVEL>=1
2393	INNER_STORE_8_VS_LIB8
2394#else
2395	CALL(inner_store_8_vs_lib8)
2396#endif
2397
2398
2399	EPILOGUE
2400
2401	ret
2402
2403	FUN_END(kernel_strsv_lt_inv_8_vs_lib8)
2404
2405
2406
2407
2408
2409	// read-only data
2410#if defined(OS_LINUX)
2411	.section	.rodata.cst32,"aM",@progbits,32
2412#elif defined(OS_MAC)
2413	.section	__TEXT,__const
2414#elif defined(OS_WINDOWS)
2415	.section .rdata,"dr"
2416#endif
2417
2418#if defined(OS_LINUX) | defined(OS_WINDOWS)
2419	.align 32
2420.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
2421#elif defined(OS_MAC)
2422	.align 5
2423LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
2424#endif
2425	.float	0.5
2426	.float	1.5
2427	.float	2.5
2428	.float	3.5
2429	.float	4.5
2430	.float	5.5
2431	.float	6.5
2432	.float	7.5
2433
2434
2435
2436
2437#if defined(OS_LINUX)
2438	.section	.note.GNU-stack,"",@progbits
2439#elif defined(OS_MAC)
2440	.subsections_via_symbols
2441#endif
2442
2443