/**************************************************************************************************
*                                                                                                 *
* This file is part of BLASFEO.                                                                   *
*                                                                                                 *
* BLASFEO -- BLAS For Embedded Optimization.                                                      *
* Copyright (C) 2019 by Gianluca Frison.                                                          *
* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
* All rights reserved.                                                                            *
*                                                                                                 *
* The 2-Clause BSD License                                                                        *
*                                                                                                 *
* Redistribution and use in source and binary forms, with or without                              *
* modification, are permitted provided that the following conditions are met:                     *
*                                                                                                 *
* 1. Redistributions of source code must retain the above copyright notice, this                  *
*    list of conditions and the following disclaimer.                                             *
* 2. Redistributions in binary form must reproduce the above copyright notice,                    *
*    this list of conditions and the following disclaimer in the documentation                    *
*    and/or other materials provided with the distribution.                                       *
*                                                                                                 *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND                 *
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED                   *
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE                          *
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR                 *
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES                  *
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;                    *
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND                     *
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT                      *
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS                   *
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                                    *
*                                                                                                 *
* Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de                             *
*                                                                                                 *
**************************************************************************************************/

#if defined(OS_LINUX) | defined(OS_MAC)

//#define STACKSIZE 96
#define STACKSIZE 64
#define ARG1  %rdi
#define ARG2  %rsi
#define ARG3  %rdx
#define ARG4  %rcx
#define ARG5  %r8
#define ARG6  %r9
#define ARG7  STACKSIZE +  8(%rsp)
#define ARG8  STACKSIZE + 16(%rsp)
#define ARG9  STACKSIZE + 24(%rsp)
#define ARG10 STACKSIZE + 32(%rsp)
#define ARG11 STACKSIZE + 40(%rsp)
#define ARG12 STACKSIZE + 48(%rsp)
#define ARG13 STACKSIZE + 56(%rsp)
#define ARG14 STACKSIZE + 64(%rsp)
#define ARG15 STACKSIZE + 72(%rsp)
#define ARG16 STACKSIZE + 80(%rsp)
#define ARG17 STACKSIZE + 88(%rsp)
#define ARG18 STACKSIZE + 96(%rsp)
#define PROLOGUE \
	subq	$STACKSIZE, %rsp; \
	movq	%rbx,   (%rsp); \
	movq	%rbp,  8(%rsp); \
	movq	%r12, 16(%rsp); \
	movq	%r13, 24(%rsp); \
	movq	%r14, 32(%rsp); \
	movq	%r15, 40(%rsp);
#define EPILOGUE \
	movq	  (%rsp), %rbx; \
	movq	 8(%rsp), %rbp; \
	movq	16(%rsp), %r12; \
	movq	24(%rsp), %r13; \
	movq	32(%rsp), %r14; \
	movq	40(%rsp), %r15; \
	addq	$STACKSIZE, %rsp;

#if defined(OS_LINUX)

#define GLOB_FUN_START(NAME) \
	.globl NAME; \
	.type NAME, @function; \
NAME:
#define FUN_START(NAME) \
	.type NAME, @function; \
NAME:
#define FUN_END(NAME) \
	.size	NAME, .-NAME
#define CALL(NAME) \
	call NAME
#define ZERO_ACC \
	xorpd	%xmm0, %xmm0; \
	movapd	%xmm0, %xmm1; \
	movapd	%xmm0, %xmm2; \
	movapd	%xmm0, %xmm3
//#define NEG_ACC \
//	movapd		.LC11(%rip), %xmm15; \
//	xorpd		%xmm15, %xmm0; \
//	xorpd		%xmm15, %xmm1; \
//	xorpd		%xmm15, %xmm2; \
//	xorpd		%xmm15, %xmm3; \
//	xorpd		%xmm15, %xmm4; \
//	xorpd		%xmm15, %xmm5; \
//	xorpd		%xmm15, %xmm6; \
//	xorpd		%xmm15, %xmm7

#else // defined(OS_MAC)

#define GLOB_FUN_START(NAME) \
	.globl _ ## NAME; \
_ ## NAME:
#define FUN_START(NAME) \
_ ## NAME:
#define FUN_END(NAME)
#define CALL(NAME) \
	callq _ ## NAME
#define ZERO_ACC \
	xorpd	%xmm0, %xmm0; \
	movapd	%xmm0, %xmm1; \
	movapd	%xmm0, %xmm2; \
	movapd	%xmm0, %xmm3
//#define NEG_ACC \
//	movapd		LC11(%rip), %xmm15; \
//	xorpd		%xmm15, %xmm0; \
//	xorpd		%xmm15, %xmm1; \
//	xorpd		%xmm15, %xmm2; \
//	xorpd		%xmm15, %xmm3; \
//	xorpd		%xmm15, %xmm4; \
//	xorpd		%xmm15, %xmm5; \
//	xorpd		%xmm15, %xmm6; \
//	xorpd		%xmm15, %xmm7

#endif

#elif defined(OS_WINDOWS)

#define STACKSIZE 256
#define ARG1  %rcx
#define ARG2  %rdx
#define ARG3  %r8
#define ARG4  %r9
#define ARG5  STACKSIZE + 40(%rsp)
#define ARG6  STACKSIZE + 48(%rsp)
#define ARG7  STACKSIZE + 56(%rsp)
#define ARG8  STACKSIZE + 64(%rsp)
#define ARG9  STACKSIZE + 72(%rsp)
#define ARG10 STACKSIZE + 80(%rsp)
#define ARG11 STACKSIZE + 88(%rsp)
#define ARG12 STACKSIZE + 96(%rsp)
#define ARG13 STACKSIZE + 104(%rsp)
#define ARG14 STACKSIZE + 112(%rsp)
#define ARG15 STACKSIZE + 120(%rsp)
#define ARG16 STACKSIZE + 128(%rsp)
#define ARG17 STACKSIZE + 136(%rsp)
#define ARG18 STACKSIZE + 144(%rsp)
#define PROLOGUE \
	subq	$STACKSIZE, %rsp; \
	movq	%rbx,   (%rsp); \
	movq	%rbp,  8(%rsp); \
	movq	%r12, 16(%rsp); \
	movq	%r13, 24(%rsp); \
	movq	%r14, 32(%rsp); \
	movq	%r15, 40(%rsp); \
	movq	%rdi, 48(%rsp); \
	movq	%rsi, 56(%rsp); \
	movups	%xmm6, 64(%rsp); \
	movups	%xmm7, 80(%rsp); \
	movups	%xmm8, 96(%rsp); \
	movups	%xmm9, 112(%rsp); \
	movups	%xmm10, 128(%rsp); \
	movups	%xmm11, 144(%rsp); \
	movups	%xmm12, 160(%rsp); \
	movups	%xmm13, 176(%rsp); \
	movups	%xmm14, 192(%rsp); \
	movups	%xmm15, 208(%rsp);
#define EPILOGUE \
	movq	  (%rsp), %rbx; \
	movq	 8(%rsp), %rbp; \
	movq	16(%rsp), %r12; \
	movq	24(%rsp), %r13; \
	movq	32(%rsp), %r14; \
	movq	40(%rsp), %r15; \
	movq	48(%rsp), %rdi; \
	movq	56(%rsp), %rsi; \
	movups	64(%rsp), %xmm6; \
	movups	80(%rsp), %xmm7; \
	movups	96(%rsp), %xmm8; \
	movups	112(%rsp), %xmm9; \
	movups	128(%rsp), %xmm10; \
	movups	144(%rsp), %xmm11; \
	movups	160(%rsp), %xmm12; \
	movups	176(%rsp), %xmm13; \
	movups	192(%rsp), %xmm14; \
	movups	208(%rsp), %xmm15; \
	addq	$STACKSIZE, %rsp;

#define GLOB_FUN_START(NAME) \
	.globl NAME; \
	.def NAME; .scl 2; .type 32; .endef; \
NAME:
#define FUN_START(NAME) \
	.def NAME; .scl 2; .type 32; .endef; \
NAME:
#define FUN_END(NAME)
#define CALL(NAME) \
	call NAME
#define ZERO_ACC \
	xorpd	%xmm0, %xmm0; \
	movapd	%xmm0, %xmm1; \
	movapd	%xmm0, %xmm2; \
	movapd	%xmm0, %xmm3
//#define NEG_ACC \
//	movapd		.LC11(%rip), %xmm15; \
//	xorpd		%xmm15, %xmm0; \
//	xorpd		%xmm15, %xmm1; \
//	xorpd		%xmm15, %xmm2; \
//	xorpd		%xmm15, %xmm3; \
//	xorpd		%xmm15, %xmm4; \
//	xorpd		%xmm15, %xmm5; \
//	xorpd		%xmm15, %xmm6; \
//	xorpd		%xmm15, %xmm7

#else

#error wrong OS

#endif



#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.text
#elif defined(OS_MAC)
	.section	__TEXT,__text,regular,pure_instructions
#endif





// common inner routine with file scope
//
// input arguments:
// r10d  <- k
// r11   <- A
// r12   <- x
// xmm0 <- [z0 z1]_a
// xmm1 <- [z2 z3]_a
// xmm2 <- [z0 z1]_b
// xmm3 <- [z2 z3]_b

//
// output arguments:
// r10d  <- 0
// r11   <- A+4*k*sizeof(double)
// r12   <- x+k*sizeof(double)
// xmm0 <- [z0 z1]_a
// xmm1 <- [z2 z3]_a
// xmm2 <- [z0 z1]_b
// xmm3 <- [z2 z3]_b

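// reference semantics (illustrative C sketch, not part of the build; z, A, x
// are hypothetical names): the unrolled loop below accumulates
//
// for(ll=0; ll<k; ll++)
//     {
//     z[0] += A[0+ll*4] * x[ll];
//     z[1] += A[1+ll*4] * x[ll];
//     z[2] += A[2+ll*4] * x[ll];
//     z[3] += A[3+ll*4] * x[ll];
//     }
//
// with even/odd columns kept in the separate _a/_b register pairs, reduced later.
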
#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_DGEMV_ADD_N_4_LIB4
#else
	.p2align 4,,15
	FUN_START(inner_kernel_dgemv_add_n_4_lib4)
#endif

	cmpl	$0, %r10d
	jle		2f // return

	cmpl	$4, %r10d
	jl		0f // clean-up loop

	// main loop
	.p2align 3
1: // main loop

	movddup	0(%r12), %xmm12
	movapd	0(%r11), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm0
	movapd	16(%r11), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm1
	subl	$4, %r10d

	movddup	8(%r12), %xmm12
	movapd	32(%r11), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm2
	movapd	48(%r11), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm3

	movddup	16(%r12), %xmm12
	movapd	64(%r11), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm0
	movapd	80(%r11), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm1

	movddup	24(%r12), %xmm12
	movapd	96(%r11), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm2
	movapd	112(%r11), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm3

	addq	$128, %r11
	addq	$32, %r12

	cmpl	$3, %r10d

	jg		1b // main loop


	// consider clean-up
	cmpl	$0, %r10d
	jle		2f // return

0: // clean-up

	movddup	0(%r12), %xmm12
	movapd	0(%r11), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm0
	movapd	16(%r11), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm1

	addq	$32, %r11
	addq	$8, %r12

	subl	$1, %r10d
	cmpl	$0, %r10d

	jg		0b // clean

2: // return

#if MACRO_LEVEL>=2
	.endm
#else
	ret

	FUN_END(inner_kernel_dgemv_add_n_4_lib4)
#endif





// common inner routine with file scope
//
// input arguments:
// r10d  <- k
// r11   <- A
// r12   <- bs*sda*sizeof(double) = 32*sda
// r13   <- x
// xmm0  <- [z0a z0b]
// xmm1  <- [z1a z1b]
// xmm2  <- [z2a z2b]
// xmm3  <- [z3a z3b]

//
// output arguments:
// r10d  <- 0
// r11   <- A+4*k*sizeof(double)
// r12   <- bs*sda*sizeof(double) = 32*sda
// r13   <- x+k*sizeof(double)
// xmm0  <- [z0a z0b]
// xmm1  <- [z1a z1b]
// xmm2  <- [z2a z2b]
// xmm3  <- [z3a z3b]

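// reference semantics (illustrative C sketch, hypothetical names; A(ll,jj)
// denotes panel-major indexing, with r12 holding the panel stride):
//
// for(ll=0; ll<k; ll++)
//     for(jj=0; jj<4; jj++)
//         z[jj] += A(ll, jj) * x[ll];
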
#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_DGEMV_ADD_T_4_LIB4
#else
	.p2align 4,,15
	FUN_START(inner_kernel_dgemv_add_t_4_lib4)
#endif

	cmpl	$0, %r10d
	jle		2f // return

	cmpl	$4, %r10d
	jl		0f // clean-up loop

	// main loop
	.p2align 3
1: // main loop

	movupd	0(%r13), %xmm12

	movapd	0(%r11), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm0
	subl	$4, %r10d

	movapd	32(%r11), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm1

	movapd	64(%r11), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm2

	movapd	96(%r11), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm3

	movupd	16(%r13), %xmm12

	movapd	16(%r11), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm0

	movapd	48(%r11), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm1

	movapd	80(%r11), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm2

	movapd	112(%r11), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm3

	addq	%r12, %r11
	addq	$32, %r13

	cmpl	$3, %r10d
	jg		1b // main loop


	// consider clean-up
	cmpl	$0, %r10d
	jle		2f // return

0: // clean-up

	movsd	0(%r13), %xmm12

	movsd	0(%r11), %xmm8
	mulsd	%xmm12, %xmm8
	addsd	%xmm8, %xmm0
	subl	$1, %r10d

	movsd	32(%r11), %xmm8
	mulsd	%xmm12, %xmm8
	addsd	%xmm8, %xmm1

	movsd	64(%r11), %xmm8
	mulsd	%xmm12, %xmm8
	addsd	%xmm8, %xmm2

	movsd	96(%r11), %xmm8
	mulsd	%xmm12, %xmm8
	addsd	%xmm8, %xmm3

	addq	$8, %r11
	addq	$8, %r13

	cmpl	$0, %r10d
	jg		0b // clean-up loop


2: // return

#if MACRO_LEVEL>=2
	.endm
#else
	ret

	FUN_END(inner_kernel_dgemv_add_t_4_lib4)
#endif





// common inner routine with file scope
//
// input arguments:
// r10d  <- k
// r11   <- A
// r12   <- bs*sda*sizeof(double) = 32*sda
// r13   <- x_t
// r14   <- z_n
// xmm0  <- [z_t_0a z_t_0b]
// xmm1  <- [z_t_1a z_t_1b]
// xmm2  <- [z_t_2a z_t_2b]
// xmm3  <- [z_t_3a z_t_3b]
// xmm4  <- x_n_0
// xmm5  <- x_n_1
// xmm6  <- x_n_2
// xmm7  <- x_n_3

//
// output arguments:
// r10d  <- 0
// r11   <- A+4*k*sizeof(double)
// r12   <- bs*sda*sizeof(double) = 32*sda
// r13   <- x_t+k*sizeof(double)
// r14   <- z_n+k*sizeof(double)
// xmm0  <- [z_t_0a z_t_0b]
// xmm1  <- [z_t_1a z_t_1b]
// xmm2  <- [z_t_2a z_t_2b]
// xmm3  <- [z_t_3a z_t_3b]
// xmm4  <- x_n_0
// xmm5  <- x_n_1
// xmm6  <- x_n_2
// xmm7  <- x_n_3

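// reference semantics (illustrative C sketch, hypothetical names): one sweep
// over A performs both products,
//
// for(ll=0; ll<k; ll++)
//     for(jj=0; jj<4; jj++)
//         {
//         z_t[jj] += A(ll, jj) * x_t[ll]; // transposed part
//         z_n[ll] += A(ll, jj) * x_n[jj]; // non-transposed part
//         }
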
#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
#else
	.p2align 4,,15
	FUN_START(inner_kernel_dgemv_add_nt_4_lib4)
#endif

	cmpl	$0, %r10d
	jle		2f // return

	cmpl	$4, %r10d
	jl		0f // clean-up loop

	// main loop
	.p2align 3
1: // main loop

	movupd	0(%r13), %xmm9
	movupd	16(%r13), %xmm10
	movupd	0(%r14), %xmm11
	movupd	16(%r14), %xmm12

	subl	$4, %r10d

	movapd	0(%r11), %xmm14
	movapd	%xmm14, %xmm15
	mulpd	%xmm9, %xmm14
	addpd	%xmm14, %xmm0
	mulpd	%xmm4, %xmm15
	addpd	%xmm15, %xmm11

	movapd	16(%r11), %xmm14
	movapd	%xmm14, %xmm15
	mulpd	%xmm10, %xmm14
	addpd	%xmm14, %xmm0
	mulpd	%xmm4, %xmm15
	addpd	%xmm15, %xmm12

	movapd	32(%r11), %xmm14
	movapd	%xmm14, %xmm15
	mulpd	%xmm9, %xmm14
	addpd	%xmm14, %xmm1
	mulpd	%xmm5, %xmm15
	addpd	%xmm15, %xmm11

	movapd	48(%r11), %xmm14
	movapd	%xmm14, %xmm15
	mulpd	%xmm10, %xmm14
	addpd	%xmm14, %xmm1
	mulpd	%xmm5, %xmm15
	addpd	%xmm15, %xmm12

	movapd	64(%r11), %xmm14
	movapd	%xmm14, %xmm15
	mulpd	%xmm9, %xmm14
	addpd	%xmm14, %xmm2
	mulpd	%xmm6, %xmm15
	addpd	%xmm15, %xmm11

	movapd	80(%r11), %xmm14
	movapd	%xmm14, %xmm15
	mulpd	%xmm10, %xmm14
	addpd	%xmm14, %xmm2
	mulpd	%xmm6, %xmm15
	addpd	%xmm15, %xmm12

	movapd	96(%r11), %xmm14
	movapd	%xmm14, %xmm15
	mulpd	%xmm9, %xmm14
	addpd	%xmm14, %xmm3
	mulpd	%xmm7, %xmm15
	addpd	%xmm15, %xmm11

	movapd	112(%r11), %xmm14
	movapd	%xmm14, %xmm15
	mulpd	%xmm10, %xmm14
	addpd	%xmm14, %xmm3
	mulpd	%xmm7, %xmm15
	addpd	%xmm15, %xmm12

	movupd	%xmm11, 0(%r14)
	movupd	%xmm12, 16(%r14)

	addq	%r12, %r11
	addq	$32, %r13
	addq	$32, %r14

	cmpl	$3, %r10d
	jg		1b // main loop


	// consider clean-up
	cmpl	$0, %r10d
	jle		2f // return

0: // clean-up

	movsd	0(%r13), %xmm9
	movsd	0(%r14), %xmm11

	subl	$1, %r10d

	movsd	0(%r11), %xmm14
	movsd	%xmm14, %xmm15
	mulsd	%xmm9, %xmm14
	addsd	%xmm14, %xmm0
	mulsd	%xmm4, %xmm15
	addsd	%xmm15, %xmm11

	movsd	32(%r11), %xmm14
	movsd	%xmm14, %xmm15
	mulsd	%xmm9, %xmm14
	addsd	%xmm14, %xmm1
	mulsd	%xmm5, %xmm15
	addsd	%xmm15, %xmm11

	movsd	64(%r11), %xmm14
	movsd	%xmm14, %xmm15
	mulsd	%xmm9, %xmm14
	addsd	%xmm14, %xmm2
	mulsd	%xmm6, %xmm15
	addsd	%xmm15, %xmm11

	movsd	96(%r11), %xmm14
	movsd	%xmm14, %xmm15
	mulsd	%xmm9, %xmm14
	addsd	%xmm14, %xmm3
	mulsd	%xmm7, %xmm15
	addsd	%xmm15, %xmm11

	movsd	%xmm11, 0(%r14)

	addq	$8, %r11
	addq	$8, %r13
	addq	$8, %r14

	cmpl	$0, %r10d
	jg		0b // clean-up loop

2: // return

#if MACRO_LEVEL>=2
	.endm
#else
	ret

	FUN_END(inner_kernel_dgemv_add_nt_4_lib4)
#endif





// common inner routine with file scope
//
// input arguments:
// r10d  <- k
// r11   <- A
// r12   <- bs*sda*sizeof(double) = 32*sda
// r13   <- x
// r14d  <- offA
// xmm0  <- [z0a z0b]
// xmm1  <- [z1a z1b]
// xmm2  <- [z2a z2b]
// xmm3  <- [z3a z3b]

//
// output arguments:
// r10d  <-
// r11   <-
// r12   <-
// r13   <-
// r14d  <- offA
// xmm0  <- [z0a z0b]
// xmm1  <- [z1a z1b]
// xmm2  <- [z2a z2b]
// xmm3  <- [z3a z3b]

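// reference semantics (illustrative C sketch, hypothetical names): consume the
// kend = min(k, 4-offA) rows that complete the first panel one at a time, then
// realign A to the start of the next panel:
//
// kend = k<4-offA ? k : 4-offA;
// for(ll=0; ll<kend; ll++)
//     for(jj=0; jj<4; jj++)
//         z[jj] += A[ll+jj*4] * x[ll];
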
#if MACRO_LEVEL>=2
	.macro INNER_EDGE_DGEMV_ADD_T_4_LIB4
#else
	.p2align 4,,15
	FUN_START(inner_edge_dgemv_add_t_4_lib4)
#endif

	cmpl			$0, %r14d				// offset==0
	jle				2f						// end

	cmpl			$0, %r10d				// k==0
	jle				2f						// end

	movl			$4, %r15d				// load 4
	subl			%r14d, %r15d			// 4-offsetA
	cmpl			%r10d, %r15d			// k > 4-offsetA
	cmovgl			%r10d, %r15d			// kend=min(k,4-offsetA)

//	movl			%r14d, %eax				// load offsetA
//	sall			$3, %eax				// offsetA*sizeof(double)
//	addq			%rax, %r11				// A+offsetA*sizeof(double)

1:
	movsd	0(%r13), %xmm12

	movsd	0(%r11), %xmm8
	mulsd	%xmm12, %xmm8
	addsd	%xmm8, %xmm0

	movsd	32(%r11), %xmm8
	mulsd	%xmm12, %xmm8
	addsd	%xmm8, %xmm1

	movsd	64(%r11), %xmm8
	mulsd	%xmm12, %xmm8
	addsd	%xmm8, %xmm2

	movsd	96(%r11), %xmm8
	mulsd	%xmm12, %xmm8
	addsd	%xmm8, %xmm3

	subl	$1, %r10d				// k -= 1
	subl	$1, %r15d				// k_panel -= 1
	addq	$8, %r11				// A += 1 (within panel)
	addq	$8, %r13				// x += 1

	cmpl	$0, %r15d				// if k_panel=0
	jg		1b						// loop 1

	cmpl	$0, %r10d				// if k=0
	jle		2f						// end

	addq	%r12, %r11				// A += sda*bs (next panel)
	subq	$32, %r11				// A -= 4*sizeof(double) (undo in-panel advance)

2:

#if MACRO_LEVEL>=2
	.endm
#else
	ret

	FUN_END(inner_edge_dgemv_add_t_4_lib4)
#endif





// common inner routine with file scope
//
// input arguments:
// r10   <- kmax
// r11   <- A
// r12   <- bs*sda*sizeof(double) = 32*sda
// r13   <- x_t
// r14   <- z_n
// xmm0  <- [z_t_0a z_t_0b]
// xmm1  <- [z_t_1a z_t_1b]
// xmm2  <- [z_t_2a z_t_2b]
// xmm3  <- [z_t_3a z_t_3b]
// xmm4  <- x_n_0
// xmm5  <- x_n_1
// xmm6  <- x_n_2
// xmm7  <- x_n_3

//
// output arguments:
// r10   <- kmax-4
// r11   <- A+4*k*sizeof(double)
// r12   <- bs*sda*sizeof(double) = 32*sda
// r13   <- x_t+k*sizeof(double)
// r14   <- z_n+k*sizeof(double)
// xmm0  <- [z_t_0a z_t_0b]
// xmm1  <- [z_t_1a z_t_1b]
// xmm2  <- [z_t_2a z_t_2b]
// xmm3  <- [z_t_3a z_t_3b]
// xmm4  <- x_n_0
// xmm5  <- x_n_1
// xmm6  <- x_n_2
// xmm7  <- x_n_3

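// reference semantics (illustrative C sketch, hypothetical names): the 4x4
// diagonal block is applied through its lower triangle only; the diagonal
// contributes once (to the transposed part), the strictly lower part twice:
//
// for(jj=0; jj<4; jj++)
//     {
//     z_t[jj] += A(jj, jj) * x_t[jj];
//     for(ll=jj+1; ll<4; ll++)
//         {
//         z_t[jj] += A(ll, jj) * x_t[ll];
//         z_n[ll] += A(ll, jj) * x_n[jj];
//         }
//     }
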
#if MACRO_LEVEL>=2
	.macro INNER_EDGE_DSYMV_ADD_NT_4_LIB4
#else
	.p2align 4,,15
	FUN_START(inner_edge_dsymv_add_nt_4_lib4)
#endif

	xorpd	%xmm13, %xmm13

	movupd	0(%r13), %xmm9
	movupd	16(%r13), %xmm10
	movupd	0(%r14), %xmm11
	movupd	16(%r14), %xmm12

	// 0
	movapd	0(%r11), %xmm14
	movapd	%xmm14, %xmm15
	mulpd	%xmm9, %xmm14
	addpd	%xmm14, %xmm0
	movsd	%xmm13, %xmm15 //
	mulpd	%xmm4, %xmm15
	addpd	%xmm15, %xmm11

	movapd	16(%r11), %xmm14
	movapd	%xmm14, %xmm15
	mulpd	%xmm10, %xmm14
	addpd	%xmm14, %xmm0
	mulpd	%xmm4, %xmm15
	addpd	%xmm15, %xmm12

	// 1
	movapd	32(%r11), %xmm14
	movapd	%xmm14, %xmm15
	movsd	%xmm13, %xmm14 //
	mulpd	%xmm9, %xmm14
	addpd	%xmm14, %xmm1
//	movapd	%xmm13, %xmm15 //
//	mulpd	%xmm5, %xmm15
//	addpd	%xmm15, %xmm11

	movapd	48(%r11), %xmm14
	movapd	%xmm14, %xmm15
	mulpd	%xmm10, %xmm14
	addpd	%xmm14, %xmm1
	mulpd	%xmm5, %xmm15
	addpd	%xmm15, %xmm12

	// 2
//	movapd	64(%r11), %xmm14
//	movapd	%xmm14, %xmm15
//	movapd	%xmm13, %xmm14 //
//	mulpd	%xmm9, %xmm14
//	addpd	%xmm14, %xmm2
//	movapd	%xmm13, %xmm15 //
//	mulpd	%xmm6, %xmm15
//	addpd	%xmm15, %xmm11

	movapd	80(%r11), %xmm14
	movapd	%xmm14, %xmm15
	mulpd	%xmm10, %xmm14
	addpd	%xmm14, %xmm2
	movsd	%xmm13, %xmm15 //
	mulpd	%xmm6, %xmm15
	addpd	%xmm15, %xmm12

	// 3
//	movapd	96(%r11), %xmm14
//	movapd	%xmm14, %xmm15
//	movapd	%xmm13, %xmm14 //
//	mulpd	%xmm9, %xmm14
//	addpd	%xmm14, %xmm3
//	movapd	%xmm13, %xmm15 //
//	mulpd	%xmm7, %xmm15
//	addpd	%xmm15, %xmm11

	movapd	112(%r11), %xmm14
	movapd	%xmm14, %xmm15
	movsd	%xmm13, %xmm14 //
	mulpd	%xmm10, %xmm14
	addpd	%xmm14, %xmm3
//	movapd	%xmm13, %xmm15 //
//	mulpd	%xmm7, %xmm15
//	addpd	%xmm15, %xmm12

	movupd	%xmm11, 0(%r14)
	movupd	%xmm12, 16(%r14)

	addq	%r12, %r11
	addq	$32, %r13
	addq	$32, %r14

	subq	$4, %r10

#if MACRO_LEVEL>=2
	.endm
#else
	ret

	FUN_END(inner_edge_dsymv_add_nt_4_lib4)
#endif






#if 0

// common inner routine with file scope
//
// triangular substitution with vector RHS
//
// input arguments:
// r10  <- E
// r11  <- inv_diag_E
// xmm0 <- [z0 z1]
// xmm1 <- [z2 z3]
//
// output arguments:
// r10  <- E
// r11  <- inv_diag_E
// xmm0 <- [z0 z1]
// xmm1 <- [z2 z3]

#if MACRO_LEVEL>=1
	.macro INNER_EDGE_DTRSV_LN_INV_4_LIB4
#else
	.p2align 4,,15
	FUN_START(inner_edge_dtrsv_ln_inv_4_lib4)
#endif

	xorpd			%xmm14, %xmm14

	movddup			0(%r11), %xmm12
	mulpd			%xmm0, %xmm12
	movsd			%xmm12, %xmm0

	movapd			0(%r10), %xmm13
	movsd			%xmm14, %xmm13
	movddup			%xmm0, %xmm12
	mulpd			%xmm13, %xmm12
	subpd			%xmm12, %xmm0
	movddup			8(%r11), %xmm12
	mulpd			%xmm0, %xmm12
	movhpd			%xmm12, %xmm0

	movapd			32(%r10), %ymm13
	vblendpd		$0x3, %ymm14, %ymm13, %ymm13
	vpermilpd		$0x3, %ymm0, %ymm12
	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
	vmulpd			%ymm13, %ymm12, %ymm15
	vsubpd			%ymm15, %ymm0, %ymm0
	vbroadcastsd	16(%r11), %ymm12
	vmulpd			%ymm0, %ymm12, %ymm1
	vblendpd		$0x4, %ymm1, %ymm0, %ymm0

	vmovapd			64(%r10), %ymm13
	vblendpd		$0x7, %ymm14, %ymm13, %ymm13
	vpermilpd		$0x0, %ymm0, %ymm12
	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
	vmulpd			%ymm13, %ymm12, %ymm15
	vsubpd			%ymm15, %ymm0, %ymm0
	vbroadcastsd	24(%r11), %ymm12
	vmulpd			%ymm0, %ymm12, %ymm1
	vblendpd		$0x8, %ymm1, %ymm0, %ymm0

#if MACRO_LEVEL>=1
	.endm
#else
	ret

	FUN_END(inner_edge_dtrsv_ln_inv_4_lib4)
#endif

#endif




// common inner routine with file scope
//
// blend for ta==n, scale for generic alpha and beta
//
// input arguments:
// r10  <- alpha
// r11  <- beta
// r12  <- y
// xmm0 <- [z0 z1]_a
// xmm1 <- [z2 z3]_a
// xmm2 <- [z0 z1]_b
// xmm3 <- [z2 z3]_b
//
// output arguments:
// r10  <- alpha
// r11  <- beta
// r12  <- y
// xmm0 <- [z0 z1]
// xmm1 <- [z2 z3]

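// reference semantics (illustrative C sketch, hypothetical names):
//
// for(ii=0; ii<4; ii++)
//     z[ii] = alpha[0] * (acc_a[ii] + acc_b[ii]) + beta[0] * y[ii];
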
#if MACRO_LEVEL>=1
	.macro INNER_BLEND_N_SCALE_AB_4_LIB4
#else
	.p2align 4,,15
	FUN_START(inner_blend_n_scale_ab_4_lib4)
#endif

	// reduction
	addpd	%xmm2, %xmm0
	addpd	%xmm3, %xmm1

	// alpha
	movddup	0(%r10), %xmm15
	mulpd	%xmm15, %xmm0
	mulpd	%xmm15, %xmm1

	// beta
	movddup	0(%r11), %xmm15
	movupd	0(%r12), %xmm14
	mulpd	%xmm15, %xmm14
	addpd	%xmm14, %xmm0
	movupd	16(%r12), %xmm14
	mulpd	%xmm15, %xmm14
	addpd	%xmm14, %xmm1

#if MACRO_LEVEL>=1
	.endm
#else
	ret

	FUN_END(inner_blend_n_scale_ab_4_lib4)
#endif





// common inner routine with file scope
//
// blend for ta==t, scale for generic alpha and beta
//
// input arguments:
// r10  <- alpha
// r11  <- beta
// r12  <- y
// xmm0 <- [z0a z0b]
// xmm1 <- [z1a z1b]
// xmm2 <- [z2a z2b]
// xmm3 <- [z3a z3b]
//
// output arguments:
// r10  <- alpha
// r11  <- beta
// r12  <- y
// xmm0 <- [z0 z1]
// xmm1 <- [z2 z3]

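// reference semantics (illustrative C sketch, hypothetical names): each xmm
// accumulator holds the two halves of one dot product, folded with haddpd:
//
// for(jj=0; jj<4; jj++)
//     z[jj] = alpha[0] * (acc_lo[jj] + acc_hi[jj]) + beta[0] * y[jj];
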
#if MACRO_LEVEL>=1
	.macro INNER_BLEND_T_SCALE_AB_4_LIB4
#else
	.p2align 4,,15
	FUN_START(inner_blend_t_scale_ab_4_lib4)
#endif

	// reduction
	haddpd	%xmm1, %xmm0
	haddpd	%xmm3, %xmm2
	movapd	%xmm2, %xmm1

	// alpha
	movddup	0(%r10), %xmm15
	mulpd	%xmm15, %xmm0
	mulpd	%xmm15, %xmm1

	// beta
	movddup	0(%r11), %xmm15
	movupd	0(%r12), %xmm14
	mulpd	%xmm15, %xmm14
	addpd	%xmm14, %xmm0
	movupd	16(%r12), %xmm14
	mulpd	%xmm15, %xmm14
	addpd	%xmm14, %xmm1


#if MACRO_LEVEL>=1
	.endm
#else
	ret

	FUN_END(inner_blend_t_scale_ab_4_lib4)
#endif





// common inner routine with file scope
//
// blend for ta==t, scale for generic alpha and beta=1.0
//
// input arguments:
// r10  <- alpha
// r11  <- y
// xmm0 <- [z0a z0b]
// xmm1 <- [z1a z1b]
// xmm2 <- [z2a z2b]
// xmm3 <- [z3a z3b]
//
// output arguments:
// r10  <- alpha
// r11  <- y
// xmm0 <- [z0 z1]
// xmm1 <- [z2 z3]

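// reference semantics (illustrative C sketch, hypothetical names):
//
// for(jj=0; jj<4; jj++)
//     z[jj] = alpha[0] * (acc_lo[jj] + acc_hi[jj]) + y[jj];
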
#if MACRO_LEVEL>=1
	.macro INNER_BLEND_T_SCALE_A1_4_LIB4
#else
	.p2align 4,,15
	FUN_START(inner_blend_t_scale_a1_4_lib4)
#endif

	// reduction
	haddpd	%xmm1, %xmm0
	haddpd	%xmm3, %xmm2
	movapd	%xmm2, %xmm1

	// alpha
	movddup	0(%r10), %xmm15
	mulpd	%xmm15, %xmm0
	mulpd	%xmm15, %xmm1

	// beta
	movupd	0(%r11), %xmm14
	addpd	%xmm14, %xmm0
	movupd	16(%r11), %xmm14
	addpd	%xmm14, %xmm1

#if MACRO_LEVEL>=1
	.endm
#else
	ret

	FUN_END(inner_blend_t_scale_a1_4_lib4)
#endif





// common inner routine with file scope
//
// store
//
// input arguments:
// r10  <- z
// xmm0 <- [z0 z1]
// xmm1 <- [z2 z3]
//
// output arguments:
// r10  <- z
// xmm0 <- [z0 z1]
// xmm1 <- [z2 z3]

#if MACRO_LEVEL>=1
	.macro INNER_STORE_4_LIB4
#else
	.p2align 4,,15
	FUN_START(inner_store_4_lib4)
#endif

	movupd %xmm0,  0(%r10)
	movupd %xmm1, 16(%r10)

#if MACRO_LEVEL>=1
	.endm
#else
	ret

	FUN_END(inner_store_4_lib4)
#endif





// common inner routine with file scope
//
// store vs
//
// input arguments:
// r10   <- D
// r11d  <- km
// xmm0 <- [z0 z1]
// xmm1 <- [z2 z3]
//
// output arguments:
// r10   <- D
// r11d  <- km
// xmm0 <- [z0 z1]
// xmm1 <- [z2 z3]

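// reference semantics (illustrative C sketch, hypothetical names):
//
// for(jj=0; jj<4 && jj<km; jj++)
//     z[jj] = acc[jj];
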
#if MACRO_LEVEL>=1
	.macro INNER_STORE_4_VS_LIB4
#else
	.p2align 4,,15
	FUN_START(inner_store_4_vs_lib4)
#endif

	cmpl	$0, %r11d
	jle		0f // return

	movsd	%xmm0, 0(%r10)

	cmpl	$1, %r11d
	jle		0f // return

	movhpd	%xmm0, 8(%r10)

	cmpl	$2, %r11d
	jle		0f // return

	movsd	%xmm1, 16(%r10)

	cmpl	$3, %r11d
	jle		0f // return

	movhpd	%xmm1, 24(%r10)

0:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

	FUN_END(inner_store_4_vs_lib4)
#endif





//                            1      2              3          4          5             6          7
// void kernel_dgemv_n_4_lib4(int k, double *alpha, double *A, double *x, double *beta, double *y, double *z);

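// equivalent operation (illustrative C sketch, hypothetical names):
//
// for(ii=0; ii<4; ii++)
//     {
//     tmp = 0.0;
//     for(ll=0; ll<k; ll++)
//         tmp += A[ii+ll*4] * x[ll];
//     z[ii] = alpha[0]*tmp + beta[0]*y[ii];
//     }
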
	.p2align 4,,15
	GLOB_FUN_START(kernel_dgemv_n_4_lib4)

	PROLOGUE

	// zero accumulation registers

	ZERO_ACC


	// call inner dgemv kernel n

	movq	ARG1, %r10 // k
	movq	ARG3, %r11  // A
	movq	ARG4, %r12  // x

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMV_ADD_N_4_LIB4
#else
	CALL(inner_kernel_dgemv_add_n_4_lib4)
#endif


	// call inner blend n scale ab

	movq	ARG2, %r10 // alpha
	movq	ARG5, %r11   // beta
	movq	ARG6, %r12   // y

#if MACRO_LEVEL>=1
	INNER_BLEND_N_SCALE_AB_4_LIB4
#else
	CALL(inner_blend_n_scale_ab_4_lib4)
#endif


	// store

	movq	ARG7, %r10 // z

#if MACRO_LEVEL>=1
	INNER_STORE_4_LIB4
#else
	CALL(inner_store_4_lib4)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_dgemv_n_4_lib4)





//                               1      2              3          4          5             6          7          8
// void kernel_dgemv_n_4_vs_lib4(int k, double *alpha, double *A, double *x, double *beta, double *y, double *z, int k1);

	.p2align 4,,15
	GLOB_FUN_START(kernel_dgemv_n_4_vs_lib4)

	PROLOGUE

	// zero accumulation registers

	ZERO_ACC


	// call inner dgemv kernel n

	movq	ARG1, %r10 // k
	movq	ARG3, %r11  // A
	movq	ARG4, %r12  // x

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMV_ADD_N_4_LIB4
#else
	CALL(inner_kernel_dgemv_add_n_4_lib4)
#endif


	// call inner blend n scale ab

	movq	ARG2, %r10 // alpha
	movq	ARG5, %r11   // beta
	movq	ARG6, %r12   // y

#if MACRO_LEVEL>=1
	INNER_BLEND_N_SCALE_AB_4_LIB4
#else
	CALL(inner_blend_n_scale_ab_4_lib4)
#endif


	// store

	movq	ARG7, %r10 // z
	movq	ARG8, %r11 // k1

#if MACRO_LEVEL>=1
	INNER_STORE_4_VS_LIB4
#else
	CALL(inner_store_4_vs_lib4)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_dgemv_n_4_vs_lib4)





//                            1      2              3         4          5        6          7             8          9
// void kernel_dgemv_t_4_lib4(int k, double *alpha, int offA, double *A, int sda, double *x, double *beta, double *y, double *z);

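// equivalent operation (illustrative C sketch, hypothetical names; A(ll,jj)
// is panel-major indexing starting at row offA of the first panel):
//
// for(jj=0; jj<4; jj++)
//     {
//     tmp = 0.0;
//     for(ll=0; ll<k; ll++)
//         tmp += A(ll, jj) * x[ll];
//     z[jj] = alpha[0]*tmp + beta[0]*y[jj];
//     }
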
	.p2align 4,,15
	GLOB_FUN_START(kernel_dgemv_t_4_lib4)

	PROLOGUE

	// zero accumulation registers

	ZERO_ACC


	// call inner dgemv kernel t

	movq	ARG1, %r10 // k
	movq	ARG4, %r11  // A
	movq	ARG5, %r12 // sda
	sall	$5, %r12d // 4*sda*sizeof(double)
//	movslq	%r12d, %r12
	movq	ARG6, %r13  // x
	movq	ARG3, %r14 // offA

#if MACRO_LEVEL>=2
	INNER_EDGE_DGEMV_ADD_T_4_LIB4
#else
	CALL(inner_edge_dgemv_add_t_4_lib4)
#endif

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMV_ADD_T_4_LIB4
#else
	CALL(inner_kernel_dgemv_add_t_4_lib4)
#endif


	// call inner blender t

	movq	ARG2, %r10 // alpha
	movq	ARG7, %r11   // beta
	movq	ARG8, %r12 // y

#if MACRO_LEVEL>=1
	INNER_BLEND_T_SCALE_AB_4_LIB4
#else
	CALL(inner_blend_t_scale_ab_4_lib4)
#endif


	// store

	movq	ARG9, %r10 // z

#if MACRO_LEVEL>=1
	INNER_STORE_4_LIB4
#else
	CALL(inner_store_4_lib4)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_dgemv_t_4_lib4)





//                               1      2              3         4          5        6          7             8          9          10
// void kernel_dgemv_t_4_vs_lib4(int k, double *alpha, int offA, double *A, int sda, double *x, double *beta, double *y, double *z, int km);

	.p2align 4,,15
	GLOB_FUN_START(kernel_dgemv_t_4_vs_lib4)

	PROLOGUE

	// zero accumulation registers

	ZERO_ACC


	// call inner dgemv kernel t

	movq	ARG1, %r10 // k
	movq	ARG4, %r11  // A
	movq	ARG5, %r12 // sda
	sall	$5, %r12d // 4*sda*sizeof(double)
//	movslq	%r12d, %r12
	movq	ARG6, %r13  // x
	movq	ARG3, %r14 // offA

#if MACRO_LEVEL>=2
	INNER_EDGE_DGEMV_ADD_T_4_LIB4
#else
	CALL(inner_edge_dgemv_add_t_4_lib4)
#endif

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMV_ADD_T_4_LIB4
#else
	CALL(inner_kernel_dgemv_add_t_4_lib4)
#endif


	// call inner blender t

	movq	ARG2, %r10 // alpha
	movq	ARG7, %r11   // beta
	movq	ARG8, %r12 // y

#if MACRO_LEVEL>=1
	INNER_BLEND_T_SCALE_AB_4_LIB4
#else
	CALL(inner_blend_t_scale_ab_4_lib4)
#endif


	// store

	movq	ARG9, %r10 // z
	movq	ARG10, %r11 // km

#if MACRO_LEVEL>=1
	INNER_STORE_4_VS_LIB4
#else
	CALL(inner_store_4_vs_lib4)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_dgemv_t_4_vs_lib4)





//                             1      2                3                4          5        6            7            8               9            10           11
// void kernel_dgemv_nt_4_lib4(int k, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t);

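// equivalent operation (illustrative C sketch, hypothetical names): one sweep
// over A computes both
//
// z_t[jj] = beta_t[0]*y_t[jj] + alpha_t[0] * sum_ll A(ll, jj) * x_t[ll]
// z_n[ll] = z_n[ll] + alpha_n[0] * sum_jj A(ll, jj) * x_n[jj]
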
	.p2align 4,,15
	GLOB_FUN_START(kernel_dgemv_nt_4_lib4)

	PROLOGUE

	// zero accumulation registers y_t

	ZERO_ACC

	// initialize x_n
	movq	ARG2, %r10 // alpha_n
	movddup 0(%r10), %xmm15

	movq	ARG6, %r10 // x_n

	movddup 0(%r10), %xmm4
	mulpd	%xmm15, %xmm4
	movddup 8(%r10), %xmm5
	mulpd	%xmm15, %xmm5
	movddup 16(%r10), %xmm6
	mulpd	%xmm15, %xmm6
	movddup 24(%r10), %xmm7
	mulpd	%xmm15, %xmm7


	// inner kernel dgemv nt

	movq	ARG1, %r10 // k
	movq	ARG4, %r11  // A
	movq	ARG5, %r12 // sda
	sall	$5, %r12d // 4*sda*sizeof(double)
//	movslq	%r12d, %r12
	movq	ARG7, %r13  // x_t
	movq	ARG10, %r14  // z_n

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
#else
	CALL(inner_kernel_dgemv_add_nt_4_lib4)
#endif


	// inner blend t scale ab

	movq	ARG3, %r10 // alpha_t
	movq	ARG8, %r11   // beta_t
	movq	ARG9, %r12   // y_t

#if MACRO_LEVEL>=1
	INNER_BLEND_T_SCALE_AB_4_LIB4
#else
	CALL(inner_blend_t_scale_ab_4_lib4)
#endif


	// store

	movq	ARG11, %r10 // z_t

#if MACRO_LEVEL>=1
	INNER_STORE_4_LIB4
#else
	CALL(inner_store_4_lib4)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_dgemv_nt_4_lib4)





//                                1      2                3                4          5        6            7            8               9            10           11           12
// void kernel_dgemv_nt_4_vs_lib4(int k, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t, int km);

	.p2align 4,,15
	GLOB_FUN_START(kernel_dgemv_nt_4_vs_lib4)

	PROLOGUE

	// zero accumulation registers y_t

	xorpd	%xmm0, %xmm0
	movapd	%xmm0, %xmm1
	movapd	%xmm0, %xmm2
	movapd	%xmm0, %xmm3

	movapd	%xmm0, %xmm4
	movapd	%xmm0, %xmm5
	movapd	%xmm0, %xmm6
	movapd	%xmm0, %xmm7

	// initialize x_n
	movq	ARG2, %r10 // alpha_n
	movddup 0(%r10), %xmm15

	movq	ARG6, %r10 // x_n
	movq	ARG12, %r11 // km

	movddup 0(%r10), %xmm4
	mulpd	%xmm15, %xmm4
	cmpl	$2, %r11d
	jl		0f
	movddup 8(%r10), %xmm5
	mulpd	%xmm15, %xmm5
	cmpl	$3, %r11d
	jl		0f
	movddup 16(%r10), %xmm6
	mulpd	%xmm15, %xmm6
	je		0f
	movddup 24(%r10), %xmm7
	mulpd	%xmm15, %xmm7
0:

	// inner kernel dgemv nt

	movq	ARG1, %r10 // k
	movq	ARG4, %r11  // A
	movq	ARG5, %r12 // sda
	sall	$5, %r12d // 4*sda*sizeof(double)
//	movslq	%r12d, %r12
	movq	ARG7, %r13  // x_t
	movq	ARG10, %r14  // z_n

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
#else
	CALL(inner_kernel_dgemv_add_nt_4_lib4)
#endif


	// inner blend t scale ab

	movq	ARG3, %r10 // alpha_t
	movq	ARG8, %r11   // beta_t
	movq	ARG9, %r12   // y_t

#if MACRO_LEVEL>=1
	INNER_BLEND_T_SCALE_AB_4_LIB4
#else
	CALL(inner_blend_t_scale_ab_4_lib4)
#endif


	// store

	movq	ARG11, %r10 // z_t
	movq	ARG12, %r11 // km

#if MACRO_LEVEL>=1
	INNER_STORE_4_VS_LIB4
#else
	CALL(inner_store_4_vs_lib4)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_dgemv_nt_4_vs_lib4)





//                            1      2              3          4        5           6
// void kernel_dsymv_l_4_lib4(int k, double *alpha, double *A, int sda, double *x, double *z);

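// equivalent operation (illustrative C sketch, hypothetical names): with M the
// symmetric matrix defined by the lower triangle of the stored A,
//
// z[ii] = z[ii] + alpha[0] * sum_ll M(ii, ll) * x[ll]
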
	.p2align 4,,15
	GLOB_FUN_START(kernel_dsymv_l_4_lib4)

	PROLOGUE

	// zero accumulation registers y_t

	ZERO_ACC

	// initialize x_n
	movq	ARG2, %r10 // alpha
	movddup 0(%r10), %xmm15

	movq	ARG5, %r10 // x_n

	movddup 0(%r10), %xmm4
	mulpd	%xmm15, %xmm4
	movddup 8(%r10), %xmm5
	mulpd	%xmm15, %xmm5
	movddup 16(%r10), %xmm6
	mulpd	%xmm15, %xmm6
	movddup 24(%r10), %xmm7
	mulpd	%xmm15, %xmm7


	// inner edge dsymv & kernel dgemv nt

	movq	ARG1, %r10 // k
	movq	ARG3, %r11  // A
	movq	ARG4, %r12 // sda
	sall	$5, %r12d // 4*sda*sizeof(double)
	movq	ARG5, %r13  // x_t
	movq	ARG6, %r14  // z_n

#if MACRO_LEVEL>=2
	INNER_EDGE_DSYMV_ADD_NT_4_LIB4
#else
	CALL(inner_edge_dsymv_add_nt_4_lib4)
#endif

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
#else
	CALL(inner_kernel_dgemv_add_nt_4_lib4)
#endif


	// call inner blend t scale a1

	movq	ARG2, %r10 // alpha
	movq	ARG6, %r11   // z

#if MACRO_LEVEL>=1
	INNER_BLEND_T_SCALE_A1_4_LIB4
#else
	CALL(inner_blend_t_scale_a1_4_lib4)
#endif


	// store

	movq	ARG6, %r10 // z

#if MACRO_LEVEL>=1
	INNER_STORE_4_LIB4
#else
	CALL(inner_store_4_lib4)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_dsymv_l_4_lib4)
