1/*********************************************************************/
2/*                                                                   */
3/*             Optimized BLAS libraries                              */
4/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
5/*                                                                   */
6/* Copyright (c) The University of Texas, 2009. All rights reserved. */
7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
13/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
14/* Under no circumstances shall University be liable for incidental, */
15/* special, indirect, direct or consequential damages or loss of     */
16/* profits, interruption of business, or related expenses which may  */
17/* arise from use of Software or Documentation, including but not    */
18/* limited to those resulting from defects in Software and/or        */
19/* Documentation, or loss or inaccuracy of data of any kind.         */
20/*********************************************************************/
21
22#define ASSEMBLER
23#include "common.h"
24
/*
 * Prefetch tuning.  PREFETCH / RPREFETCHSIZE are used on the read side of
 * the copy loops.  PREFETCHW / WPREFETCHSIZE are not referenced by the
 * visible code in this file; they are kept so the set of macros is unchanged.
 */
#define PREFETCH	prefetcht0
#define RPREFETCHSIZE	12
#define PREFETCHW	prefetcht2
#define WPREFETCHSIZE	(RPREFETCHSIZE * 2)

/*
 * Stack frame as seen after the prologue (subl $ARGS, then four pushl):
 *
 *   0 .. STACK-1            saved %ebx, %esi, %edi, %ebp (4 x 4 bytes)
 *   STACK .. STACK+ARGS-1   two 4-byte locals reserved by "subl $ARGS, %esp"
 *   STACK+ARGS              return address
 *   STACK+ARGS+4 ..         caller-pushed arguments
 *
 * Note that ARGS is the size of the local scratch area, not of the
 * incoming argument list.
 */
#define STACK	16
#define ARGS	 8

/* Helpers: expand to exactly "off + STACK(%esp)" / "off + STACK + ARGS(%esp)". */
#define FRAME_LOCAL(off)	off + STACK(%esp)
#define FRAME_ARG(off)		off + STACK + ARGS(%esp)

/* Locals. */
#define J		FRAME_LOCAL(0)	/* remaining count of line pairs (m >> 1) */
#define BOFFSET2	FRAME_LOCAL(4)	/* write cursor for the odd-n tail region */

/* Incoming stack arguments, in push order after the return address. */
#define M	FRAME_ARG( 4)
#define N	FRAME_ARG( 8)
#define A	FRAME_ARG(12)
#define LDA	FRAME_ARG(16)
#define B	FRAME_ARG(20)
41
/*
 * Copy/packing kernel: reads m "lines" of n contiguous elements from A
 * (consecutive lines are LDA elements apart) and writes them to B.
 *
 * Stack arguments (see the M/N/A/LDA/B macros): m, n, a, lda, b.
 * lda is scaled by SIZE, i.e. it is counted in elements, not bytes.
 *
 * Equivalent pseudo-code (indices in elements):
 *
 *   boffset2 = b + m * (n & ~1);
 *   for (j = m >> 1; j > 0; j--) {
 *       a1 = a;  a2 = a + lda;  a += 2 * lda;
 *       bo = b;  b += 4;
 *       for (i = n >> 2; i > 0; i--) {
 *           bo[0..3] = { a1[0], a1[1], a2[0], a2[1] };  bo += 2 * m;
 *           bo[0..3] = { a1[2], a1[3], a2[2], a2[3] };  bo += 2 * m;
 *           a1 += 4;  a2 += 4;
 *       }
 *       if (n & 2) { bo[0..3] = { a1[0], a1[1], a2[0], a2[1] }; a1 += 2; a2 += 2; }
 *       if (n & 1) { boffset2[0..1] = { a1[0], a2[0] }; boffset2 += 2; }
 *   }
 *   if (m & 1) {
 *       for (i = n >> 2; i > 0; i--) {
 *           b[0..1] = { a[0], a[1] };  b += 2 * m;
 *           b[0..1] = { a[2], a[3] };  b += 2 * m;
 *           a += 4;
 *       }
 *       if (n & 2) { b[0..1] = { a[0], a[1] }; a += 2; }
 *       if (n & 1) { boffset2[0] = a[0]; }
 *   }
 *
 * Register roles:
 *   %ebp  read cursor in A (start of the next line / line pair)
 *   %edx  a1, %ecx  a2 (current pair of lines)
 *   %edi  write cursor in B for the next line pair
 *   %eax  bo (write cursor inside the current pair) or scratch
 *   %esi  byte distance between successive 2-element groups in B (2 * m * SIZE)
 *   %ebx  loop counter
 *
 * %ebx, %esi, %edi and %ebp are saved and restored.  %eax is not set to a
 * meaningful result before returning.
 *
 * All 16-byte stores use movaps, which faults on a destination that is not
 * 16-byte aligned.
 *
 * NOTE(review): the movsd/movhps pairs load 8 bytes from k*SIZE and 8 bytes
 * from (k+1)*SIZE, which only yields two distinct elements when SIZE == 8.
 * The non-DOUBLE shift below is kept as in the original, but this routine
 * looks usable for DOUBLE only -- confirm against the build configuration.
 */
	PROLOGUE

	subl	$ARGS, %esp			/* reserve the J / BOFFSET2 locals */
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	B, %edi				/* edi = b */
	movl	A, %ebp				/* ebp = a */

	movl	N, %eax
	movl	M, %ebx
	andl	$-2, %eax			/* eax = n & ~1 */
	movl	%ebx, %esi			/* esi = m (scaled below) */
	imull	%ebx, %eax			/* eax = m * (n & ~1) */

	leal	(%edi, %eax, SIZE), %eax	/* boffset2 = b + m * (n & ~1) */
	movl	%eax, BOFFSET2

#ifdef DOUBLE
	sall	$4, %esi			/* esi = 2 * m * SIZE bytes */
#else
	sall	$3, %esi
#endif

	sarl	$1, %ebx			/* j = m >> 1 (sets the flags tested below) */
	movl	%ebx, J				/* movl leaves the flags untouched */
	jle	.Lodd_line			/* no complete pair of lines */
	ALIGN_4

/* ---- one pair of lines per iteration ---------------------------------- */
.Lpair_loop:
	movl	LDA, %eax
	movl	%ebp, %edx			/* a1 = a */
	leal	(%ebp, %eax, SIZE), %ecx	/* a2 = a + lda */
	leal	(%ecx, %eax, SIZE), %ebp	/* a += 2 * lda */

	movl	%edi, %eax			/* bo = b */
	addl	$4 * SIZE, %edi			/* b += 4 */

	movl	N, %ebx
	sarl	$2, %ebx			/* i = n >> 2 */
	jle	.Lpair_n2
	ALIGN_4

/* Four elements from each of the two lines per iteration. */
.Lpair_n4_loop:
	PREFETCH	RPREFETCHSIZE * SIZE(%edx)

	movsd	0 * SIZE(%edx), %xmm0
	movhps	1 * SIZE(%edx), %xmm0		/* xmm0 = a1[0], a1[1] */
	movsd	0 * SIZE(%ecx), %xmm2
	movhps	1 * SIZE(%ecx), %xmm2		/* xmm2 = a2[0], a2[1] */

	PREFETCH	RPREFETCHSIZE * SIZE(%ecx)

	movsd	2 * SIZE(%edx), %xmm4
	movhps	3 * SIZE(%edx), %xmm4		/* xmm4 = a1[2], a1[3] */
	movsd	2 * SIZE(%ecx), %xmm6
	movhps	3 * SIZE(%ecx), %xmm6		/* xmm6 = a2[2], a2[3] */

	movaps	%xmm0, 0 * SIZE(%eax)		/* first 2x2 tile at bo */
	movaps	%xmm2, 2 * SIZE(%eax)
	movaps	%xmm4, 0 * SIZE(%eax, %esi)	/* second tile at bo + 2 * m */
	movaps	%xmm6, 2 * SIZE(%eax, %esi)

	addl	$4 * SIZE, %ecx
	addl	$4 * SIZE, %edx
	leal	(%eax, %esi, 2), %eax		/* bo += 2 * (2 * m) */
	decl	%ebx
	jne	.Lpair_n4_loop
	ALIGN_4

/* Two leftover elements per line (n & 2). */
.Lpair_n2:
	testl	$2, N
	je	.Lpair_n1

	PREFETCH	RPREFETCHSIZE * SIZE(%edx)
	movsd	0 * SIZE(%edx), %xmm0
	movhps	1 * SIZE(%edx), %xmm0

	PREFETCH	RPREFETCHSIZE * SIZE(%ecx)
	movsd	0 * SIZE(%ecx), %xmm2
	movhps	1 * SIZE(%ecx), %xmm2

	movaps	%xmm0, 0 * SIZE(%eax)
	movaps	%xmm2, 2 * SIZE(%eax)

	addl	$2 * SIZE, %ecx
	addl	$2 * SIZE, %edx
	ALIGN_4

/* One leftover element per line (n & 1): goes to the boffset2 region. */
.Lpair_n1:
	testl	$1, N
	je	.Lpair_next

	movl	BOFFSET2, %eax

	movsd	0 * SIZE(%edx), %xmm0		/* low  = a1[0] */
	movhps	0 * SIZE(%ecx), %xmm0		/* high = a2[0] */
	movaps	%xmm0, 0 * SIZE(%eax)

	addl	$2 * SIZE, BOFFSET2		/* boffset2 += 2 */
	ALIGN_4

.Lpair_next:
	decl	J
	jg	.Lpair_loop
	ALIGN_4

/* ---- final unpaired line, only when m is odd --------------------------- */
.Lodd_line:
	movl	M, %eax
	testl	$1, %eax
	je	.Ldone

	movl	N, %ebx
	sarl	$2, %ebx			/* i = n >> 2 */
	jle	.Lodd_n2
	ALIGN_4

.Lodd_n4_loop:
	movsd	0 * SIZE(%ebp), %xmm0
	movhps	1 * SIZE(%ebp), %xmm0		/* xmm0 = a[0], a[1] */
	movsd	2 * SIZE(%ebp), %xmm2
	movhps	3 * SIZE(%ebp), %xmm2		/* xmm2 = a[2], a[3] */

	movaps	%xmm0, 0 * SIZE(%edi)
	movaps	%xmm2, 0 * SIZE(%edi, %esi)	/* next group, 2 * m further on */

	leal	(%edi, %esi, 2), %edi		/* b += 2 * (2 * m) */
	addl	$4 * SIZE, %ebp
	decl	%ebx
	jg	.Lodd_n4_loop
	ALIGN_4

.Lodd_n2:
	testl	$2, N
	je	.Lodd_n1

	movsd	0 * SIZE(%ebp), %xmm0
	movhps	1 * SIZE(%ebp), %xmm0
	movaps	%xmm0, 0 * SIZE(%edi)
	addl	$2 * SIZE, %ebp
	ALIGN_4

.Lodd_n1:
	testl	$1, N
	je	.Ldone

	movl	BOFFSET2, %eax

	movsd	0 * SIZE(%ebp), %xmm0
	movsd	%xmm0, 0 * SIZE(%eax)		/* single element, 8-byte store */
	ALIGN_4

.Ldone:
	popl	%ebx				/* restore in reverse push order */
	popl	%esi
	popl	%edi
	popl	%ebp
	addl	$ARGS, %esp
	ret

	EPILOGUE
220