1/*********************************************************************/
2/*                                                                   */
3/*             Optimized BLAS libraries                              */
4/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
5/*                                                                   */
6/* Copyright (c) The University of Texas, 2009. All rights reserved. */
7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
13/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
14/* Under no circumstances shall University be liable for incidental, */
15/* special, indirect, direct or consequential damages or loss of     */
16/* profits, interruption of business, or related expenses which may  */
17/* arise from use of Software or Documentation, including but not    */
18/* limited to those resulting from defects in Software and/or        */
19/* Documentation, or loss or inaccuracy of data of any kind.         */
20/*********************************************************************/
21
22#define ASSEMBLER
23#include "common.h"
24
/*
 * Prefetch tuning.
 * RPREFETCHSIZE is the look-ahead distance, in elements (it is always
 * multiplied by SIZE), used by the prefetches in the main 4x4 loop --
 * both the source-row PREFETCH lines and the destination PREFETCHW lines.
 * NOTE(review): WPREFETCHSIZE is defined but not referenced anywhere in
 * the visible code; confirm whether the PREFETCHW lines were meant to
 * use it.
 */
#define RPREFETCHSIZE	8
#define WPREFETCHSIZE (RPREFETCHSIZE * 4)
/* Prefetch instruction applied to the source rows. */
#define PREFETCH      prefetcht0
/* Prefetch instruction applied to the destination buffer. */
#define PREFETCHW     prefetcht2

/*
 * STACK is the number of bytes pushed by the routine on entry (four
 * 32-bit pushl instructions = 16 bytes).  ARGS is an additional
 * displacement added to every argument offset; it is 0 here.
 */
#define STACK	16
#define ARGS	 0

/*
 * Incoming arguments, addressed relative to %esp after the four
 * register pushes.  The first argument sits 4 bytes above the entry
 * %esp (just past the return address); the others follow at 4-byte
 * steps.
 * NOTE(review): presumably the standard 32-bit C stack-argument
 * convention -- confirm against the C prototype in the callers.
 */
#define M	 4 + STACK + ARGS(%esp)
#define N	 8 + STACK + ARGS(%esp)
#define ARG_A	12 + STACK + ARGS(%esp)
#define ARG_LDA	16 + STACK + ARGS(%esp)
#define ARG_B	20 + STACK + ARGS(%esp)

/*
 * Register roles:
 *   A    base pointer of the current source panel
 *   B    destination write pointer (only ever advanced)
 *   LDA  source line stride, scaled by BASE_SHIFT after loading
 *   A1   pointer to the current source line
 *   A2   pointer to the line two strides after A1 (A1 + 2 * LDA)
 *   I    inner loop counter (derived from M)
 *   J    outer loop counter (derived from N)
 */
#define A	%eax
#define B	%ebx
#define LDA	%ebp
#define A1	%ecx
#define A2	%edx
#define I	%esi
#define J	%edi
46
47	PROLOGUE
48
49	pushl	%ebp
50	pushl	%edi
51	pushl	%esi
52	pushl	%ebx
53
54	PROFCODE
55
56	movl	ARG_A, A
57	movl	ARG_B, B
58	movl	ARG_LDA, LDA
59
60	sall	$BASE_SHIFT, LDA
61
62	movl	N,  J
63	sarl	$2, J
64	je	.L20
65	ALIGN_3
66
67.L10:
68	movl	A, A1
69	leal	(A, LDA, 2), A2
70	addl	$4 * SIZE, A
71
72	movl	M,  I
73	sarl	$2, I
74	je	.L15
75	ALIGN_3
76
77.L12:
78	PREFETCH	RPREFETCHSIZE * SIZE(A1)
79
80	movsd	0 * SIZE(A1)     , %xmm0
81	movhps	1 * SIZE(A1)     , %xmm0
82	movsd	2 * SIZE(A1)     , %xmm1
83	movhps	3 * SIZE(A1)     , %xmm1
84
85	PREFETCH	RPREFETCHSIZE * SIZE(A1, LDA)
86
87	movsd	0 * SIZE(A1, LDA), %xmm2
88	movhps	1 * SIZE(A1, LDA), %xmm2
89	movsd	2 * SIZE(A1, LDA), %xmm3
90	movhps	3 * SIZE(A1, LDA), %xmm3
91
92	PREFETCH	RPREFETCHSIZE * SIZE(A2)
93
94	movsd	0 * SIZE(A2)     , %xmm4
95	movhps	1 * SIZE(A2)     , %xmm4
96	movsd	2 * SIZE(A2)     , %xmm5
97	movhps	3 * SIZE(A2)     , %xmm5
98
99	PREFETCH	RPREFETCHSIZE * SIZE(A2, LDA)
100
101	movsd	0 * SIZE(A2, LDA), %xmm6
102	movhps	1 * SIZE(A2, LDA), %xmm6
103	movsd	2 * SIZE(A2, LDA), %xmm7
104	movhps	3 * SIZE(A2, LDA), %xmm7
105
106	PREFETCHW	(RPREFETCHSIZE + 0) * SIZE(B)
107
108	movaps	%xmm0, 0 * SIZE(B)
109	movaps	%xmm1, 2 * SIZE(B)
110	movaps	%xmm2, 4 * SIZE(B)
111	movaps	%xmm3, 6 * SIZE(B)
112
113	PREFETCHW	(RPREFETCHSIZE + 8) * SIZE(B)
114
115	movaps	%xmm4,  8 * SIZE(B)
116	movaps	%xmm5, 10 * SIZE(B)
117	movaps	%xmm6, 12 * SIZE(B)
118	movaps	%xmm7, 14 * SIZE(B)
119
120	leal	(A1, LDA, 4), A1
121	leal	(A2, LDA, 4), A2
122	subl	$-16 * SIZE, B
123	decl	I
124	jne	.L12
125	ALIGN_3
126
127.L15:
128	testl	$2, M
129	jle	.L16
130
131	movsd	0 * SIZE(A1)     , %xmm0
132	movhps	1 * SIZE(A1)     , %xmm0
133	movsd	2 * SIZE(A1)     , %xmm1
134	movhps	3 * SIZE(A1)     , %xmm1
135
136	movsd	0 * SIZE(A1, LDA), %xmm2
137	movhps	1 * SIZE(A1, LDA), %xmm2
138	movsd	2 * SIZE(A1, LDA), %xmm3
139	movhps	3 * SIZE(A1, LDA), %xmm3
140
141	movaps	%xmm0, 0 * SIZE(B)
142	movaps	%xmm1, 2 * SIZE(B)
143	movaps	%xmm2, 4 * SIZE(B)
144	movaps	%xmm3, 6 * SIZE(B)
145
146	leal	(A1, LDA, 2), A1
147	subl	$-8 * SIZE, B
148	ALIGN_4
149
150.L16:
151	testl	$1, M
152	jle	.L19
153
154	movsd	0 * SIZE(A1)     , %xmm0
155	movhps	1 * SIZE(A1)     , %xmm0
156	movsd	2 * SIZE(A1)     , %xmm1
157	movhps	3 * SIZE(A1)     , %xmm1
158
159	movaps	%xmm0, 0 * SIZE(B)
160	movaps	%xmm1, 2 * SIZE(B)
161	subl	$-4 * SIZE, B
162	ALIGN_4
163
164.L19:
165	decl	J
166	jne	.L10
167	ALIGN_3
168
169.L20:
170	testl	$2, N
171	jle	.L30
172
173	movl	A, A1
174	leal	(A, LDA, 2), A2
175	addl	$2 * SIZE, A
176
177	movl	M,  I
178	sarl	$2, I
179	je	.L25
180	ALIGN_3
181
182.L22:
183	movsd	0 * SIZE(A1)     , %xmm0
184	movhps	1 * SIZE(A1)     , %xmm0
185	movsd	0 * SIZE(A1, LDA), %xmm1
186	movhps	1 * SIZE(A1, LDA), %xmm1
187
188	movsd	0 * SIZE(A2)     , %xmm2
189	movhps	1 * SIZE(A2)     , %xmm2
190	movsd	0 * SIZE(A2, LDA), %xmm3
191	movhps	1 * SIZE(A2, LDA), %xmm3
192
193	movaps	%xmm0, 0 * SIZE(B)
194	movaps	%xmm1, 2 * SIZE(B)
195	movaps	%xmm2, 4 * SIZE(B)
196	movaps	%xmm3, 6 * SIZE(B)
197
198	leal	(A1, LDA, 4), A1
199	leal	(A2, LDA, 4), A2
200	subl	$-8 * SIZE, B
201	decl	I
202	jne	.L22
203	ALIGN_3
204
205.L25:
206	testl	$2, M
207	jle	.L26
208
209	movsd	0 * SIZE(A1)     , %xmm0
210	movhps	1 * SIZE(A1)     , %xmm0
211	movsd	0 * SIZE(A1, LDA), %xmm1
212	movhps	1 * SIZE(A1, LDA), %xmm1
213
214	movaps	%xmm0, 0 * SIZE(B)
215	movaps	%xmm1, 2 * SIZE(B)
216
217	leal	(A1, LDA, 2), A1
218	subl	$-4 * SIZE, B
219	ALIGN_4
220
221.L26:
222	testl	$1, M
223	jle	.L30
224
225	movsd	0 * SIZE(A1)     , %xmm0
226	movhps	1 * SIZE(A1)     , %xmm0
227
228	movaps	%xmm0, 0 * SIZE(B)
229	subl	$-2 * SIZE, B
230	ALIGN_4
231
232.L30:
233	testl	$1, N
234	jle	.L999
235
236	movl	A, A1
237	leal	(A, LDA, 2), A2
238
239	movl	M,  I
240	sarl	$2, I
241	je	.L35
242	ALIGN_3
243
244.L32:
245	movsd	0 * SIZE(A1)     , %xmm0
246	movhps	0 * SIZE(A1, LDA), %xmm0
247	movsd	0 * SIZE(A2)     , %xmm1
248	movhps	0 * SIZE(A2, LDA), %xmm1
249
250	movaps	%xmm0, 0 * SIZE(B)
251	movaps	%xmm1, 2 * SIZE(B)
252
253	leal	(A1, LDA, 4), A1
254	leal	(A2, LDA, 4), A2
255	subl	$-4 * SIZE, B
256	decl	I
257	jne	.L32
258	ALIGN_3
259
260.L35:
261	testl	$2, M
262	jle	.L36
263
264	movsd	0 * SIZE(A1)     , %xmm0
265	movhps	0 * SIZE(A1, LDA), %xmm0
266
267	movaps	%xmm0, 0 * SIZE(B)
268
269	leal	(A1, LDA, 2), A1
270	subl	$-2 * SIZE, B
271	ALIGN_4
272
273.L36:
274	testl	$1, M
275	jle	.L999
276
277	movsd	0 * SIZE(A1)     , %xmm0
278	movsd	%xmm0, 0 * SIZE(B)
279	ALIGN_4
280
281.L999:
282	popl	%ebx
283	popl	%esi
284	popl	%edi
285	popl	%ebp
286	ret
287
288	EPILOGUE
289