/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#ifdef PENTIUM
#define P 88
#endif

#ifndef P
#define P 400
#endif
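
/* P is the row-blocking factor: each pass of the outer loop below
   processes at most P rows of A and P complex elements of x
   (see MIN_M). */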

#define STACK	16
#define ARGS	24

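/*
 * Stack layout after the prologue: ARGS (24) bytes of local scratch are
 * reserved with "subl $ARGS, %esp", then four callee-saved registers
 * (STACK = 16 bytes) are pushed.  The temporaries below live in the
 * scratch area; the caller's arguments start at STACK + ARGS + 4(%esp),
 * the extra 4 bytes skipping the return address.
 */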
#define NLDA	  0 + STACK(%esp)
#define XP	  4 + STACK(%esp)
#define MIN_M	  8 + STACK(%esp)
#define J	 12 + STACK(%esp)
#define IS	 16 + STACK(%esp)

#define M	 4 + STACK + ARGS(%esp)
#define N	 8 + STACK + ARGS(%esp)
#define K	12 + STACK + ARGS(%esp)
#ifdef DOUBLE
#define ALPHA_R	16 + STACK + ARGS(%esp)
#define ALPHA_I	24 + STACK + ARGS(%esp)
#define A	32 + STACK + ARGS(%esp)
#define LDA	36 + STACK + ARGS(%esp)
#define X	40 + STACK + ARGS(%esp)
#define INCX	44 + STACK + ARGS(%esp)
#define Y	48 + STACK + ARGS(%esp)
#define INCY	52 + STACK + ARGS(%esp)
#define BUFFER	56 + STACK + ARGS(%esp)
#else
#define ALPHA_R	16 + STACK + ARGS(%esp)
#define ALPHA_I	20 + STACK + ARGS(%esp)
#define A	24 + STACK + ARGS(%esp)
#define LDA	28 + STACK + ARGS(%esp)
#define X	32 + STACK + ARGS(%esp)
#define INCX	36 + STACK + ARGS(%esp)
#define Y	40 + STACK + ARGS(%esp)
#define INCY	44 + STACK + ARGS(%esp)
#define BUFFER	48 + STACK + ARGS(%esp)
#endif

	PROLOGUE

	subl	$ARGS,	%esp
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

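/* alpha_r and alpha_i are loaded once and stay at the bottom of the x87
   register stack for the whole routine; they are popped at .L79. */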
	FLD	ALPHA_I
	FLD	ALPHA_R

	movl	X, %edi			# X

	movl	$0, IS

	movl	M, %ebx
	movl	N, %ecx
	testl	%ebx, %ebx
	jle	.L79

	testl	%ecx, %ecx
	jle	.L79

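/* Convert INCX and INCY from complex elements to byte strides
   (2 * SIZE bytes per complex element). */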
	movl	INCX, %esi
	addl	%esi, %esi
	leal	(,%esi,SIZE), %esi
	movl	%esi, INCX

	movl	INCY, %esi
	addl	%esi, %esi
	leal	(, %esi, SIZE), %esi
	movl	%esi, INCY

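/* NLDA = 2 * SIZE * (P - N * lda): adding it to A after a full sweep over
   the N columns moves A forward by exactly P rows, i.e. to the next row
   panel.  LDA itself is rescaled to a byte stride below. */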
	movl	LDA,  %ebx

	movl	N,    %eax
	imull	%ebx, %eax
	movl	$P,   %esi
	subl	%eax, %esi
	leal	(, %esi, SIZE), %esi
	addl	%esi, %esi
	movl	%esi, NLDA

	leal	(,%ebx,SIZE), %esi
	addl	%esi, %esi
	movl	%esi, LDA
	ALIGN_2

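/* Outer loop: one iteration per row panel of at most P rows; IS is the
   index of the first row of the current panel. */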
.L32:
	movl	IS,   %esi

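/* MIN_M = min(P, M - IS), the number of rows in this panel. */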
	movl	$P,   %edx
	movl	M,    %eax
	subl	%esi, %eax
	cmpl	%edx, %eax
#ifdef PENTIUM
	jle	.L33
	movl	%edx, %eax
.L33:
#else
	cmovg	%edx, %eax
#endif
	movl	%eax, MIN_M

	movl	IS,   %ecx
	addl	%ecx, %ecx
	leal	(%edi,%ecx,SIZE), %ecx		# xp = x + is
	movl	INCX, %ebx
	movl	%ecx, XP
	cmpl	$2 * SIZE, %ebx
	je	.L34

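/* Non-unit increment: copy the MIN_M complex elements of x used by this
   panel into BUFFER so that the dot-product loop can read them
   contiguously. */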
	movl	BUFFER, %esi
	movl	MIN_M, %eax
	movl	%esi, XP
	sarl	$1, %eax
	jle	.L35

	ALIGN_3

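/* Copy loop: two complex elements of x per iteration.  The FST stores run
   in reverse order because each FST pops the x87 stack. */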
.L36:
	FLD	0 * SIZE(%edi)
	FLD	1 * SIZE(%edi)
	addl	%ebx,%edi		# x += incx
	FLD	0 * SIZE(%edi)
	FLD	1 * SIZE(%edi)
	addl	%ebx,%edi		# x += incx

	FST	3 * SIZE(%esi)
	FST	2 * SIZE(%esi)
	FST	1 * SIZE(%esi)
	FST	0 * SIZE(%esi)

	addl	$4 * SIZE, %esi		# xp += 4
	decl	%eax
	jg	.L36
	ALIGN_3

.L35:
	movl	MIN_M, %eax
	andl	$1,%eax
	jle	.L34

	FLD	0 * SIZE(%edi)
	FLD	1 * SIZE(%edi)
	addl	%ebx,%edi		# x += incx
	FST	1 * SIZE(%esi)
	FST	0 * SIZE(%esi)
	ALIGN_3

/* Main Routine */

.L34:
	movl	Y, %ebp			# coffset = y

	movl	N, %ecx
	testl	%ecx, %ecx
	jle	.L60
	ALIGN_2

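/* Column loop: for each of the N columns of A, accumulate the dot product
   of the column with the packed x values and update one element of y. */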
.L61:
	movl	A, %ebx			# a_offset = a
	fldz				# ct4 = ZERO
	movl	LDA, %edx
	fldz				# ct3 = ZERO

	addl	%ebx, %edx
	fldz				# ct2 = ZERO
	movl	%edx, A			# a += lda (next column)
	fldz				# ct1 = ZERO

	movl	XP, %esi

	FLD	(%esi)			#  bt1 = *(b_offset + 0)

	movl	MIN_M, %eax
	sarl	$1,    %eax
	jle	.L64
	ALIGN_3

#define PRESIZE 8

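/* Inner loop, two complex elements per iteration.  x87 stack layout:
   st(0) = bt1 (current x value), st(1)..st(4) = ct1..ct4 partial sums,
   st(5) = alpha_r, st(6) = alpha_i. */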
.L65:
#ifdef HAS_PREFETCH
       prefetcht0	PRESIZE * SIZE(%ebx)
       prefetcht0	PRESIZE * SIZE(%esi)
#endif

	FLD	 0 * SIZE(%ebx)		# at1  = *(a_offset + 0)
	fmul	%st(1), %st		# at1 *= bt1
	faddp	%st, %st(2)		# ct1 += at1

	FMUL	 1 * SIZE(%ebx)		# bt1 *= *(a_offset + 1)
#ifndef CONJ
	faddp	%st, %st(2)		# ct2 += bt1
#else
	fsubrp	%st, %st(2)		# ct2 -= bt1
#endif
	FLD	 1 * SIZE(%esi)		# bt1  = *(b_offset + 1)

	FLD	 0 * SIZE(%ebx)		# at1  = *(a_offset + 0)
	fmul	%st(1), %st		# at1 *= bt1
	faddp	%st, %st(4)		# ct3 += at1

	FMUL	 1 * SIZE(%ebx)		# bt1 *= *(a_offset + 1)
	faddp	%st, %st(4)		# ct4 += bt1
	FLD	 2 * SIZE(%esi)		# bt1  = *(b_offset + 2)

	FLD	 2 * SIZE(%ebx)		# at1  = *(a_offset + 2)
	fmul	%st(1), %st		# at1 *= bt1
	faddp	%st, %st(2)		# ct1 += at1

	FMUL	 3 * SIZE(%ebx)		# bt1 *= *(a_offset + 3)
#ifndef CONJ
	faddp	%st, %st(2)		# ct2 += bt1
#else
	fsubrp	%st, %st(2)		# ct2 -= bt1
#endif
	FLD	 3 * SIZE(%esi)		# bt1  = *(b_offset + 3)

	FLD	 2 * SIZE(%ebx)		# at1  = *(a_offset + 2)
	fmul	%st(1), %st		# at1 *= bt1
	faddp	%st, %st(4)		# ct3 += at1

	FMUL	 3 * SIZE(%ebx)		# bt1 *= *(a_offset + 3)
	faddp	%st, %st(4)		# ct4 += bt1
	FLD	 4 * SIZE(%esi)		# bt1  = *(b_offset + 4)

	addl	$4 * SIZE, %esi
	addl	$4 * SIZE, %ebx
	decl	%eax
	jg	.L65
	ALIGN_3

.L64:
	movl	MIN_M, %eax
	andl	$1, %eax
	jle	.L70
	ALIGN_3

.L71:
	FLD	 0 * SIZE(%ebx)		# at1  = *(a_offset + 0)
	fmul	%st(1), %st		# at1 *= bt1
	faddp	%st, %st(2)		# ct1 += at1

	FMUL	 1 * SIZE(%ebx)		# bt1 *= *(a_offset + 1)
#ifndef CONJ
	faddp	%st, %st(2)		# ct2 += bt1
#else
	fsubrp	%st, %st(2)		# ct2 -= bt1
#endif
	FLD	 1 * SIZE(%esi)		# bt1  = *(b_offset + 1)

	FLD	 0 * SIZE(%ebx)		# at1  = *(a_offset + 0)
	fmul	%st(1), %st		# at1 *= bt1
	faddp	%st, %st(4)		# ct3 += at1

	FMUL	 1 * SIZE(%ebx)		# bt1 *= *(a_offset + 1)
	faddp	%st, %st(4)		# ct4 += bt1
	fldz				# dummy bt1 so .L70 can pop unconditionally
	ALIGN_3

.L70:
#ifndef C_SUN
	ffreep	%st(0)			# discard bt1
#else
	.byte	0xdf			# ffreep %st(0) encoded by hand for
	.byte	0xc0			# assemblers without the mnemonic
#endif

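/* Combine the four partial sums into the real and imaginary parts of the
   dot product; the signs depend on whether A and/or X are conjugated
   (CONJ / XCONJ). */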
#ifndef XCONJ
#ifndef CONJ
	fsubp	%st, %st(3)
	faddp	%st, %st(1)
#else
	faddp	%st, %st(3)
	faddp	%st, %st(1)
#endif
#else
#ifndef CONJ
	faddp	%st, %st(3)
	fsubp	%st, %st(1)
#else
	fsubp	%st, %st(3)
	fsubp	%st, %st(1)
#endif
#endif

	fld	%st(0)			# copy imaginary part of the dot product
	fmul	%st(4), %st		# * alpha_i
	fld	%st(2)			# real part of the dot product
	fmul	%st(4), %st		# * alpha_r
	fsubp	%st, %st(1)		# real part of alpha * dot

	FADD	0 * SIZE(%ebp)		# accumulate into y (real)
	FST	0 * SIZE(%ebp)

	fmul	%st(2), %st		# imag * alpha_r
	fxch	%st(1)
	fmul	%st(3), %st		# real * alpha_i
	faddp	%st, %st(1)		# imaginary part of alpha * dot

	FADD	1 * SIZE(%ebp)		# accumulate into y (imaginary)
	FST	1 * SIZE(%ebp)
	addl	INCY, %ebp		# y += incy

	decl	%ecx
	jg	.L61
	ALIGN_3

.L60:
	movl	A, %ebx
	addl	NLDA, %ebx		# a += nlda: advance to the next row panel
	movl	%ebx, A

	addl	$P,  IS
	movl	M, %esi
	cmpl	%esi, IS
	jl	.L32			# more row panels left
	ALIGN_3

.L79:
#ifndef C_SUN
	ffreep	%st(0)			# pop alpha_r
	ffreep	%st(0)			# pop alpha_i
#else
	.byte	0xdf			# two hand-encoded ffreep %st(0)
	.byte	0xc0
	.byte	0xdf
	.byte	0xc0
#endif

	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	addl	$ARGS, %esp
	ret

	EPILOGUE