1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
#define ASSEMBLER
#include "common.h"

/* P is the row-blocking size: the kernel walks A and X in chunks of
   at most P rows so the working set of X stays cache-resident (and,
   for non-unit incx, fits in BUFFER). */
#ifdef PENTIUM
#define P 88
#endif

#ifndef P
#define P 1000
#endif

/* Frame layout: STACK bytes of saved registers (ebp/edi/esi/ebx)
   pushed on top of ARGS bytes of scratch locals allocated by this
   routine, all below the caller's argument area. */
#define STACK	16
#define ARGS	24

/* Scratch locals, addressed relative to %esp after the prologue:
     NLDA  - byte offset added to A after a full column sweep,
             (P - n * lda) * SIZE, rewinding A to the next row block
     XP    - pointer to the X data used by the inner loops
             (X itself when incx == 1, otherwise BUFFER)
     MIN_M - rows in the current block: min(m - is, P)
     J     - outer column-loop counter
     IS    - starting row index of the current block */
#define NLDA	  0 + STACK(%esp)
#define XP	  4 + STACK(%esp)
#define MIN_M	  8 + STACK(%esp)
#define J	 12 + STACK(%esp)
#define IS	 16 + STACK(%esp)

/* Caller arguments (cdecl), addressed relative to %esp after the
   prologue.  ALPHA is a floating-point scalar passed by value, so
   the offsets past it differ between single and double precision. */
#define M	 4 + STACK + ARGS(%esp)
#define N	 8 + STACK + ARGS(%esp)
#define K	12 + STACK + ARGS(%esp)
#define ALPHA	16 + STACK + ARGS(%esp)
#ifdef DOUBLE
#define A	24 + STACK + ARGS(%esp)
#define LDA	28 + STACK + ARGS(%esp)
#define X	32 + STACK + ARGS(%esp)
#define INCX	36 + STACK + ARGS(%esp)
#define Y	40 + STACK + ARGS(%esp)
#define INCY	44 + STACK + ARGS(%esp)
#define BUFFER	48 + STACK + ARGS(%esp)
#else
#define A	20 + STACK + ARGS(%esp)
#define LDA	24 + STACK + ARGS(%esp)
#define X	28 + STACK + ARGS(%esp)
#define INCX	32 + STACK + ARGS(%esp)
#define Y	36 + STACK + ARGS(%esp)
#define INCY	40 + STACK + ARGS(%esp)
#define BUFFER	44 + STACK + ARGS(%esp)
#endif
80
	/* GEMV "N" kernel:  y += alpha * A * x   (i386, x87 FPU).
	   Arguments arrive on the stack (cdecl); alpha is loaded once
	   and kept at the bottom of the x87 register stack for the
	   whole routine.  NOTE(review): the K argument is never read
	   by this kernel as far as this file shows. */
	PROLOGUE

	subl	$ARGS,	%esp		# allocate scratch locals
	pushl	%ebp			# save callee-saved registers
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	FLD	ALPHA			# st(0) = alpha, resident until exit

	movl	X, %edi			# X

	movl	$0, IS			# is = 0 (first row block)

	movl	M, %ebx
	movl	N, %eax

	testl	%ebx, %ebx		# quick exit if m <= 0
	jle	.L79
	testl	%eax, %eax		# ... or n <= 0
	jle	.L79

	movl	INCX, %esi		# convert incx to a byte stride
	leal	(,%esi,SIZE), %esi
	movl	%esi, INCX

	movl	INCY, %esi		# convert incy to a byte stride
	leal	(, %esi, SIZE), %esi
	movl	%esi, INCY

	movl	LDA,  %ebx

	imull	%ebx, %eax		# nlda = (P - n * lda) * SIZE:
	movl	$P,   %esi		# after sweeping all n columns, A
	subl	%eax, %esi		# sits n*lda elements in; this
	leal	(, %esi, SIZE), %esi	# offset rewinds it, moved P rows down
	movl	%esi, NLDA

	leal	(,%ebx,SIZE), %esi	# convert lda to a byte stride
	movl	%esi, LDA
	ALIGN_2
124
/* Top of the row-block loop: process rows [is, is + min_m). */
.L32:
	movl	IS,  %esi

	movl	$P, %edx
	movl	M,    %eax
	subl	%esi, %eax		# min_m = min(m - is, P)
	cmpl	%edx, %eax
#ifdef PENTIUM
	jle	.L33			# Pentium has no cmov; branch instead
	movl	%edx, %eax
.L33:
#else
	cmovg	%edx, %eax
#endif
	movl	%eax, MIN_M

	movl	IS, %ecx
	leal	(%edi,%ecx,SIZE), %ecx		# xp = x + is
	movl	INCX, %ebx
	movl	%ecx, XP
	cmpl	$SIZE, %ebx		# unit stride: use X in place
	je	.L34

	/* Non-unit incx: gather min_m elements of X into BUFFER so
	   the inner loops can read them with unit stride.
	   %edi advances through X by the incx byte stride (%ebx). */
	movl	BUFFER, %esi
	movl	MIN_M, %ecx
	movl	%esi, XP		# xp = buffer
	sarl	$2, %ecx		# copy 4 elements per iteration
	jle	.L35

	ALIGN_3

.L36:
	FLD	(%edi)
	addl	%ebx, %edi
	FST	0 * SIZE(%esi)

	FLD	(%edi)
	addl	%ebx, %edi
	FST	1 * SIZE(%esi)

	FLD	(%edi)
	addl	%ebx, %edi
	FST	2 * SIZE(%esi)

	FLD	(%edi)
	addl	%ebx, %edi
	FST	3 * SIZE(%esi)

	addl	$4 * SIZE, %esi
	decl	%ecx
	jg	.L36
	ALIGN_3

.L35:
	movl	MIN_M, %ecx		# copy the 0..3 leftover elements
	andl	$3,%ecx
	jle	.L34
	ALIGN_2

.L42:
	FLD	(%edi)
	addl	%ebx,  %edi
	FST	(%esi)
	addl	$SIZE, %esi
	decl	%ecx
	jg	.L42
	ALIGN_3
192
/* Main Routine: columns are processed four at a time; each column
   pair is addressed as a_offset (%ebx) / a_offset2 (%ecx) plus a
   2*lda displacement, giving the four column streams.         */

.L34:
	movl	Y, %ebp			# coffset = y

	movl	N, %esi			# n / 4 groups of four columns
	sarl	$2, %esi
	movl	%esi, J
	jle	.L47
	ALIGN_3

/* x87 stack inside the loop below (top to bottom):
     st(0)        = bt1, the current x element
     st(1)..st(4) = ct1..ct4, one dot-product accumulator per column
     st(5)        = alpha                                          */
.L48:
	movl	A, %ebx				# a_offset = a
	fldz
	movl	LDA, %edx
	fldz

	leal	(%ebx, %edx), %ecx		# a_offset2 = a + lda
	fldz
	leal	(%ebx, %edx, 4), %eax
	fldz

	movl	%eax, A				# a += 4 * lda for next group
	movl	XP, %esi
	FLD	(%esi)				# bt1 = xp[0]

	movl	MIN_M, %eax			# rows, unrolled by 4
	sarl	$2,%eax
	jle	.L51
	ALIGN_3

#define PRESIZE 8

.L80:
#ifdef PENTIUM3
	/* PIII path: same arithmetic as the generic path below, with
	   prefetches for all four column streams interleaved into the
	   FPU schedule. */
	prefetcht0	PRESIZE * SIZE(%ebx, %edx, 2)
	FLD	0 * SIZE(%ebx)		# at   = *(a_offset  + 0 * lda)
	fmul	%st(1),%st		# at1 *= bt1
	prefetcht0	PRESIZE * SIZE(%ecx)
	faddp	%st,%st(2)		# ct1 += at1
	FLD	0 * SIZE(%ecx)		# at1  = *(a_offset2 + 0 * lda)

	prefetcht0	PRESIZE * SIZE(%ecx, %edx, 2)
	fmul	%st(1),%st		# at1 *= bt1
	faddp	%st,%st(3)		# ct2 += at1

	prefetcht0	PRESIZE * SIZE(%ebx)
	FLD	0 * SIZE(%ebx, %edx, 2)	# at   = *(a_offset  + 2 * lda)
	fmul	%st(1),%st

	faddp	%st,%st(4)
	FLD	0 * SIZE(%ecx, %edx, 2)	# at1  = *(a_offset2 + 2 * lda)
	fmulp	%st, %st(1)

	faddp	%st,%st(4)
	FLD	1 * SIZE(%esi)
	FLD	1 * SIZE(%ebx)		# at   = *(a_offset  + 0 * lda)

	fmul	%st(1),%st		# at1 *= bt1
	faddp	%st,%st(2)		# ct1 += at1
	FLD	1 * SIZE(%ecx)		# at1  = *(a_offset2 + 0 * lda)

	fmul	%st(1),%st		# at1 *= bt1
	faddp	%st,%st(3)		# ct2 += at1
	FLD	1 * SIZE(%ebx, %edx, 2)	# at   = *(a_offset  + 2 * lda)

	fmul	%st(1),%st
	faddp	%st,%st(4)
	FLD	1 * SIZE(%ecx, %edx, 2)	# at1  = *(a_offset2 + 2 * lda)

	fmulp	%st, %st(1)
	faddp	%st,%st(4)
	FLD	2 * SIZE(%esi)

	FLD	2 * SIZE(%ebx)		# at   = *(a_offset  + 0 * lda)
	fmul	%st(1),%st		# at1 *= bt1
	faddp	%st,%st(2)		# ct1 += at1

	FLD	2 * SIZE(%ecx)		# at1  = *(a_offset2 + 0 * lda)
	fmul	%st(1),%st		# at1 *= bt1
	faddp	%st,%st(3)		# ct2 += at1

	FLD	2 * SIZE(%ebx, %edx, 2)	# at   = *(a_offset  + 2 * lda)
	fmul	%st(1),%st
	faddp	%st,%st(4)

	FLD	2 * SIZE(%ecx, %edx, 2)	# at1  = *(a_offset2 + 2 * lda)
	fmulp	%st, %st(1)
	faddp	%st,%st(4)

	FLD	3 * SIZE(%esi)
	FLD	3 * SIZE(%ebx)		# at   = *(a_offset  + 0 * lda)
	fmul	%st(1),%st		# at1 *= bt1

	faddp	%st,%st(2)		# ct1 += at1
	FLD	3 * SIZE(%ecx)		# at1  = *(a_offset2 + 0 * lda)
	fmul	%st(1),%st		# at1 *= bt1

	faddp	%st,%st(3)		# ct2 += at1
	FLD	3 * SIZE(%ebx, %edx, 2)	# at   = *(a_offset  + 2 * lda)
	fmul	%st(1),%st

	faddp	%st,%st(4)
	FLD	3 * SIZE(%ecx, %edx, 2)	# at1  = *(a_offset2 + 2 * lda)
	fmulp	%st, %st(1)

	addl	$4 * SIZE, %ebx
	faddp	%st,%st(4)
	addl	$4 * SIZE, %ecx

	FLD	4 * SIZE(%esi)		# preload bt1 for next iteration
	addl	$4 * SIZE, %esi

#else

#if defined(HAS_PREFETCH)
       prefetcht0	PRESIZE * SIZE(%ebx)
       prefetcht0	PRESIZE * SIZE(%ebx, %edx, 2)
       prefetcht0	PRESIZE * SIZE(%ecx)
       prefetcht0	PRESIZE * SIZE(%ecx, %edx, 2)
#endif

	/* Generic path: each row contributes one product to each of
	   the four accumulators; the last column of each row uses
	   FMUL/fmulp so bt1 is consumed, then the next x element is
	   reloaded from %esi. */
	FLD	0 * SIZE(%ebx)		# at   = *(a_offset  + 0 * lda)
	fmul	%st(1),%st		# at1 *= bt1
	faddp	%st,%st(2)		# ct1 += at1

	FLD	0 * SIZE(%ecx)		# at1  = *(a_offset2 + 0 * lda)
	fmul	%st(1),%st		# at1 *= bt1
	faddp	%st,%st(3)		# ct2 += at1

	FLD	0 * SIZE(%ebx, %edx, 2)	# at   = *(a_offset  + 2 * lda)
	fmul	%st(1),%st
	faddp	%st,%st(4)

	FMUL	0 * SIZE(%ecx, %edx, 2)	# at1  = *(a_offset2 + 2 * lda)
	faddp	%st,%st(4)
	FLD	1 * SIZE(%esi)

	FLD	1 * SIZE(%ebx)		# at   = *(a_offset  + 0 * lda)
	fmul	%st(1),%st		# at1 *= bt1
	faddp	%st,%st(2)		# ct1 += at1

	FLD	1 * SIZE(%ecx)		# at1  = *(a_offset2 + 0 * lda)
	fmul	%st(1),%st		# at1 *= bt1
	faddp	%st,%st(3)		# ct2 += at1

	FLD	1 * SIZE(%ebx, %edx, 2)	# at   = *(a_offset  + 2 * lda)
	fmul	%st(1),%st
	faddp	%st,%st(4)

	FMUL	1 * SIZE(%ecx, %edx, 2)	# at1  = *(a_offset2 + 2 * lda)
	faddp	%st,%st(4)
	FLD	2 * SIZE(%esi)

	FLD	2 * SIZE(%ebx)		# at   = *(a_offset  + 0 * lda)
	fmul	%st(1),%st		# at1 *= bt1
	faddp	%st,%st(2)		# ct1 += at1

	FLD	2 * SIZE(%ecx)		# at1  = *(a_offset2 + 0 * lda)
	fmul	%st(1),%st		# at1 *= bt1
	faddp	%st,%st(3)		# ct2 += at1

	FLD	2 * SIZE(%ebx, %edx, 2)	# at   = *(a_offset  + 2 * lda)
	fmul	%st(1),%st
	faddp	%st,%st(4)

	FMUL	2 * SIZE(%ecx, %edx, 2)	# at1  = *(a_offset2 + 2 * lda)
	faddp	%st,%st(4)
	FLD	3 * SIZE(%esi)

	FLD	3 * SIZE(%ebx)		# at   = *(a_offset  + 0 * lda)
	fmul	%st(1),%st		# at1 *= bt1
	faddp	%st,%st(2)		# ct1 += at1

	FLD	3 * SIZE(%ecx)		# at1  = *(a_offset2 + 0 * lda)
	fmul	%st(1),%st		# at1 *= bt1
	faddp	%st,%st(3)		# ct2 += at1

	FLD	3 * SIZE(%ebx, %edx, 2)	# at   = *(a_offset  + 2 * lda)
	fmul	%st(1),%st
	faddp	%st,%st(4)

	FMUL	3 * SIZE(%ecx, %edx, 2)	# at1  = *(a_offset2 + 2 * lda)
	faddp	%st,%st(4)
	FLD	4 * SIZE(%esi)		# preload bt1 for next iteration

	addl	$4 * SIZE, %ebx
	addl	$4 * SIZE, %ecx
	addl	$4 * SIZE, %esi
#endif

	decl	%eax
	jg	.L80
	ALIGN_3

.L51:
	movl	MIN_M, %eax		# leftover 0..3 rows, one at a time
	andl	$3,    %eax
	je	.L81
	ALIGN_3

.L52:

	FLD	(%ebx)			# at   = *(a_offset  + 0 * lda)
	fmul	%st(1),%st		# at1 *= bt1
	faddp	%st,%st(2)		# ct1 += at1

	FLD	(%ecx)			# at1  = *(a_offset2 + 0 * lda)
	fmul	%st(1),%st		# at1 *= bt1
	faddp	%st,%st(3)		# ct2 += at1

	FLD	(%ebx, %edx, 2)		# at   = *(a_offset  + 2 * lda)
	fmul	%st(1),%st
	faddp	%st,%st(4)

	FMUL	(%ecx, %edx, 2)		# at1  = *(a_offset2 + 2 * lda)
	faddp	%st,%st(4)
	FLD	1 * SIZE(%esi)		# next bt1

	addl	$SIZE, %ebx
	addl	$SIZE, %ecx
	addl	$SIZE, %esi
	decl	%eax
	jg	.L52
	ALIGN_3

.L81:
	/* Discard the speculatively loaded bt1 so the stack is
	   ct1..ct4 with alpha below; ffreep is emitted as raw bytes
	   for assemblers that do not know the mnemonic. */
#ifndef C_SUN
	ffreep	%st(0)
#else
	.byte	0xdf
	.byte	0xc0
#endif

	fxch	%st(4)			# bring alpha to the top
	fmul	%st, %st(4)		# scale all four sums by alpha
	fmul	%st, %st(1)
	fmul	%st, %st(2)
	fmul	%st, %st(3)
	fxch	%st(4)			# restore alpha to the bottom

	movl	INCY, %eax

	FADD	(%ebp)			# y[j] += alpha * ct; each FADD/FST
	FST	(%ebp)			# pair consumes one accumulator
	addl	%eax, %ebp

	FADD	(%ebp)
	FST	(%ebp)
	addl	%eax, %ebp

	FADD	(%ebp)
	FST	(%ebp)
	addl	%eax, %ebp

	FADD	(%ebp)
	FST	(%ebp)
	addl	%eax, %ebp

	decl	J
	jg	.L48
	ALIGN_3
456
/* Remaining 0..3 columns: one column per iteration, dot product kept
   in four partial accumulators (rows unrolled by 8). */
.L47:
	movl	N, %esi
	andl	$3,%esi
	movl	%esi, J
	jle	.L60
	ALIGN_2

.L61:
	movl	A, %ebx			# a_offset = a
	fldz				# ct1 = ZERO
	movl	LDA, %edx
	fldz				# ct2 = ZERO

	addl	%ebx, %edx
	fldz				# ct3 = ZERO
	movl	%edx, A			# a += lda
	fldz				# ct4 = ZERO

	movl	XP, %esi

	movl	MIN_M, %eax		# rows, unrolled by 8
	sarl	$3,%eax
	jle	.L64
	ALIGN_3

.L65:
#ifdef HAS_PREFETCH
	/* NOTE(review): both prefetches target the same address; the
	   second may have been intended for another stream (e.g. %esi)
	   - harmless either way, but confirm against upstream. */
       prefetcht0	PRESIZE * 2 * SIZE(%ebx)
       prefetcht0	PRESIZE * 2 * SIZE(%ebx)
#endif

	FLD	0 * SIZE(%esi)		# ct1 += x[i+0] * a[i+0]
	FMUL	0 * SIZE(%ebx)
	faddp	%st,%st(1)

	FLD	1 * SIZE(%esi)		# ct2 += x[i+1] * a[i+1]
	FMUL	1 * SIZE(%ebx)
	faddp	%st,%st(2)

	FLD	2 * SIZE(%esi)		# ct3 += x[i+2] * a[i+2]
	FMUL	2 * SIZE(%ebx)
	faddp	%st,%st(3)

	FLD	3 * SIZE(%esi)		# ct4 += x[i+3] * a[i+3]
	FMUL	3 * SIZE(%ebx)
	faddp	%st,%st(4)

	FLD	4 * SIZE(%esi)
	FMUL	4 * SIZE(%ebx)
	faddp	%st,%st(1)

	FLD	5 * SIZE(%esi)
	FMUL	5 * SIZE(%ebx)
	faddp	%st,%st(2)

	FLD	6 * SIZE(%esi)
	FMUL	6 * SIZE(%ebx)
	faddp	%st,%st(3)

	FLD	7 * SIZE(%esi)
	FMUL	7 * SIZE(%ebx)
	faddp	%st,%st(4)

	addl	$8 * SIZE, %esi
	addl	$8 * SIZE, %ebx

	decl	%eax
	jg	.L65
	ALIGN_3

.L64:
	movl	MIN_M, %eax		# leftover 0..7 rows
	andl	$7, %eax
	jle	.L70
	ALIGN_3

.L71:
	FLD	(%esi)
	FMUL	(%ebx)
	faddp	%st,%st(1)

	addl	$SIZE, %esi
	addl	$SIZE, %ebx
	decl	%eax
	jg	.L71
	ALIGN_3

.L70:
	faddp	%st, %st(1)		# collapse ct1..ct4 into one sum
	faddp	%st, %st(1)
	faddp	%st, %st(1)

	fmul	%st(1),%st		# scale by alpha (st(1) = alpha)
	FADD	(%ebp)			# y[j] += alpha * ct
	FST	(%ebp)
	addl	INCY, %ebp
	decl	J
	jg	.L61
	ALIGN_3
556
.L60:
	movl	A, %ebx			# rewind A by NLDA: back across all
	addl	NLDA, %ebx		# n columns, forward P rows, so it
	movl	%ebx, A			# points at the next row block

	addl	$P,  IS			# is += P; loop while is < m
	movl	M, %esi
	cmpl	%esi, IS
	jl	.L32
	ALIGN_3

.L79:
	/* Pop alpha off the x87 stack, leaving it empty as the ABI
	   requires; ffreep is emitted as raw bytes for assemblers
	   that do not know the mnemonic. */
#ifndef C_SUN
	ffreep	%st(0)
#else
	.byte	0xdf
	.byte	0xc0
#endif

	popl	%ebx			# restore callee-saved registers
	popl	%esi
	popl	%edi
	popl	%ebp
	addl	$ARGS, %esp		# release scratch locals
	ret

	EPILOGUE
584