1/*********************************************************************/
2/*                                                                   */
3/*             Optimized BLAS libraries                              */
4/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
5/*                                                                   */
6/* Copyright (c) The University of Texas, 2009. All rights reserved. */
7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
13/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
14/* Under no circumstances shall University be liable for incidental, */
15/* special, indirect, direct or consequential damages or loss of     */
16/* profits, interruption of business, or related expenses which may  */
17/* arise from use of Software or Documentation, including but not    */
18/* limited to those resulting from defects in Software and/or        */
19/* Documentation, or loss or inaccuracy of data of any kind.         */
20/*********************************************************************/
21
22#define ASSEMBLER
23#include "common.h"
24
25#ifdef PENTIUM
26#define P 32
27#endif
28
29#if defined(ATHLON) || defined(OPTERON)
30#define P 32
31#endif
32
33#ifndef P
34#define P DTB_ENTRIES
35#endif
36
37#define STACK	16
38#define ARGS	16
39
40#define PLDA_M	  0 + STACK(%esp)
41#define XP	  4 + STACK(%esp)
42#define MIN_N	  8 + STACK(%esp)
43#define IS	 12 + STACK(%esp)
44
45#define M	 4 + STACK + ARGS(%esp)
46#define N	 8 + STACK + ARGS(%esp)
47#define K	12 + STACK + ARGS(%esp)
48#define ALPHA	16 + STACK + ARGS(%esp)
49
50#define A	32 + STACK + ARGS(%esp)
51#define LDA	36 + STACK + ARGS(%esp)
52#define X	40 + STACK + ARGS(%esp)
53#define INCX	44 + STACK + ARGS(%esp)
54#define Y	48 + STACK + ARGS(%esp)
55#define INCY	52 + STACK + ARGS(%esp)
56#define BUFFER	56 + STACK + ARGS(%esp)
57
58
59	PROLOGUE
60
61	subl	$ARGS, %esp
62	pushl	%ebp
63	pushl	%edi
64	pushl	%esi
65	pushl	%ebx
66
67	PROFCODE
68
69	FLD	ALPHA
70	movl	X, %edi
71
72	movl	LDA, %ebx
73	sall	$BASE_SHIFT, %ebx
74
75	movl	$0, IS
76	movl	M, %edx
77	movl	N, %esi
78
79	test	%esi, %esi
80	jle	.L79			# goto END
81	test	%edx, %edx
82	jle	.L79			# goto END
83
84	movl	INCY, %eax
85	sall	$BASE_SHIFT, %eax
86	movl	%eax, INCY
87
88	movl	LDA, %eax
89	imull	$P,  %eax		# P * lda
90	subl	M   ,%eax		# P * lda - m
91	sall	$BASE_SHIFT, %eax
92	movl	%eax, PLDA_M
93	ALIGN_2
94
95.L32:
96	movl	IS, %esi
97	movl	$P, %edx
98	movl	N,   %eax
99	subl	%esi,%eax		# n - is
100	cmpl	%edx,  %eax
101#ifdef PENTIUM
102	jle	.L33
103	movl	%edx,  %eax
104.L33:
105#else
106	cmovg	%edx,  %eax
107#endif
108
109	movl	%eax, MIN_N
110	movl	INCX, %edx
111
112	sall	$BASE_SHIFT, %esi
113	leal	(%edi, %esi, 1), %esi
114
115	movl	%esi, XP
116	cmpl	$1, %edx
117	je	.L34			# if incx == 1 goto L34
118
119	movl	BUFFER, %esi
120	sall	$BASE_SHIFT, %edx
121	movl	%esi, XP		# xp = buffer
122	sarl	$2,%eax
123	jle	.L35
124	ALIGN_2
125
126.L36:
127	FLD	(%edi)
128	addl	%edx,%edi		# x += incx
129	FLD	(%edi)
130	addl	%edx,%edi		# x += incx
131	FLD	(%edi)
132	addl	%edx,%edi		# x += incx
133	FLD	(%edi)
134	addl	%edx,%edi		# x += incx
135
136	FST	3 * SIZE(%esi)
137	FST	2 * SIZE(%esi)
138	FST	1 * SIZE(%esi)
139	FST	0 * SIZE(%esi)
140
141	addl	$4 * SIZE, %esi		# xp += 4
142	decl	%eax
143	jg	.L36
144	ALIGN_3
145
146.L35:
147	movl	MIN_N, %eax
148	andl	$3,    %eax
149	jle	.L34
150	ALIGN_2
151
152.L42:
153	FLD	(%edi)
154	addl	%edx,  %edi
155	FST	(%esi)
156	addl	$SIZE, %esi
157	decl	%eax
158	jg	.L42
159	ALIGN_3
160
161/* Main Routine */
162.L34:
163	movl	 Y, %ecx		# c_offset
164	movl	 M, %ebp
165	sarl	$2, %ebp		# j = (m >> 2)
166	jle	.L47
167	ALIGN_2
168
169.L48:
170	movl	A, %edx			# a_offset = a
171	fldz
172	addl	$4 * SIZE, A		# a += 4
173	fldz
174	movl	XP, %esi		# b_offset = xp
175	fldz
176	movl	MIN_N, %eax		# i = min_n
177	fldz
178	FLD	(%esi)			# bt1 = b_offset
179	sarl	$1,    %eax
180	jle	.L51
181	ALIGN_2
182
183#ifdef PENTIUM3
184#define PRESIZE  8
185#else
186#define PRESIZE 24
187#endif
188
189.L80:
190#ifdef PENTIUM3
191       prefetcht1	PRESIZE * SIZE(%edx, %ebx, 1)
192	FLD	0 * SIZE(%edx)		# at1  = *(a_offset + 0)
193	fmul	%st(1), %st		# at1 *= bt1
194
195       prefetcht1	PRESIZE * SIZE(%esi)
196	faddp	%st, %st(2)		# ct1 += at1
197	FLD	1 * SIZE(%edx)		# at1  = *(a_offset + 1)
198
199	fmul	%st(1), %st		# at1 *= bt1
200	faddp	%st, %st(3)		# ct2 += at1
201	FLD	2 * SIZE(%edx)		# at1  = *(a_offset + 2)
202
203	fmul	%st(1), %st		# at1 *= bt1
204	faddp	%st, %st(4)		# ct3 += at1
205	FLD	3 * SIZE(%edx)		# bt1 *= *(a_offset + 3)
206
207	fmulp	%st, %st(1)
208	faddp	%st, %st(4)		# ct4 += at1
209	FLD	1 * SIZE(%esi)		# bt1 = b_offset
210
211       prefetcht1	PRESIZE * SIZE(%edx, %ebx, 2)
212	addl	%ebx, %edx		# a_offset += lda
213	FLD	0 * SIZE(%edx)		# at1  = *(a_offset + 0)
214
215	fmul	%st(1), %st		# at1 *= bt1
216	faddp	%st, %st(2)		# ct1 += at1
217	FLD	1 * SIZE(%edx)		# at1  = *(a_offset + 1)
218
219	fmul	%st(1), %st		# at1 *= bt1
220	faddp	%st, %st(3)		# ct2 += at1
221	FLD	2 * SIZE(%edx)		# at1  = *(a_offset + 2)
222
223	fmul	%st(1), %st		# at1 *= bt1
224	faddp	%st, %st(4)		# ct3 += at1
225	FLD	3 * SIZE(%edx)		# bt1 *= *(a_offset + 3)
226
227	fmulp	%st, %st(1)
228	addl	%ebx, %edx
229	faddp	%st, %st(4)		# ct4 += at1
230
231	FLD	2 * SIZE(%esi)		# bt1 = b_offset
232	addl	$2 * SIZE, %esi		# b_offset += 2
233
234#else
235#ifdef PENTIUM4
236       prefetchnta	 8 * SIZE(%esi)
237#endif
238	FLD	0 * SIZE(%edx)		# at1  = *(a_offset + 0)
239	fmul	%st(1), %st		# at1 *= bt1
240	faddp	%st, %st(2)		# ct1 += at1
241
242	FLD	1 * SIZE(%edx)		# at1  = *(a_offset + 1)
243	fmul	%st(1), %st		# at1 *= bt1
244	faddp	%st, %st(3)		# ct2 += at1
245
246	FLD	2 * SIZE(%edx)		# at1  = *(a_offset + 2)
247	fmul	%st(1), %st		# at1 *= bt1
248	faddp	%st, %st(4)		# ct3 += at1
249
250	FLD	3 * SIZE(%edx)		# bt1 *= *(a_offset + 3)
251	fmulp	%st, %st(1)
252	faddp	%st, %st(4)		# ct4 += at1
253	FLD	1 * SIZE(%esi)		# bt1 = b_offset
254
255	addl	%ebx, %edx		# a_offset += lda
256
257	FLD	0 * SIZE(%edx)		# at1  = *(a_offset + 0)
258	fmul	%st(1), %st		# at1 *= bt1
259	faddp	%st, %st(2)		# ct1 += at1
260
261	FLD	1 * SIZE(%edx)		# at1  = *(a_offset + 1)
262	fmul	%st(1), %st		# at1 *= bt1
263	faddp	%st, %st(3)		# ct2 += at1
264
265	FLD	2 * SIZE(%edx)		# at1  = *(a_offset + 2)
266	fmul	%st(1), %st		# at1 *= bt1
267	faddp	%st, %st(4)		# ct3 += at1
268
269	FLD	3 * SIZE(%edx)		# bt1 *= *(a_offset + 3)
270	fmulp	%st, %st(1)
271	faddp	%st, %st(4)		# ct4 += at1
272	FLD	2 * SIZE(%esi)		# bt1 = b_offset
273
274	addl	%ebx, %edx
275	addl	$2 * SIZE, %esi		# b_offset += 2
276#endif
277	decl	%eax
278	jg	.L80
279
280.L51:
281	movl	MIN_N,%eax
282	andl	$1, %eax
283	je	.L57
284
285	FLD	0 * SIZE(%edx)		# at1  = *(a_offset + 0)
286	fmul	%st(1), %st		# at1 *= bt1
287	faddp	%st, %st(2)		# ct1 += at1
288
289	FLD	1 * SIZE(%edx)		# at1  = *(a_offset + 1)
290	fmul	%st(1), %st		# at1 *= bt1
291	faddp	%st, %st(3)		# ct2 += at1
292
293	FLD	2 * SIZE(%edx)		# at1  = *(a_offset + 2)
294	fmul	%st(1), %st		# at1 *= bt1
295	faddp	%st, %st(4)		# ct3 += at1
296
297	FLD	3 * SIZE(%edx)		# bt1 *= *(a_offset + 3)
298	fmulp	%st, %st(1)
299	faddp	%st, %st(4)		# ct4 += at1
300	fldz
301	ALIGN_2
302
303.L57:
304	ffreep	%st(0)
305
306	fxch	%st(4)
307	fmul	%st, %st(4)
308	fmul	%st, %st(1)
309	fmul	%st, %st(2)
310	fmul	%st, %st(3)
311	fxch	%st(4)
312
313	movl	INCY, %eax
314
315	FLD	(%ecx)
316	faddp	%st, %st(1)
317	FST	(%ecx)
318	addl	%eax, %ecx
319
320	FLD	(%ecx)
321	faddp	%st, %st(1)
322	FST	(%ecx)
323	addl	%eax, %ecx
324
325	FLD	(%ecx)
326	faddp	%st, %st(1)
327	FST	(%ecx)
328	addl	%eax, %ecx
329
330	FLD	(%ecx)
331	faddp	%st, %st(1)
332	FST	(%ecx)
333	addl	%eax, %ecx
334
335	decl	%ebp		# j --
336	jg	.L48
337	ALIGN_3
338
339.L47:
340	movl	M,  %ebp
341	andl	$3, %ebp		# j = (m & 3)
342	jle	.L60
343	ALIGN_2
344
345.L61:
346
347	movl	A, %edx			# a_offset = a
348	fldz
349	addl	$SIZE, A		# a++
350	fldz
351	movl	XP,%esi
352	fldz
353	movl	MIN_N,%eax
354	fldz
355	sarl	$3,%eax
356	jle	.L64
357	ALIGN_2
358
359.L65:
360	FLD	0 * SIZE(%esi)
361	FLD	(%edx)
362	fmulp	%st, %st(1)
363	faddp	%st, %st(1)
364	addl	%ebx, %edx
365
366	FLD	1 * SIZE(%esi)
367	FLD	(%edx)
368	fmulp	%st, %st(1)
369	faddp	%st, %st(2)
370	addl	%ebx ,%edx
371
372	FLD	2 * SIZE(%esi)
373	FLD	(%edx)
374	fmulp	%st, %st(1)
375	faddp	%st, %st(3)
376	addl	%ebx, %edx
377
378	FLD	3 * SIZE(%esi)
379	FLD	(%edx)
380	fmulp	%st, %st(1)
381	faddp	%st, %st(4)
382	addl	%ebx, %edx
383
384	FLD	4 * SIZE(%esi)
385	FLD	(%edx)
386	fmulp	%st, %st(1)
387	faddp	%st,%st(1)
388	addl	%ebx, %edx
389
390	FLD	5 * SIZE(%esi)
391	FLD	(%edx)
392	fmulp	%st, %st(1)
393	faddp	%st, %st(2)
394	addl	%ebx, %edx
395
396	FLD	6 * SIZE(%esi)
397	FLD	(%edx)
398	fmulp	%st, %st(1)
399	faddp	%st,%st(3)
400	addl	%ebx, %edx
401
402	FLD	7 * SIZE(%esi)
403	FLD	(%edx)
404	fmulp	%st, %st(1)
405	faddp	%st,%st(4)
406	addl	%ebx, %edx
407
408	addl	$8 * SIZE, %esi
409	decl	%eax
410	jg	.L65
411
412.L64:
413	movl	MIN_N,%eax
414	andl	$7, %eax
415	jle	.L70
416	ALIGN_2
417
418.L71:
419	FLD	(%esi)
420	addl	$SIZE, %esi	 # b_offset ++
421	FLD	(%edx)
422	fmulp	%st, %st(1)
423	addl	%ebx,  %edx	 # a_offset += lda
424	faddp	%st, %st(1)
425	decl	%eax
426	jg	.L71
427	ALIGN_2
428
429.L70:
430	faddp	%st, %st(1)
431	faddp	%st, %st(1)
432	faddp	%st, %st(1)
433
434	fmul	%st(1), %st
435	movl	INCY,  %eax
436	FLD	(%ecx)
437	faddp	%st, %st(1)
438	FST	(%ecx)
439	addl	%eax, %ecx
440	decl	%ebp
441	jg	.L61
442
443.L60:
444	movl	PLDA_M, %esi
445	addl	%esi, A		# a += P * lda - m
446	addl	$P, IS
447	movl	N, %esi
448	cmpl	%esi,IS
449	jl	.L32
450
451.L79:
452	ffreep	%st(0)
453	popl	%ebx
454	popl	%esi
455	popl	%edi
456	popl	%ebp
457	addl	$ARGS, %esp
458	ret
459
460	EPILOGUE
461