/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/
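
/*
 * y := alpha * A' * x + y  (transposed matrix-vector product).
 * Columns of A are reduced two at a time as dot products against x;
 * x is repacked into BUFFER in panels of P elements when INCX is not
 * one element.
 */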

#define ASSEMBLER
#include "common.h"

#define P 1020

#define M	%i0
#define N	%i1

#if defined(DOUBLE) && !defined(__64BIT__)
#define A	%i5
#define LDA	%i2
#define X	%i3
#define INCX	%i4
#else
#define A	%i4
#define LDA	%i5
#define X	%i2
#define INCX	%i3
#endif

#define Y	%l0
#define INCY	%l1
#define BUFFER	%l2

#define I	%l3
#define IS	%l4
#define J	%l5
#define MIN_M	%l6
#define XP	%l7

#define A1	%o0
#define A2	%o1
#define A3	%o2
#define A4	%o3
#define X1	%o4
#define Y1	%o5
#define PNLDA	%g1
#define Y2	%o7	/* Danger? */

#ifdef DOUBLE
#define t1	%f0
#define t2	%f2
#define t3	%f4
#define t4	%f6

#define c1	%f8
#define c2	%f10
#define c3	%f12
#define c4	%f14

#define a1	%f16
#define a2	%f18
#define a3	%f20
#define a4	%f22
#define a5	%f24
#define a6	%f26
#define a7	%f28
#define a8	%f30

#define a9	%f32
#define a10	%f34
#define a11	%f36
#define a12	%f38
#define a13	%f40
#define a14	%f42
#define a15	%f44
#define a16	%f46

#define b1	%f48
#define b2	%f50
#define b3	%f52
#define b4	%f54
#define b5	%f56
#define b6	%f58
#define b7	%f60
#define b8	%f62

#define FZERO	%f60
#define ALPHA	%f62

#else
#define t1	%f0
#define t2	%f1
#define t3	%f2
#define t4	%f3

#define c1	%f4
#define c2	%f5
#define c3	%f6
#define c4	%f7

#define a1	%f8
#define a2	%f9
#define a3	%f10
#define a4	%f11
#define a5	%f12
#define a6	%f13
#define a7	%f14
#define a8	%f15

#define a9	%f16
#define a10	%f17
#define a11	%f18
#define a12	%f19
#define a13	%f20
#define a14	%f21
#define a15	%f22
#define a16	%f23

#define b1	%f24
#define b2	%f25
#define b3	%f26
#define b4	%f27
#define b5	%f28
#define b6	%f29
#define b7	%f30
#define b8	%f31

#define FZERO	%f30
#define ALPHA	%f31
#endif

#ifndef __64BIT__
#define STACK_FZERO	[%sp + STACK_START +  8]
#define STACK_ALPHA	[%sp + STACK_START + 16]
#else
#define STACK_FZERO	[%sp + STACK_START + 32]
#define STACK_ALPHA	[%sp + STACK_START + 40]
#endif

#ifdef DOUBLE
#define PREFETCHSIZE 36
#else
#define PREFETCHSIZE 72
#endif

	PROLOGUE
	SAVESP
	nop

#ifndef __64BIT__

#ifdef DOUBLE
	st	%i3, [%sp + STACK_START + 16]   /* ALPHA */
	st	%i4, [%sp + STACK_START + 20]

	ld	[%sp + STACK_START + 28], LDA
	ld	[%sp + STACK_START + 32], X
	ld	[%sp + STACK_START + 36], INCX
	ld	[%sp + STACK_START + 40], Y
	ld	[%sp + STACK_START + 44], INCY
	ld	[%sp + STACK_START + 48], BUFFER
#else
	st	%i3, [%sp + STACK_START + 16]   /* ALPHA */

	ld	[%sp + STACK_START + 28], X
	ld	[%sp + STACK_START + 32], INCX
	ld	[%sp + STACK_START + 36], Y
	ld	[%sp + STACK_START + 40], INCY
	ld	[%sp + STACK_START + 44], BUFFER
#endif
	LDF	[%sp + STACK_START + 16], ALPHA
#else
	ldx	[%sp + STACK_START + 56], X
	ldx	[%sp + STACK_START + 64], INCX
	ldx	[%sp + STACK_START + 72], Y
	ldx	[%sp + STACK_START + 80], INCY
	ldx	[%sp + STACK_START + 88], BUFFER
#ifdef DOUBLE
	FMOV	%f6, ALPHA
	STF	%f6, STACK_ALPHA
#else
	FMOV	%f7, ALPHA
	STF	%f7, STACK_ALPHA
#endif
#endif

#ifdef DOUBLE
	FCLR(29)
#else
	FCLR(30)
#endif

	clr	IS
	mov	P, I
	sll	LDA, BASE_SHIFT, LDA
	sll	I, BASE_SHIFT, I
	smul	LDA, N, PNLDA
	sll	INCX, BASE_SHIFT, INCX
	sll	INCY, BASE_SHIFT, INCY
	sub	I, PNLDA, PNLDA

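/* Outer loop over row panels: MIN_M = min(M - IS, P) rows per pass. */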
.LL10:
	sll	IS, BASE_SHIFT, I
	sub	M, IS, MIN_M
	cmp	MIN_M, P
	nop
	movg	%icc, P, MIN_M
	nop
	cmp	INCX, SIZE
	beq	.LL100
	add	X, I, XP

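/* Non-unit INCX: pack MIN_M elements of x into BUFFER, four per iteration. */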
	sra	MIN_M, 2, I
	mov	BUFFER, XP
	cmp	I, 0
	ble,pn	%icc, .LL15
	mov	BUFFER, Y1

.LL11:
	LDF	[X], a1
	add	X, INCX, X
	LDF	[X], a2
	add	X, INCX, X
	LDF	[X], a3
	add	X, INCX, X
	LDF	[X], a4
	add	X, INCX, X

	STF	a1, [Y1 + 0 * SIZE]
	add	I, -1, I
	STF	a2, [Y1 + 1 * SIZE]
	cmp	I, 0
	STF	a3, [Y1 + 2 * SIZE]
	STF	a4, [Y1 + 3 * SIZE]
	bg,pn	%icc, .LL11
	add	Y1, 4 * SIZE, Y1

.LL15:
	and	MIN_M, 3, I
	cmp	I, 0
	ble,pn	%icc, .LL100
	nop

.LL16:
	LDF	[X], a1
	add	X, INCX, X
	add	I, -1, I
	cmp	I, 0
	nop
	STF	a1, [Y1]
	bg,pn	%icc, .LL16
	add	Y1, 1 * SIZE, Y1

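/* Column loop: two columns of A per iteration (J = N / 2). */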
.LL100:
	sra	N, 1, J
	cmp	J, 0
	ble	%icc, .LL200
	mov	Y, Y1

.LL110:
#ifdef DOUBLE
	FCLR(29)
#else
	FCLR(30)
#endif

	FMOV	FZERO, c1
	FMOV	FZERO, c2
	FMOV	FZERO, c3
	FMOV	FZERO, c4

	FMOV	FZERO, t1
	FMOV	FZERO, t2
	FMOV	FZERO, t3
	FMOV	FZERO, t4

	mov	A,  A1
	add	A,  LDA, A2
	add	A2, LDA, A

	mov	XP, X1

	sra	MIN_M, 3, I
	cmp	I, 0
	ble	%icc, .LL115
	prefetch [Y1 + 2 * SIZE], 0

	LDF	[A1 +  0 * SIZE], a1
	deccc	I
	LDF	[A1 +  1 * SIZE], a2
	LDF	[A1 +  2 * SIZE], a3
	LDF	[A1 +  3 * SIZE], a4
	LDF	[A1 +  4 * SIZE], a5
	LDF	[A1 +  5 * SIZE], a6
	LDF	[A1 +  6 * SIZE], a7
	LDF	[A1 +  7 * SIZE], a8

	LDF	[A2 +  0 * SIZE], a9
	LDF	[A2 +  1 * SIZE], a10
	LDF	[A2 +  2 * SIZE], a11
	LDF	[A2 +  3 * SIZE], a12
	LDF	[A2 +  4 * SIZE], a13
	LDF	[A2 +  5 * SIZE], a14
	LDF	[A2 +  6 * SIZE], a15
	LDF	[A2 +  7 * SIZE], a16

	LDF	[X1 +  0 * SIZE], b1
	LDF	[X1 +  1 * SIZE], b2
	LDF	[X1 +  2 * SIZE], b3
	LDF	[X1 +  3 * SIZE], b4
	LDF	[X1 +  4 * SIZE], b5
	LDF	[X1 +  5 * SIZE], b6

	ble	%icc, .LL112
	LDF	[X1 +  6 * SIZE], b7

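/* Main loop, unrolled by 8: c1/c3 accumulate A1 . x and c2/c4 accumulate */
/* A2 . x, with products staged in t1..t4 so multiplies and adds overlap. */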
.LL111:
	FADD	c1,  t1,  c1
	prefetch  [A1 +  PREFETCHSIZE * SIZE], 1
	FMUL	a1,  b1,  t1
	LDF	[A1 +  8 * SIZE], a1

	FADD	c2,  t2,  c2
	LDF	[X1 +  7 * SIZE], b8
	FMUL	a9,  b1,  t2
	LDF	[A2 +  8 * SIZE], a9

	FADD	c3,  t3,  c3
	LDF	[X1 +  8 * SIZE], b1
	FMUL	a2,  b2,  t3
	LDF	[A1 +  9 * SIZE], a2

	FADD	c4,  t4,  c4
	deccc	I
	FMUL	a10, b2,  t4
	LDF	[A2 +  9 * SIZE], a10

	FADD	c1,  t1,  c1
	LDF	[X1 +  9 * SIZE], b2
	FMUL	a3,  b3,  t1
	LDF	[A1 + 10 * SIZE], a3

	FADD	c2,  t2,  c2
	nop
	FMUL	a11, b3,  t2
	LDF	[A2 + 10 * SIZE], a11

	FADD	c3,  t3,  c3
	LDF	[X1 + 10 * SIZE], b3
	FMUL	a4,  b4,  t3
	LDF	[A1 + 11 * SIZE], a4

	FADD	c4,  t4,  c4
	nop
	FMUL	a12, b4,  t4
	LDF	[A2 + 11 * SIZE], a12

	FADD	c1,  t1,  c1
	LDF	[X1 + 11 * SIZE], b4
	FMUL	a5,  b5,  t1
	LDF	[A1 + 12 * SIZE], a5

	FADD	c2,  t2,  c2
	prefetch  [A2 +  (PREFETCHSIZE + 4) * SIZE], 1
	FMUL	a13, b5,  t2
	LDF	[A2 + 12 * SIZE], a13

	FADD	c3,  t3,  c3
	LDF	[X1 + 12 * SIZE], b5
	FMUL	a6,  b6,  t3
	LDF	[A1 + 13 * SIZE], a6

	FADD	c4,  t4,  c4
	FMUL	a14, b6,  t4
	LDF	[A2 + 13 * SIZE], a14

	FADD	c1,  t1,  c1
	LDF	[X1 + 13 * SIZE], b6
	FMUL	a7,  b7,  t1
	LDF	[A1 + 14 * SIZE], a7

	FADD	c2,  t2,  c2
	add	X1, 8 * SIZE, X1
	FMUL	a15, b7,  t2
	LDF	[A2 + 14 * SIZE], a15

	FADD	c3,  t3,  c3
	LDF	[X1 +  6 * SIZE], b7
	FMUL	a8,  b8,  t3
	LDF	[A1 + 15 * SIZE], a8

	FADD	c4,  t4,  c4
	add	A1, 8 * SIZE, A1
	FMUL	a16, b8,  t4
	LDF	[A2 + 15 * SIZE], a16

	bg,pn	%icc, .LL111
	add	A2, 8 * SIZE, A2

.LL112:
	FADD	c1,  t1,  c1
	LDF	[X1 + 7 * SIZE], b8
	FMUL	a1,  b1,  t1
	add	A1, 8 * SIZE, A1

	FADD	c2,  t2,  c2
	add	A2, 8 * SIZE, A2
	FMUL	a9,  b1,  t2
	add	X1, 8 * SIZE, X1

	FADD	c3,  t3,  c3
	FMUL	a2,  b2,  t3
	FADD	c4,  t4,  c4
	FMUL	a10, b2,  t4

	FADD	c1,  t1,  c1
	FMUL	a3,  b3,  t1
	FADD	c2,  t2,  c2
	FMUL	a11, b3,  t2

	FADD	c3,  t3,  c3
	FMUL	a4,  b4,  t3
	FADD	c4,  t4,  c4
	FMUL	a12, b4,  t4

	FADD	c1,  t1,  c1
	FMUL	a5,  b5,  t1
	FADD	c2,  t2,  c2
	FMUL	a13, b5,  t2

	FADD	c3,  t3,  c3
	FMUL	a6,  b6,  t3
	FADD	c4,  t4,  c4
	FMUL	a14, b6,  t4

	FADD	c1,  t1,  c1
	FMUL	a7,  b7,  t1
	FADD	c2,  t2,  c2
	FMUL	a15, b7,  t2

	FADD	c3,  t3,  c3
	FMUL	a8,  b8,  t3
	FADD	c4,  t4,  c4
	FMUL	a16, b8,  t4

.LL115:
	andcc	MIN_M, 7, I
	ble	%icc, .LL119
	mov	Y1, Y2

	LDF	[X1 + 0 * SIZE], b1
	deccc	I
	LDF	[A1 + 0 * SIZE], a1
	ble	%icc, .LL117
	LDF	[A2 + 0 * SIZE], a2

.LL116:
	FADD	c1, t1, c1
	add	X1, 1 * SIZE, X1
	FMUL	a1, b1, t1
	LDF	[A1 + 1 * SIZE], a1

	FADD	c2, t2, c2
	add	A1, 1 * SIZE, A1
	FMUL	a2, b1, t2
	LDF	[X1 + 0 * SIZE], b1

	add	A2, 1 * SIZE, A2
	deccc	I
	bg,pn	%icc, .LL116
	LDF	[A2 + 0 * SIZE], a2

.LL117:
	FADD	c1, t1, c1
	add	X1, 1 * SIZE, X1
	FADD	c2, t2, c2
	add	A1, 1 * SIZE, A1

	FMUL	a1, b1, t1
	add	A2, 1 * SIZE, A2
	FMUL	a2, b1, t2
	nop

.LL119:
	FADD	c1, t1, c1
	FADD	c2, t2, c2
	FADD	c3, t3, c3
	FADD	c4, t4, c4

	FADD	c1, c3, c1
	FADD	c2, c4, c2

	LDF	[Y1], a1
	LDF	[Y1 + INCY], a2

	add	Y1, INCY, Y1
	add	Y1, INCY, Y1

	LDF	STACK_ALPHA, ALPHA

	FMUL	ALPHA, c1, c1
	FMUL	ALPHA, c2, c2
	FADD	a1, c1, a1
	FADD	a2, c2, a2

	STF	a1, [Y2]
	STF	a2, [Y2 + INCY]

	deccc	J
	bg	%icc, .LL110
#ifdef DOUBLE
	FCLR(29)
#else
	FCLR(30)
#endif

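/* Remaining single column when N is odd: one dot product, unrolled by 8. */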
.LL200:
	andcc	N, 1, J
	nop
	ble	%icc, .LL400
	FMOV	FZERO, c1

.LL310:
	FMOV	FZERO, t1
	sra	MIN_M, 3, I
	FMOV	FZERO, c2
	mov	A, A1
	FMOV	FZERO, t2
	add	A, LDA, A
	FMOV	FZERO, t3
	cmp	I, 0
	FMOV	FZERO, t4
	ble	%icc, .LL315
	mov	XP, X1

	LDF	[A1 + 0 * SIZE], a1
	LDF	[A1 + 1 * SIZE], a2
	LDF	[A1 + 2 * SIZE], a3
	LDF	[A1 + 3 * SIZE], a4
	LDF	[A1 + 4 * SIZE], a5
	LDF	[A1 + 5 * SIZE], a6
	LDF	[A1 + 6 * SIZE], a7
	LDF	[A1 + 7 * SIZE], a8
	add	A1, 8 * SIZE, A1

	LDF	[X1 + 0 * SIZE], a9
	add	I, -1, I
	LDF	[X1 + 1 * SIZE], a10
	cmp	I, 0
	LDF	[X1 + 2 * SIZE], a11
	LDF	[X1 + 3 * SIZE], a12
	LDF	[X1 + 4 * SIZE], a13
	LDF	[X1 + 5 * SIZE], a14
	LDF	[X1 + 6 * SIZE], a15
	LDF	[X1 + 7 * SIZE], a16
	ble	%icc, .LL312
	add	X1, 8 * SIZE, X1

.LL311:
	prefetch [A1 + PREFETCHSIZE * SIZE], 1

	FADD	c1, t1, c1
	FMUL	a1, a9, t1
	LDF	[A1 + 0 * SIZE], a1
	LDF	[X1 + 0 * SIZE], a9

	FADD	c2, t2, c2
	FMUL	a2, a10, t2
	LDF	[A1 + 1 * SIZE], a2
	LDF	[X1 + 1 * SIZE], a10

	FADD	c1, t3, c1
	add	I, -1, I
	FMUL	a3, a11, t3
	LDF	[A1 + 2 * SIZE], a3
	LDF	[X1 + 2 * SIZE], a11

	FADD	c2, t4, c2
	cmp	I, 0
	FMUL	a4, a12, t4
	LDF	[A1 + 3 * SIZE], a4
	LDF	[X1 + 3 * SIZE], a12

	FADD	c1, t1, c1
	nop
	FMUL	a5, a13, t1
	LDF	[A1 + 4 * SIZE], a5
	LDF	[X1 + 4 * SIZE], a13

	FADD	c2, t2, c2
	nop
	FMUL	a6, a14, t2
	LDF	[A1 + 5 * SIZE], a6
	LDF	[X1 + 5 * SIZE], a14

	FADD	c1, t3, c1
	FMUL	a7, a15, t3
	LDF	[A1 + 6 * SIZE], a7
	LDF	[X1 + 6 * SIZE], a15

	FADD	c2, t4, c2
	add	X1, 8 * SIZE, X1
	FMUL	a8, a16, t4
	LDF	[A1 + 7 * SIZE], a8
	add	A1, 8 * SIZE, A1
	bg,pn	%icc, .LL311
	LDF	[X1 - 1 * SIZE], a16

.LL312:
	FADD	c1, t1, c1
	FMUL	a1, a9, t1
	FADD	c2, t2, c2
	FMUL	a2, a10, t2
	FADD	c1, t3, c1
	FMUL	a3, a11, t3
	FADD	c2, t4, c2
	FMUL	a4, a12, t4

	FADD	c1, t1, c1
	FMUL	a5, a13, t1
	FADD	c2, t2, c2
	FMUL	a6, a14, t2
	FADD	c1, t3, c1
	FMUL	a7, a15, t3
	FADD	c2, t4, c2
	FMUL	a8, a16, t4

.LL315:
	and	MIN_M, 7, I
	cmp	I, 0
	ble	%icc, .LL319
	nop

.LL316:
	LDF	[A1 + 0 * SIZE], a1
	add	A1, 1 * SIZE, A1
	LDF	[X1 + 0 * SIZE], b1
	nop

	FADD	c1, t1, c1
	nop
	add	I, -1, I
	FMUL	a1, b1, t1
	nop
	cmp	I, 0
	bg,pn	%icc, .LL316
	add	X1, 1 * SIZE, X1

.LL319:
	FADD	c1, t1, c1
	nop
	FADD	c2, t2, c2
	nop
	FADD	c1, t3, c1
	FADD	c2, t4, c2

	FADD	c1, c2, c1

	FMUL	ALPHA, c1, c1
	LDF	[Y1 + 0 * SIZE], a1
	FADD	a1, c1, a1
	STF	a1, [Y1 + 0 * SIZE]
	add	Y1, INCY, Y1

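/* Next row panel: A moves down P rows and back to the first column */
/* (PNLDA = P * SIZE - N * LDA).                                     */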
.LL400:
	add	IS, P, IS
	cmp	IS, M
	bl	%icc, .LL10
	add	A, PNLDA, A

.LL999:
	return	%i7 + 8
	clr	%o0

	EPILOGUE
