/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* Integer registers: incoming arguments and the loop counter. */
#define N	%i0	/* element count                               */
#define X	%i1	/* pointer into the first vector               */
#define INCX	%i2	/* stride of X (scaled to bytes by the kernel) */
#define Y	%i3	/* pointer into the second vector              */
#define INCY	%i4	/* stride of Y (scaled to bytes by the kernel) */
#define I	%i5	/* loop counter                                */

/* Copies of X and Y taken on entry to the strided path; this file */
/* only ever writes them.                                          */
#define XX	%l0
#define YY	%l1

/* Floating-point working registers:                               */
/*   a1..a8  components loaded from X                              */
/*   b1..b8  components loaded from Y                              */
/*   c1..c8  products (also reused for sums in the short loops)    */
/*   t1..t4  results waiting to be stored in the unrolled loop     */
/* The DOUBLE build uses even-numbered registers only; the single  */
/* precision build uses consecutive registers.                     */
#ifdef DOUBLE
#define a1	%f4
#define a2	%f6
#define a3	%f8
#define a4	%f10
#define a5	%f12
#define a6	%f14
#define a7	%f16
#define a8	%f18
#define b1	%f20
#define b2	%f22
#define b3	%f24
#define b4	%f26
#define b5	%f28
#define b6	%f30
#define b7	%f32
#define b8	%f34

#define c1	%f36
#define c2	%f38
#define c3	%f40
#define c4	%f42
#define c5	%f44
#define c6	%f46
#define c7	%f48
#define c8	%f50

#define t1	%f52
#define t2	%f54
#define t3	%f56
#define t4	%f58
#else
#define a1	%f2
#define a2	%f3
#define a3	%f4
#define a4	%f5
#define a5	%f6
#define a6	%f7
#define a7	%f8
#define a8	%f9
#define b1	%f10
#define b2	%f11
#define b3	%f12
#define b4	%f13
#define b5	%f14
#define b6	%f15
#define b7	%f16
#define b8	%f17

#define c1	%f18
#define c2	%f19
#define c3	%f20
#define c4	%f21
#define c5	%f22
#define c6	%f23
#define c7	%f24
#define c8	%f25

#define t1	%f26
#define t2	%f27
#define t3	%f28
#define t4	%f29
#endif

/* Rotation coefficients; they stay in these registers for the     */
/* whole routine.                                                  */
#ifdef DOUBLE
#define C	%f0
#define S	%f2
#else
#define C	%f0
#define S	%f1
#endif

/*
 * Plane-rotation kernel for two vectors whose elements consist of two
 * consecutive FLOAT components (strides are scaled by ZBASE_SHIFT and
 * the unit stride is 2 * SIZE bytes -- presumably the real/imaginary
 * parts of complex numbers).
 *
 * For every component, with x read from X and y read from Y:
 *
 *	x' = C * x + S * y	(stored back to X)
 *	y' = C * y - S * x	(stored back to Y)
 *
 * This assumes FMUL/FADD/FSUB from common.h are the precision-selected
 * SPARC multiply/add/subtract (FSUB r1, r2, rd giving rd = r1 - r2).
 *
 * Inputs:
 *	N			element count; nothing is done when N <= 0
 *	X, INCX			first vector and its stride in elements
 *	Y, INCY			second vector and its stride in elements
 *	C, S			rotation coefficients, fetched below from the
 *				stack (32-bit build) or from FP argument
 *				registers (__64BIT__ build)
 *
 * Code paths:
 *	.LL11/.LL12	both strides are one element: four elements per
 *			iteration, with the loads for the following group
 *			interleaved with the arithmetic and stores of the
 *			current one
 *	.LL16		unit-stride remainder (N & 3 elements)
 *	.LL51		any other stride: four elements per iteration
 *	.LL56		strided remainder (N & 3 elements)
 */
	PROLOGUE
	SAVESP

	/* Fetch the rotation coefficients into C and S. */
#ifndef __64BIT__
	/* %i5 is written to the stack slot that C is then loaded from.  */
	/* NOTE(review): the slot layout depends on STACK_START / SAVESP */
	/* in common.h -- confirm there.                                 */
#ifdef DOUBLE
	st	%i5, [%sp + STACK_START + 24]

	LDF	[%sp + STACK_START + 24], C
	LDF	[%sp + STACK_START + 32], S
#else
	st	%i5, [%sp + STACK_START + 24]

	LDF	[%sp + STACK_START + 24], C
	LDF	[%sp + STACK_START + 28], S
#endif
#else
	/* 64-bit build: the coefficients arrive in FP registers. */
#ifdef DOUBLE
	FMOV	%f10, C
	FMOV	%f12, S
#else
	FMOV	%f11, C
	FMOV	%f13, S
#endif
#endif

	/* Nothing to do for N <= 0. */
	cmp	N, 0
	ble	.LL19
	nop

	/* Convert both strides from elements to bytes. */
	sll	INCX, ZBASE_SHIFT, INCX
	sll	INCY, ZBASE_SHIFT, INCY

	/* Take the generic strided path unless both strides are exactly */
	/* one element (2 * SIZE bytes).                                 */
	cmp	INCX, 2 * SIZE
	bne	.LL50
	nop

	cmp	INCY, 2 * SIZE
	bne	.LL50
	nop

	/* I = N / 4 groups of four elements; skip to the remainder loop */
	/* when there is no full group.                                  */
	sra	N, 2, I
	cmp	I, 0
	ble,pn	%icc, .LL15
	nop

	/* Preload the first group: components 0..7 of X and Y. */
	LDF	[X +  0 * SIZE], a1
	LDF	[Y +  0 * SIZE], b1
	LDF	[X +  1 * SIZE], a2
	LDF	[Y +  1 * SIZE], b2
	LDF	[X +  2 * SIZE], a3
	LDF	[Y +  2 * SIZE], b3
	LDF	[X +  3 * SIZE], a4
	LDF	[Y +  3 * SIZE], b4

	LDF	[X +  4 * SIZE], a5
	LDF	[Y +  4 * SIZE], b5
	LDF	[X +  5 * SIZE], a6
	LDF	[Y +  5 * SIZE], b6
	LDF	[X +  6 * SIZE], a7
	LDF	[Y +  6 * SIZE], b7
	LDF	[X +  7 * SIZE], a8
	LDF	[Y +  7 * SIZE], b8

	/* Start the arithmetic for components 0 and 1 and already reload */
	/* a1/b1/a2/b2 with components 8 and 9 for the following group.   */
	/* NOTE(review): components 8 and 9 are read even when this is    */
	/* the last full group (they are then unused by .LL12), i.e. the  */
	/* routine reads two components past the group it processes.      */
	FMUL	C, a1, c1
	FMUL	S, b1, c2
	FMUL	C, b1, c3
	LDF	[Y +  8 * SIZE], b1
	FMUL	S, a1, c4
	LDF	[X +  8 * SIZE], a1

	FMUL	C, a2, c5
	FMUL	S, b2, c6
	FADD	c1, c2, t1

	FMUL	C, b2, c7
	LDF	[Y +  9 * SIZE], b2
	FMUL	S, a2, c8
	LDF	[X +  9 * SIZE], a2
	FSUB	c3, c4, t2

	/* With a single group left, go straight to the drain code. */
	addcc	I, -1, I
	ble,pt	%icc, .LL12
	nop

#define PREFETCHSIZE 64

	/* Main unit-stride loop.  Each pass stores the eight results of  */
	/* the current group to X and Y, loads components 10..15 of the   */
	/* next group and components 16/17 of the one after that, and     */
	/* leaves t1/t2 plus c5..c8 pending for the next pass.  The nops  */
	/* are part of the original instruction scheduling.               */
.LL11:
	FMUL	C, a3, c1
	nop
	prefetch [Y  + PREFETCHSIZE * SIZE], 1
	nop

	FMUL	S, b3, c2
	STF	t1, [X +  0 * SIZE]
	FADD	c5, c6, t3
	nop

	FMUL	C, b3, c3
	LDF	[Y + 10 * SIZE], b3
	nop
	nop

	FMUL	S, a3, c4
	STF	t2, [Y +  0 * SIZE]
	FSUB	c7, c8, t4
	nop

	FMUL	C, a4, c5
	LDF	[X + 10 * SIZE], a3
	nop
	nop

	FMUL	S, b4, c6
	STF	t3, [X +  1 * SIZE]
	FADD	c1, c2, t1
	nop

	FMUL	C, b4, c7
	LDF	[Y + 11 * SIZE], b4
	nop
	nop

	FMUL	S, a4, c8
	STF	t4, [Y +  1 * SIZE]
	FSUB	c3, c4, t2
	nop

	FMUL	C, a5, c1
	LDF	[X + 11 * SIZE], a4
	nop
	nop

	FMUL	S, b5, c2
	STF	t1, [X +  2 * SIZE]
	FADD	c5, c6, t3
	nop

	FMUL	C, b5, c3
	LDF	[Y + 12 * SIZE], b5
	nop
	nop

	FMUL	S, a5, c4
	STF	t2, [Y +  2 * SIZE]
	FSUB	c7, c8, t4
	nop

	FMUL	C, a6, c5
	LDF	[X + 12 * SIZE], a5
	nop
	nop

	FMUL	S, b6, c6
	STF	t3, [X +  3 * SIZE]
	FADD	c1, c2, t1
	nop

	FMUL	C, b6, c7
	LDF	[Y + 13 * SIZE], b6
	nop
	nop

	FMUL	S, a6, c8
	STF	t4, [Y +  3 * SIZE]
	FSUB	c3, c4, t2
	nop

	FMUL	C, a7, c1
	LDF	[X + 13 * SIZE], a6
	nop
	nop

	FMUL	S, b7, c2
	STF	t1, [X +  4 * SIZE]
	FADD	c5, c6, t3
	nop

	FMUL	C, b7, c3
	LDF	[Y + 14 * SIZE], b7
	nop
	nop

	FMUL	S, a7, c4
	STF	t2, [Y +  4 * SIZE]
	FSUB	c7, c8, t4
	nop

	FMUL	C, a8, c5
	LDF	[X + 14 * SIZE], a7
	nop
	nop

	FMUL	S, b8, c6
	STF	t3, [X +  5 * SIZE]
	FADD	c1, c2, t1
	nop

	FMUL	C, b8, c7
	LDF	[Y + 15 * SIZE], b8
	nop
	nop

	FMUL	S, a8, c8
	STF	t4, [Y +  5 * SIZE]
	FSUB	c3, c4, t2
	nop

	/* From here on a1/b1/a2/b2 hold components 8 and 9, i.e. the    */
	/* start of the next group.  The condition codes set by this     */
	/* addcc are consumed by the bg at the bottom of the loop; only  */
	/* non-cc instructions follow.                                   */
	FMUL	C, a1, c1
	LDF	[X + 15 * SIZE], a8
	addcc	I, -1, I
	nop

	FMUL	S, b1, c2
	STF	t1, [X +  6 * SIZE]
	FADD	c5, c6, t3
	nop

	FMUL	C, b1, c3
	LDF	[Y + 16 * SIZE], b1
	nop
	nop

	FMUL	S, a1, c4
	STF	t2, [Y +  6 * SIZE]
	FSUB	c7, c8, t4
	nop

	FMUL	C, a2, c5
	LDF	[X + 16 * SIZE], a1
	add	Y, 8 * SIZE, Y		/* Y now points at the next group */
	nop

	FMUL	S, b2, c6
	STF	t3, [X +  7 * SIZE]
	FADD	c1, c2, t1
	nop

	FMUL	C, b2, c7
	LDF	[Y +  9 * SIZE], b2	/* relative to the advanced Y */
	add	X, 8 * SIZE, X		/* X now points at the next group */
	nop

	FMUL	S, a2, c8
	STF	t4, [Y -  1 * SIZE]	/* component 7 of the previous group */
	FSUB	c3, c4, t2
	nop

	bg,pt	%icc, .LL11
	LDF	[X +  9 * SIZE], a2	/* delay slot; relative to the advanced X */

	/* Drain: finish the last full group without loading anything. */
.LL12:
	FMUL	C, a3, c1
	FMUL	S, b3, c2
	STF	t1, [X +  0 * SIZE]
	FADD	c5, c6, t3

	FMUL	C, b3, c3
	FMUL	S, a3, c4
	STF	t2, [Y +  0 * SIZE]
	FSUB	c7, c8, t4

	FMUL	C, a4, c5
	FMUL	S, b4, c6
	STF	t3, [X +  1 * SIZE]
	FADD	c1, c2, t1

	FMUL	C, b4, c7
	FMUL	S, a4, c8
	STF	t4, [Y +  1 * SIZE]
	FSUB	c3, c4, t2

	FMUL	C, a5, c1
	FMUL	S, b5, c2
	STF	t1, [X +  2 * SIZE]
	FADD	c5, c6, t3

	FMUL	C, b5, c3
	FMUL	S, a5, c4
	STF	t2, [Y +  2 * SIZE]
	FSUB	c7, c8, t4

	FMUL	C, a6, c5
	FMUL	S, b6, c6
	STF	t3, [X +  3 * SIZE]
	FADD	c1, c2, t1

	FMUL	C, b6, c7
	FMUL	S, a6, c8
	STF	t4, [Y +  3 * SIZE]
	FSUB	c3, c4, t2

	FMUL	C, a7, c1
	FMUL	S, b7, c2
	STF	t1, [X +  4 * SIZE]
	FADD	c5, c6, t3

	FMUL	C, b7, c3
	FMUL	S, a7, c4
	STF	t2, [Y +  4 * SIZE]
	FSUB	c7, c8, t4

	FMUL	C, a8, c5
	FMUL	S, b8, c6
	STF	t3, [X +  5 * SIZE]
	FADD	c1, c2, t1

	FMUL	C, b8, c7
	FMUL	S, a8, c8
	STF	t4, [Y +  5 * SIZE]
	FSUB	c3, c4, t2

	FADD	c5, c6, t3
	STF	t1, [X +  6 * SIZE]

	FSUB	c7, c8, t4
	STF	t2, [Y +  6 * SIZE]

	STF	t3, [X +  7 * SIZE]
	STF	t4, [Y +  7 * SIZE]

	add	X, 8 * SIZE, X
	add	Y, 8 * SIZE, Y

	/* Unit-stride remainder: I = N & 3 elements, one per iteration. */
.LL15:
	andcc	N, 3, I
	nop
	ble,a,pn %icc, .LL19
	nop

.LL16:
	LDF	[X + 0 * SIZE], a1
	LDF	[Y + 0 * SIZE], b1
	LDF	[X + 1 * SIZE], a2
	LDF	[Y + 1 * SIZE], b2

	/* The pointers are advanced early, so the stores below use */
	/* negative offsets.                                        */
	FMUL	C, a1, c1
	add	X, 2 * SIZE, X
	FMUL	S, b1, c2
	add	Y, 2 * SIZE, Y

	FMUL	C, b1, c3
	addcc	I, -1, I
	FMUL	S, a1, c4
	nop

	FMUL	C, a2, c5
	FMUL	S, b2, c6
	FADD	c1, c2, c2

	FMUL	C, b2, c7
	FMUL	S, a2, c8
	FSUB	c3, c4, c4

	STF	c2, [X - 2 * SIZE]
	FADD	c5, c6, c6
	STF	c4, [Y - 2 * SIZE]
	FSUB	c7, c8, c8

	STF	c6, [X - 1 * SIZE]
	bg,pt	%icc, .LL16
	STF	c8, [Y - 1 * SIZE]	/* delay slot */

.LL19:
	return	%i7 + 8
	nop

	/* Generic path: at least one stride differs from one element. */
.LL50:
	/* XX and YY receive the starting pointers; nothing in this */
	/* routine reads them afterwards.                           */
	mov	X, XX
	mov	Y, YY

	sra	N, 2, I
	cmp	I, 0
	ble,pn	%icc, .LL55
	nop

	/* Four elements per iteration; each one is loaded, rotated, */
	/* stored, and then X and Y are stepped by their byte strides. */
.LL51:
	/* element 1 of 4 */
	LDF	[X + 0 * SIZE], a1
	LDF	[Y + 0 * SIZE], b1
	LDF	[X + 1 * SIZE], a2
	LDF	[Y + 1 * SIZE], b2

	FMUL	C, a1, c1
	FMUL	S, b1, c2
	FMUL	C, b1, c3
	FMUL	S, a1, c4

	FMUL	C, a2, c5
	nop
	FMUL	S, b2, c6
	FADD	c1, c2, c2

	FMUL	C, b2, c7
	nop
	FMUL	S, a2, c8
	FSUB	c3, c4, c4

	STF	c2, [X + 0 * SIZE]
	FADD	c5, c6, c6
	STF	c4, [Y + 0 * SIZE]
	FSUB	c7, c8, c8

	STF	c6, [X + 1 * SIZE]
	add	X, INCX, X
	STF	c8, [Y + 1 * SIZE]
	add	Y, INCY, Y

	/* element 2 of 4 */
	LDF	[X + 0 * SIZE], a1
	LDF	[Y + 0 * SIZE], b1
	LDF	[X + 1 * SIZE], a2
	LDF	[Y + 1 * SIZE], b2

	FMUL	C, a1, c1
	FMUL	S, b1, c2
	FMUL	C, b1, c3
	FMUL	S, a1, c4

	FMUL	C, a2, c5
	nop
	FMUL	S, b2, c6
	FADD	c1, c2, c2

	FMUL	C, b2, c7
	nop
	FMUL	S, a2, c8
	FSUB	c3, c4, c4

	STF	c2, [X + 0 * SIZE]
	FADD	c5, c6, c6
	STF	c4, [Y + 0 * SIZE]
	FSUB	c7, c8, c8

	STF	c6, [X + 1 * SIZE]
	add	X, INCX, X
	STF	c8, [Y + 1 * SIZE]
	add	Y, INCY, Y

	/* element 3 of 4 */
	LDF	[X + 0 * SIZE], a1
	LDF	[Y + 0 * SIZE], b1
	LDF	[X + 1 * SIZE], a2
	LDF	[Y + 1 * SIZE], b2

	FMUL	C, a1, c1
	FMUL	S, b1, c2
	FMUL	C, b1, c3
	FMUL	S, a1, c4

	FMUL	C, a2, c5
	nop
	FMUL	S, b2, c6
	FADD	c1, c2, c2

	FMUL	C, b2, c7
	nop
	FMUL	S, a2, c8
	FSUB	c3, c4, c4

	STF	c2, [X + 0 * SIZE]
	FADD	c5, c6, c6
	STF	c4, [Y + 0 * SIZE]
	FSUB	c7, c8, c8

	STF	c6, [X + 1 * SIZE]
	add	X, INCX, X
	STF	c8, [Y + 1 * SIZE]
	add	Y, INCY, Y

	/* element 4 of 4 */
	LDF	[X + 0 * SIZE], a1
	LDF	[Y + 0 * SIZE], b1
	LDF	[X + 1 * SIZE], a2
	LDF	[Y + 1 * SIZE], b2

	FMUL	C, a1, c1
	FMUL	S, b1, c2
	FMUL	C, b1, c3
	FMUL	S, a1, c4

	FMUL	C, a2, c5
	nop
	FMUL	S, b2, c6
	FADD	c1, c2, c2

	FMUL	C, b2, c7
	nop
	FMUL	S, a2, c8
	FSUB	c3, c4, c4

	STF	c2, [X + 0 * SIZE]
	FADD	c5, c6, c6
	STF	c4, [Y + 0 * SIZE]
	FSUB	c7, c8, c8

	STF	c6, [X + 1 * SIZE]
	add	X, INCX, X
	STF	c8, [Y + 1 * SIZE]
	add	Y, INCY, Y

	addcc	I, -1, I
	bg,pt	%icc, .LL51
	nop

	/* Strided remainder: I = N & 3 elements, one per iteration. */
.LL55:
	andcc	N, 3, I
	nop
	ble	%icc, .LL59
	nop

.LL56:
	LDF	[X + 0 * SIZE], a1
	LDF	[Y + 0 * SIZE], b1
	LDF	[X + 1 * SIZE], a2
	LDF	[Y + 1 * SIZE], b2

	FMUL	C, a1, c1
	FMUL	S, b1, c2
	FMUL	C, b1, c3
	FMUL	S, a1, c4

	FMUL	C, a2, c5
	addcc	I, -1, I
	FMUL	S, b2, c6
	FADD	c1, c2, c2

	FMUL	C, b2, c7
	nop
	FMUL	S, a2, c8
	FSUB	c3, c4, c4

	STF	c2, [X + 0 * SIZE]
	FADD	c5, c6, c6
	STF	c4, [Y + 0 * SIZE]
	FSUB	c7, c8, c8

	STF	c6, [X + 1 * SIZE]
	add	X, INCX, X
	STF	c8, [Y + 1 * SIZE]

	bg	%icc, .LL56
	add	Y, INCY, Y		/* delay slot */

.LL59:
	return	%i7 + 8
	nop

	EPILOGUE
