1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41#include "version.h"
42
/* Register roles used throughout this kernel. */
43#define N	$16	/* element count; N <= 0 returns without touching memory */
44#define	X	$17	/* pointer into the first vector; read and written in place */
45#define INCX	$18	/* stride of X; doubled on entry (two scalars per element) */
46#define Y	$19	/* pointer into the second vector; read and written in place */
47#define INCY	$20	/* stride of Y; doubled on entry */
48#define I	$21	/* loop trip counter */
49#define XX	$23	/* trailing store pointer for X in the strided path; raw $23 is also a scratch flag at entry */
50#define YY	$24	/* trailing store pointer for Y in the strided path; raw $24 is also a scratch flag at entry */
51
52#define C	$f10	/* first coefficient: multiplies x in x', y in y' */
53#define S	$f11	/* second coefficient: multiplies y in x', x in y' */
54
55#define PREFETCH_SIZE 80	/* look-ahead, in units of SIZE, of the lds $f31 touches in $L12 */
56
/*
 * Entry point.  For every element (two consecutive scalars) of X and Y the
 * kernel stores, in place:
 *     x' = C * x + S * y
 *     y' = C * y - S * x
 * NOTE(review): this has the shape of a real-coefficient rotation applied
 * to vectors of two-scalar (presumably complex) elements; the exported
 * symbol comes from PROLOGUE in common.h and is not visible here.
 *
 * No stack frame is allocated (.frame size 0).  LD, SIZE, PROLOGUE and
 * PROFCODE are macros defined outside this file.
 */
57	PROLOGUE
58	PROFCODE
59	.frame	$sp, 0, $26, 0
60
61#ifndef PROFILE
62	.prologue 0
63#else
64	.prologue 1
65#endif
66
67	fmov	$f21,   C	/* save incoming $f21 (presumably the 6th argument) before $f21 is reused as a product temporary */
68	LD	S, 0($sp)	/* S is read from the top of the stack (presumably the 7th argument - confirm against caller) */
69
70	addq	INCX, INCX, INCX	/* INCX *= 2: each element spans two scalars */
71	addq	INCY, INCY, INCY	/* INCY *= 2 */
72
73	cmpeq	INCX, 2,  $23	/* $23 = 1 when the doubled stride is 2, i.e. X is contiguous */
74	cmpeq	INCY, 2,  $24	/* $24 = 1 when Y is contiguous */
75	ble	N,  $L998	/* nothing to do for N <= 0 */
76
77	and	$23, $24, $23	/* both vectors contiguous? */
78	beq	$23, $L50	/* no: take the general strided path */
79
/*
 * Contiguous path: prime the software pipeline for the 4-element
 * (8-scalar) unrolled loop.  On exit from this block (entry to $L12 or
 * $L13), with x[k]/y[k] meaning the scalar at k*SIZE(X) / k*SIZE(Y):
 *     $f22 = C*x[0] + S*y[0]           (ready to store to x[0])
 *     $f24 = C*y[0] - S*x[0]           (ready to store to y[0])
 *     $f25 = C*x[1]   $f26 = S*y[1]    (products, not yet combined)
 *     $f27 = C*y[1]   $f28 = S*x[1]
 *     $f16..$f19 = x[2], y[2], x[3], y[3]
 *     $f12 = x[4]   $f13 = y[4]   $f15 = y[5]
 * Input registers are reloaded as soon as their last use has issued, so
 * the instruction order below must not be changed.
 */
80	sra	N, 2, I	/* I = N / 4 = number of full 4-element groups */
81	ble	I, $L15	/* fewer than 4 elements: remainder loop only */
82
83	LD	$f12,   0*SIZE(X)
84	LD	$f13,   0*SIZE(Y)
85	LD	$f14,   1*SIZE(X)
86	LD	$f15,   1*SIZE(Y)
87
88	LD	$f16,   2*SIZE(X)
89	LD	$f17,   2*SIZE(Y)
90	LD	$f18,   3*SIZE(X)
91	LD	$f19,   3*SIZE(Y)
92
93	MUL	C, $f12, $f21	/* $f21 is free here: its entry value was copied to C */
94	unop
95	MUL	S, $f13, $f22
96	MUL	C, $f13, $f23
97
98	LD	$f13,   4*SIZE(Y)	/* y[0] consumed above; reload with y[4] */
99	MUL	S, $f12, $f24
100	LD	$f12,   4*SIZE(X)	/* x[0] consumed above; reload with x[4] */
101	MUL	C, $f14, $f25
102
103	lda	I, -1(I)	/* the last group is handled by $L13, not by $L12 */
104	MUL	S, $f15, $f26
105	ADD	$f21, $f22, $f22	/* new x[0] */
106	MUL	C, $f15, $f27
107
108	LD	$f15,   5*SIZE(Y)	/* y[1] consumed above; reload with y[5] */
109	MUL	S, $f14, $f28
110	SUB	$f23, $f24, $f24	/* new y[0] */
111	ble	I, $L13	/* only one group: go straight to the drain code */
112	.align 4
113
/*
 * $L12: steady-state body of the contiguous path.  Each trip stores the
 * eight updated scalars x[0..7] / y[0..7] of the current group, while
 * loading and starting the products for the next group.
 *
 * State on entry, and again at the bottom of the loop:
 *     $f22 = new x[0], $f24 = new y[0]                     (ready to store)
 *     $f25/$f26/$f27/$f28 = C*x[1], S*y[1], C*y[1], S*x[1]
 *     $f16..$f19 = x[2], y[2], x[3], y[3]
 *     $f12 = x[4], $f13 = y[4], $f15 = y[5]
 * Every MUL C / MUL S pair is followed one group later by the ADD (for x)
 * or SUB (for y) that combines it, and one group after that by the ST.
 * unop instructions are scheduling padding only.
 */
114$L12:
115	MUL	C, $f16, $f21
116	lds	$f31, (PREFETCH_SIZE) * SIZE(X)	/* load into $f31, result discarded: touches X ahead of use */
117	unop
118	LD	$f14,   5*SIZE(X)
119
120	ST	$f22,   0*SIZE(X)
121	MUL	S, $f17, $f22
122	unop
123	ADD	$f25, $f26, $f26
124
125	MUL	C, $f17, $f23
126	lds	$f31, (PREFETCH_SIZE) * SIZE(Y)	/* same look-ahead touch for Y */
127	unop
128	LD	$f17,   6*SIZE(Y)
129
130	ST	$f24,   0*SIZE(Y)
131	MUL	S, $f16, $f24
132	unop
133	SUB	$f27, $f28, $f28
134
135	MUL	C, $f18, $f25
136	LD	$f16,   6*SIZE(X)
137	unop
138	unop
139
140	ST	$f26,   1*SIZE(X)
141	MUL	S, $f19, $f26
142	unop
143	ADD	$f21, $f22, $f22
144
145	MUL	C, $f19, $f27
146	unop
147	unop
148	LD	$f19,   7*SIZE(Y)
149
150	ST	$f28,   1*SIZE(Y)
151	MUL	S, $f18, $f28
152	unop
153	SUB	$f23, $f24, $f24
154
155	MUL	C, $f12, $f21
156	LD	$f18,   7*SIZE(X)
157	unop
158	unop
159
160	ST	$f22,   2*SIZE(X)
161	unop
162	MUL	S, $f13, $f22
163	ADD	$f25, $f26, $f26
164
165	MUL	C, $f13, $f23
166	LD	$f13,   8*SIZE(Y)	/* from here on the loads belong to the next group */
167	unop
168	unop
169
170	ST	$f24,   2*SIZE(Y)
171	MUL	S, $f12, $f24
172	unop
173	SUB	$f27, $f28, $f28
174
175	MUL	C, $f14, $f25
176	LD	$f12,   8*SIZE(X)
177	unop
178	unop
179
180	ST	$f26,   3*SIZE(X)
181	MUL	S, $f15, $f26
182	unop
183	ADD	$f21, $f22, $f22
184
185	MUL	C, $f15, $f27
186	LD	$f15,   9*SIZE(Y)
187	unop
188	unop
189
190	ST	$f28,   3*SIZE(Y)
191	MUL	S, $f14, $f28
192	unop
193	SUB	$f23, $f24, $f24
194
195	MUL	C, $f16, $f21
196	LD	$f14,   9*SIZE(X)
197	unop
198	unop
199
200	ST	$f22,   4*SIZE(X)
201	MUL	S, $f17, $f22
202	unop
203	ADD	$f25, $f26, $f26
204
205	MUL	C, $f17, $f23
206	LD	$f17,  10*SIZE(Y)
207	unop
208	unop
209
210	ST	$f24,   4*SIZE(Y)
211	MUL	S, $f16, $f24
212	unop
213	SUB	$f27, $f28, $f28
214
215	MUL	C, $f18, $f25
216	LD	$f16,  10*SIZE(X)
217	unop
218	unop
219
220	ST	$f26,   5*SIZE(X)
221	MUL	S, $f19, $f26
222	unop
223	ADD	$f21, $f22, $f22
224
225	MUL	C, $f19, $f27
226	LD	$f19,  11*SIZE(Y)
227	unop
228	unop
229
230	ST	$f28,   5*SIZE(Y)
231	MUL	S, $f18, $f28
232	lda	I, -1(I)	/* one more group retired; tested by the bgt at the bottom */
233	SUB	$f23, $f24, $f24
234
235	MUL	C, $f12, $f21
236	LD	$f18,  11*SIZE(X)
237	unop
238	unop
239
240	ST	$f22,   6*SIZE(X)
241	MUL	S, $f13, $f22
242	unop
243	ADD	$f25, $f26, $f26
244
245	MUL	C, $f13, $f23
246	LD	$f13,  12*SIZE(Y)
247	lda	X,   8*SIZE(X)	/* advance X to the next group; later X offsets are relative to the new base */
248	unop
249
250	ST	$f24,   6*SIZE(Y)
251	MUL	S, $f12, $f24
252	unop
253	SUB	$f27, $f28, $f28
254
255	MUL	C, $f14, $f25
256	LD	$f12,   4*SIZE(X)	/* X already advanced: this is x[12] of the old base */
257	lda	Y,   8*SIZE(Y)	/* advance Y to the next group */
258	unop
259
260	ST	$f26,  -1*SIZE(X)	/* x[7] of the group just finished */
261	MUL	S, $f15, $f26
262	unop
263	ADD	$f21, $f22, $f22
264
265	MUL	C, $f15, $f27
266	LD	$f15,   5*SIZE(Y)	/* Y already advanced: this is y[13] of the old base */
267	unop
268	unop
269
270	ST	$f28,  -1*SIZE(Y)	/* y[7] of the group just finished */
271	MUL	S, $f14, $f28
272	SUB	$f23, $f24, $f24
273	bgt	I, $L12	/* loop while more than one group remains; the last one drains in $L13 */
274	.align 4
275
/*
 * $L13: pipeline drain for the last full 4-element group of the
 * contiguous path.  It expects the same register state as the top of
 * $L12 (new x[0]/y[0] in $f22/$f24, products for x[1]/y[1] in
 * $f25..$f28, x[2..3]/y[2..3] in $f16..$f19, x[4], y[4], y[5] in
 * $f12, $f13, $f15).  It performs the same MUL/ADD/SUB/ST sequence as
 * $L12, but only loads the scalars still missing from the current group
 * (offsets 5, 6 and 7) and never reads past offset 7.  X and Y are
 * advanced by 8*SIZE at the end, so they point at the first element not
 * yet processed.
 */
276$L13:
277	MUL	C, $f16, $f21
278	LD	$f14,   5*SIZE(X)
279	unop
280	unop
281
282	ST	$f22,   0*SIZE(X)
283	MUL	S, $f17, $f22
284	unop
285	ADD	$f25, $f26, $f26
286
287	MUL	C, $f17, $f23
288	unop
289	unop
290	LD	$f17,   6*SIZE(Y)
291
292	ST	$f24,   0*SIZE(Y)
293	MUL	S, $f16, $f24
294	LD	$f16,   6*SIZE(X)
295	SUB	$f27, $f28, $f28
296
297	MUL	C, $f18, $f25
298	unop
299	unop
300	unop
301
302	ST	$f26,   1*SIZE(X)
303	MUL	S, $f19, $f26
304	unop
305	ADD	$f21, $f22, $f22
306
307	MUL	C, $f19, $f27
308	unop
309	unop
310	LD	$f19,   7*SIZE(Y)
311
312	ST	$f28,   1*SIZE(Y)
313	MUL	S, $f18, $f28
314	LD	$f18,   7*SIZE(X)	/* last load of the contiguous unrolled path */
315	SUB	$f23, $f24, $f24
316
317	MUL	C, $f12, $f21
318	unop
319	unop
320	unop
321
322	ST	$f22,   2*SIZE(X)
323	unop
324	MUL	S, $f13, $f22
325	ADD	$f25, $f26, $f26
326
327	MUL	C, $f13, $f23
328	unop
329	unop
330	unop
331
332	ST	$f24,   2*SIZE(Y)
333	MUL	S, $f12, $f24
334	unop
335	SUB	$f27, $f28, $f28
336
337	MUL	C, $f14, $f25
338	unop
339	unop
340	unop
341
342	ST	$f26,   3*SIZE(X)
343	MUL	S, $f15, $f26
344	unop
345	ADD	$f21, $f22, $f22
346
347	MUL	C, $f15, $f27
348	unop
349	unop
350	unop
351
352	ST	$f28,   3*SIZE(Y)
353	MUL	S, $f14, $f28
354	unop
355	SUB	$f23, $f24, $f24
356
357	MUL	C, $f16, $f21
358	unop
359	unop
360	unop
361
362	ST	$f22,   4*SIZE(X)
363	MUL	S, $f17, $f22
364	unop
365	ADD	$f25, $f26, $f26
366
367	MUL	C, $f17, $f23
368	unop
369	unop
370	unop
371
372	ST	$f24,   4*SIZE(Y)
373	MUL	S, $f16, $f24
374	unop
375	SUB	$f27, $f28, $f28
376
377	MUL	C, $f18, $f25
378	unop
379	unop
380	unop
381
382	ST	$f26,   5*SIZE(X)
383	MUL	S, $f19, $f26
384	unop
385	ADD	$f21, $f22, $f22
386
387	MUL	C, $f19, $f27
388	unop
389	unop
390	unop
391
392	ST	$f28,   5*SIZE(Y)
393	MUL	S, $f18, $f28
394	unop
395	SUB	$f23, $f24, $f24
396
397	ST	$f22,   6*SIZE(X)
398	ADD	$f25, $f26, $f26	/* new x[7] */
399	ST	$f24,   6*SIZE(Y)
400	SUB	$f27, $f28, $f28	/* new y[7] */
401
402	ST	$f26,   7*SIZE(X)
403	lda	X,   8*SIZE(X)	/* step past the finished group; falls through to the remainder code */
404	ST	$f28,   7*SIZE(Y)
405	lda	Y,   8*SIZE(Y)
406	.align 4
407
408
/*
 * $L15 / $L16: remainder of the contiguous path.  Reached by fall-through
 * after the unrolled groups, or directly from the entry code when N < 4.
 * Handles the last N mod 4 elements, one element (two scalars) per trip,
 * with no software pipelining:
 *     x[k] = C*x[k] + S*y[k],  y[k] = C*y[k] - S*x[k]   for k = 0, 1
 */
409$L15:
410	and	N, 3, I	/* I = N mod 4 = elements left over */
411	ble	I, $L998	/* none left: return */
412	.align 4
413
414$L16:
415	LD	$f12,   0*SIZE(X)
416	LD	$f13,   0*SIZE(Y)
417	LD	$f14,   1*SIZE(X)
418	LD	$f15,   1*SIZE(Y)
419
420	MUL	C, $f12, $f21
421	MUL	S, $f13, $f22
422	MUL	C, $f13, $f23
423	MUL	S, $f12, $f24
424
425	ADD	$f21, $f22, $f22	/* new x[0] */
426	SUB	$f23, $f24, $f24	/* new y[0] */
427
428	MUL	C, $f14, $f25
429	MUL	S, $f15, $f26
430	MUL	C, $f15, $f27
431	MUL	S, $f14, $f28
432
433	ADD	$f25, $f26, $f26	/* new x[1] */
434	SUB	$f27, $f28, $f28	/* new y[1] */
435
436	ST	$f22,   0*SIZE(X)
437	ST	$f24,   0*SIZE(Y)
438	lda	I, -1(I)
439
440	ST	$f26,   1*SIZE(X)
441	lda	X, 2 * SIZE(X)	/* next element: contiguous, so two scalars further on */
442	ST	$f28,   1*SIZE(Y)
443	lda	Y, 2 * SIZE(Y)
444
445	bgt	I, $L16
446	.align 4
447
/*
 * $L998: exit for the contiguous path and for N <= 0.
 * Zeroes $0 before returning (presumably the integer return-value
 * register - confirm against the calling standard).
 */
448$L998:
449	clr	$0
450	ret
451	.align 4
452
/*
 * $L50 / $L51: general strided path, taken when either doubled stride
 * differs from 2.  $L51 is four identical copies of a one-element update
 * (I = N / 4 trips), without software pipelining.
 *
 * Loads go through X/Y, which are advanced right after an element's two
 * scalars have been read; stores go through the trailing pointers XX/YY,
 * which are advanced after the element has been written back.  After each
 * copy XX == X and YY == Y again.
 *
 * SXADDQ is a macro defined outside this file; presumably a SIZE-scaled
 * add, i.e. "SXADDQ INCX, X, X" performs X += INCX * SIZE - confirm in
 * common.h.
 */
453$L50:
454	mov	X, XX	/* XX/YY reuse $23/$24, whose entry-time flag values are dead here */
455	mov	Y, YY
456
457	sra	N, 2, I	/* I = N / 4 = number of unrolled trips */
458	ble	I, $L55	/* fewer than 4 elements: remainder loop only */
459	.align 4
460
461$L51:
462	LD	$f12,   0*SIZE(X)
463	LD	$f13,   0*SIZE(Y)
464	LD	$f14,   1*SIZE(X)
465	SXADDQ	INCX, X, X	/* both scalars of x read: advance the load pointer */
466	LD	$f15,   1*SIZE(Y)
467	SXADDQ	INCY, Y, Y
468
469	MUL	C, $f12, $f21
470	MUL	S, $f13, $f22
471	MUL	C, $f13, $f23
472	MUL	S, $f12, $f24
473
474	ADD	$f21, $f22, $f22	/* new x[0] */
475	SUB	$f23, $f24, $f24	/* new y[0] */
476
477	MUL	C, $f14, $f25
478	MUL	S, $f15, $f26
479	MUL	C, $f15, $f27
480	MUL	S, $f14, $f28
481
482	ADD	$f25, $f26, $f26	/* new x[1] */
483	SUB	$f27, $f28, $f28	/* new y[1] */
484
485	ST	$f22,   0*SIZE(XX)
486	ST	$f24,   0*SIZE(YY)
487	ST	$f26,   1*SIZE(XX)
488	SXADDQ	INCX, XX, XX	/* element written back: advance the store pointer */
489	ST	$f28,   1*SIZE(YY)
490	SXADDQ	INCY, YY, YY
491
492
/* second element of this trip (same sequence) */
493	LD	$f12,   0*SIZE(X)
494	LD	$f13,   0*SIZE(Y)
495	LD	$f14,   1*SIZE(X)
496	SXADDQ	INCX, X, X
497	LD	$f15,   1*SIZE(Y)
498	SXADDQ	INCY, Y, Y
499
500	MUL	C, $f12, $f21
501	MUL	S, $f13, $f22
502	MUL	C, $f13, $f23
503	MUL	S, $f12, $f24
504
505	ADD	$f21, $f22, $f22
506	SUB	$f23, $f24, $f24
507
508	MUL	C, $f14, $f25
509	MUL	S, $f15, $f26
510	MUL	C, $f15, $f27
511	MUL	S, $f14, $f28
512
513	ADD	$f25, $f26, $f26
514	SUB	$f27, $f28, $f28
515
516	ST	$f22,   0*SIZE(XX)
517	ST	$f24,   0*SIZE(YY)
518	ST	$f26,   1*SIZE(XX)
519	SXADDQ	INCX, XX, XX
520	ST	$f28,   1*SIZE(YY)
521	SXADDQ	INCY, YY, YY
522
523
/* third element of this trip */
524	LD	$f12,   0*SIZE(X)
525	LD	$f13,   0*SIZE(Y)
526	LD	$f14,   1*SIZE(X)
527	SXADDQ	INCX, X, X
528	LD	$f15,   1*SIZE(Y)
529	SXADDQ	INCY, Y, Y
530
531	MUL	C, $f12, $f21
532	MUL	S, $f13, $f22
533	MUL	C, $f13, $f23
534	MUL	S, $f12, $f24
535
536	ADD	$f21, $f22, $f22
537	SUB	$f23, $f24, $f24
538
539	MUL	C, $f14, $f25
540	MUL	S, $f15, $f26
541	MUL	C, $f15, $f27
542	MUL	S, $f14, $f28
543
544	ADD	$f25, $f26, $f26
545	SUB	$f27, $f28, $f28
546
547	ST	$f22,   0*SIZE(XX)
548	ST	$f24,   0*SIZE(YY)
549	ST	$f26,   1*SIZE(XX)
550	SXADDQ	INCX, XX, XX
551	ST	$f28,   1*SIZE(YY)
552	SXADDQ	INCY, YY, YY
553
554
/* fourth element of this trip */
555	LD	$f12,   0*SIZE(X)
556	LD	$f13,   0*SIZE(Y)
557	LD	$f14,   1*SIZE(X)
558	SXADDQ	INCX, X, X
559	LD	$f15,   1*SIZE(Y)
560	SXADDQ	INCY, Y, Y
561
562	MUL	C, $f12, $f21
563	MUL	S, $f13, $f22
564	MUL	C, $f13, $f23
565	MUL	S, $f12, $f24
566
567	ADD	$f21, $f22, $f22
568	SUB	$f23, $f24, $f24
569
570	MUL	C, $f14, $f25
571	MUL	S, $f15, $f26
572	MUL	C, $f15, $f27
573	MUL	S, $f14, $f28
574
575	ADD	$f25, $f26, $f26
576	SUB	$f27, $f28, $f28
577
578	ST	$f22,   0*SIZE(XX)
579	ST	$f24,   0*SIZE(YY)
580	ST	$f26,   1*SIZE(XX)
581	SXADDQ	INCX, XX, XX
582	ST	$f28,   1*SIZE(YY)
583	SXADDQ	INCY, YY, YY
584
585	lda	I, -1(I)
586	bgt	I, $L51
587	.align 4
588
/*
 * $L55 / $L56: remainder of the strided path; handles the last N mod 4
 * elements one per trip.  Unlike $L51, this loop loads and stores through
 * X and Y only (XX/YY are not used): both scalars are stored before the
 * pointers are advanced.  X and Y arrive here pointing at the first
 * unprocessed element, because $L51 advances them once per element.
 * SXADDQ is defined outside this file (presumably X += INCX * SIZE).
 */
589$L55:
590	and	N, 3, I	/* I = N mod 4 = elements left over */
591	ble	I, $L999	/* none left: return */
592	.align 4
593
594$L56:
595	LD	$f12,   0*SIZE(X)
596	LD	$f13,   0*SIZE(Y)
597	LD	$f14,   1*SIZE(X)
598	LD	$f15,   1*SIZE(Y)
599
600	MUL	C, $f12, $f21
601	MUL	S, $f13, $f22
602	MUL	C, $f13, $f23
603	MUL	S, $f12, $f24
604
605	ADD	$f21, $f22, $f22	/* new x[0] */
606	SUB	$f23, $f24, $f24	/* new y[0] */
607
608	MUL	C, $f14, $f25
609	MUL	S, $f15, $f26
610	MUL	C, $f15, $f27
611	MUL	S, $f14, $f28
612
613	ADD	$f25, $f26, $f26	/* new x[1] */
614	SUB	$f27, $f28, $f28	/* new y[1] */
615
616	ST	$f22,   0*SIZE(X)
617	ST	$f24,   0*SIZE(Y)
618	lda	I, -1(I)
619
620	ST	$f26,   1*SIZE(X)
621	ST	$f28,   1*SIZE(Y)
622	SXADDQ	INCX, X, X	/* advance by the (doubled) stride only after both stores */
623	SXADDQ	INCY, Y, Y
624
625	bgt	I, $L56
626	.align 4
627
/*
 * $L999: exit for the strided path.  Same sequence as $L998: zero $0
 * (presumably the integer return-value register) and return.
 * EPILOGUE is a macro from outside this file that closes the function.
 */
628$L999:
629	clr	$0
630	ret
631	EPILOGUE
632