1/*****************************************************************************
2Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are
7met:
8
9   1. Redistributions of source code must retain the above copyright
10      notice, this list of conditions and the following disclaimer.
11
12   2. Redistributions in binary form must reproduce the above copyright
13      notice, this list of conditions and the following disclaimer in
14      the documentation and/or other materials provided with the
15      distribution.
16   3. Neither the name of the ISCAS nor the names of its contributors may
17      be used to endorse or promote products derived from this software
18      without specific prior written permission.
19
20THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
24LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
29USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31**********************************************************************************/
32
33/*********************************************************************/
34/* Copyright 2009, 2010 The University of Texas at Austin.           */
35/* All rights reserved.                                              */
36/*                                                                   */
37/* Redistribution and use in source and binary forms, with or        */
38/* without modification, are permitted provided that the following   */
39/* conditions are met:                                               */
40/*                                                                   */
41/*   1. Redistributions of source code must retain the above         */
42/*      copyright notice, this list of conditions and the following  */
43/*      disclaimer.                                                  */
44/*                                                                   */
45/*   2. Redistributions in binary form must reproduce the above      */
46/*      copyright notice, this list of conditions and the following  */
47/*      disclaimer in the documentation and/or other materials       */
48/*      provided with the distribution.                              */
49/*                                                                   */
50/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
51/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
52/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
53/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
54/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
55/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
56/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
57/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
58/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
59/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
60/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
61/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
62/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
63/*    POSSIBILITY OF SUCH DAMAGE.                                    */
64/*                                                                   */
65/* The views and conclusions contained in the software and           */
66/* documentation are those of the authors and should not be          */
67/* interpreted as representing official policies, either expressed   */
68/* or implied, of The University of Texas at Austin.                 */
69/*********************************************************************/
70
/* ASSEMBLER is defined before common.h is included. */
71#define ASSEMBLER
72#include "common.h"
73
74
/* Look-ahead used by the PREFETCHD hints in the unit-stride loops; it is
   multiplied by SIZE there to form the offset from X or Y. */
75#define PREFETCH_DISTANCE 2016
76
/* N: number of elements to process. */
77#define N	$4
78
/* X / INCX: pointer to the vector that is only read, and its stride.  INCX is
   shifted left by BASE_SHIFT at entry before it is added to X. */
79#define X	$8
80#define INCX	$9
81
/* Y / INCY: pointer to the vector that is read and written back, and its
   stride (also shifted left by BASE_SHIFT at entry). */
82#define Y	$10
83#define INCY	$11
84
/* I: loop counter.  TEMP: scratch for the stride and alignment tests. */
85#define I	$2
86#define TEMP	$3
87
/* YY: store pointer of the strided path; Y itself is the load pointer there
   and runs ahead of YY. */
88#define YY	$5
89
/* ALPHA: the scalar operand passed to every MADD. */
90#define ALPHA	$f15
91
/* a1..a16: values loaded from X.  $f15 (ALPHA) and $f16 are skipped, so a16
   is $f17. */
92#define a1	$f0
93#define a2	$f1
94#define a3	$f2
95#define a4	$f3
96#define a5	$f4
97#define a6	$f5
98#define a7	$f6
99#define a8	$f7
100
101#define a9	$f8
102#define a10	$f9
103#define a11	$f10
104#define a12	$f11
105#define a13	$f12
106#define a14	$f13
107#define a15	$f14
108#define a16	$f17
109
/* t1..t4: MADD results waiting to be stored to Y. */
110#define t1	$f18
111#define t2	$f19
112#define t3	$f20
113#define t4	$f21
114
/* b1..b8: values loaded from Y. */
115#define b1	$f22
116#define b2	$f23
117#define b3	$f24
118#define b4	$f25
119
120#define b5	$f26
121#define b6	$f27
122#define b7	$f28
123#define b8	$f29
124
125
/* Plain register numbers of a1..a16, t1..t4 and b1..b8.  They are used as
   fields of the hand-encoded .word instructions below, so each value has to
   stay equal to the number in the matching $fNN define above. */
126#define A1	 0
127#define A2	 1
128#define A3	 2
129#define A4	 3
130#define A5	 4
131#define A6	 5
132#define A7	 6
133#define A8	 7
134
135#define A9	 8
136#define A10	 9
137#define A11	 10
138#define A12	 11
139#define A13	 12
140#define A14	 13
141#define A15	 14
142#define A16	 17
143
144#define T1	 18
145#define T2	 19
146#define T3	 20
147#define T4	 21
148
149#define B1	 22
150#define B2	 23
151#define B3	 24
152#define B4	 25
153
154#define B5	 26
155#define B6	 27
156#define B7	 28
157#define B8	 29
158
/* General-purpose register numbers of X ($8) and Y ($10), for the base field
   of the hand-encoded instructions. */
159#define X_BASE 8
160#define Y_BASE 10
161
/* gsLQC1(base, fq, ft, offset): emits one raw instruction word with 0x32 in
   bits 31..26, base in 25..21, ft in 20..16, bit 15 set, offset starting at
   bit 6, bit 5 set and fq in bits 4..0.  The outer macro parenthesizes every
   argument before the inner one combines them with << and |.
   In this file it is used as a paired load: offset k fills ft with element 2k
   and fq with element 2k+1 relative to base (presumably the Loongson
   quad-word FP load, which would need a 16-byte aligned address - confirm
   against the Loongson manual). */
162#define gsLQC1_(base,fq,ft,offset) .word (0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
163#define gsLQC1(base,fq,ft,offset) gsLQC1_((base), (fq), (ft), (offset))
164
/* gsSQC1(base, fq, ft, offset): same field layout with 0x3A in bits 31..26.
   Used here as the matching paired store of ft and fq (presumably the
   Loongson quad-word FP store - confirm against the Loongson manual). */
165#define gsSQC1_(base,fq,ft,offset) .word (0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
166#define gsSQC1(base,fq,ft,offset) gsSQC1_((base), (fq), (ft), (offset))
167
// Kernel entry.  For each of the N elements this routine stores
// MADD(y, ALPHA, x) back to y (presumably y + ALPHA * x; MADD comes from
// common.h - confirm there).
//
// Paths:
//   .L10/.L11 - both strides equal one element and both X and Y pass the
//               "address & 8 == 0" test: 16 elements per iteration using the
//               paired gsLQC1/gsSQC1 accesses.
//   .L30      - as above, but X fails the test: per 16 elements X is read
//               with one scalar load, seven paired loads and one more scalar
//               load.
//   .L20      - any other stride: 8 elements per iteration, scalar accesses.
//   .L999     - restore the saved FP registers and return.
//
// Branch delay slots are written out by hand in this file (for example the ST
// that follows "bgtz I, .L16"), so the instruction after a branch executes
// whether or not the branch is taken.
168	PROLOGUE
169
// Save the FP registers that the epilogues restore: 40 bytes of stack without
// __64BIT__, 48 bytes with it.
170#ifndef __64BIT__
171	daddiu	$sp, $sp, -40
172	sdc1	$f20, 0($sp)
173	sdc1	$f22, 8($sp)
174	sdc1	$f24, 16($sp)
175	sdc1	$f26, 24($sp)
176	sdc1	$f28, 32($sp)
177#else
178	daddiu	$sp, $sp, -48
179	sdc1	$f24, 0($sp)
180	sdc1	$f25, 8($sp)
181	sdc1	$f26, 16($sp)
182	sdc1	$f27, 24($sp)
183	sdc1	$f28, 32($sp)
184	sdc1	$f29, 40($sp)
185#endif
186
187
188
189	li	TEMP, SIZE	// TEMP = SIZE, the value a scaled unit stride must equal
190
191	blez	N, .L999	// N <= 0: nothing to do
192	dsll	INCX, INCX, BASE_SHIFT	// (delay slot) scale INCX by BASE_SHIFT
193
194	bne	INCX, TEMP, .L20	// X stride is not one element: strided path
195	dsll	INCY, INCY, BASE_SHIFT	// (delay slot) scale INCY by BASE_SHIFT
196
197	bne	INCY, TEMP, .L20	// Y stride is not one element: strided path
198
	// The andi below also occupies the delay slot of the bne above.  That is
	// harmless: the .L20 path never reads TEMP.
199	//Is the address of Y aligned to 16 bytes?
200	andi	TEMP,  Y, 8
201	beq	TEMP, $0, .L10
	// The load of a1 below occupies the delay slot of the beq, so it also runs
	// when the branch is taken; N > 0 here, so X holds at least one element.
202	//Y is not 16-byte aligned: handle one element first so that Y advances by SIZE.
203	LD	a1,  0 * SIZE(X)
204	LD	b1,  0 * SIZE(Y)
205
206	daddiu	X, X, SIZE
207	daddiu	Y, Y, SIZE
208
209	MADD	t1, b1, ALPHA, a1
210	daddiu	N, N, -1
211
212	ST	t1, -1 * SIZE(Y)	// Y was already advanced, hence the -1 offset
213	blez	N, .L999	// that was the only element
	NOP			// explicit delay slot; do not rely on the .align padding below
214	.align 5
215
// Unit-stride path, reached with Y passing the "address & 8 == 0" test.
// .L11 preloads one 16-element block, .L12 processes a block while loading the
// next one, .L13 processes the last block without loading past it.
216.L10:
217
218	dsra	I, N, 4		// I = N / 16 = number of 16-element blocks
219
220	blez	I, .L15		// fewer than 16 elements: only the tail loop runs
221	daddiu	I, I, -1	// (delay slot) .L12 runs blocks - 1 times, .L13 does the last block
222
223	//Y is aligned. Now test the X address.
224	//Is the address of X aligned to 16 bytes?
225	andi	TEMP,  X, 8
226	bne	TEMP, $0, .L30	// bit 3 of X set: take the unaligned-X variant
	// Keep this NOP.  Without it the delay slot is whatever follows the .align:
	// padding, or - if .L11 needs no padding - the first paired load at .L11,
	// issued on the X address that has just failed the alignment test.
	NOP
227	.align 5
228
229.L11:
230	//X and Y are both aligned: preload x[0..15] into a1..a16 and y[0..7] into b1..b8.
231	gsLQC1(X_BASE,A2,A1,0)
232	gsLQC1(X_BASE,A4,A3,1)
233	gsLQC1(X_BASE,A6,A5,2)
234	gsLQC1(X_BASE,A8,A7,3)
235
236	gsLQC1(X_BASE,A10,A9,4)
237	gsLQC1(X_BASE,A12,A11,5)
238	gsLQC1(X_BASE,A14,A13,6)
239	gsLQC1(X_BASE,A16,A15,7)
240
241	gsLQC1(Y_BASE,B2,B1,0)
242	gsLQC1(Y_BASE,B4,B3,1)
243	gsLQC1(Y_BASE,B6,B5,2)
244	gsLQC1(Y_BASE,B8,B7,3)
245
246	blez	I, .L13		// exactly one block: skip the steady-state loop
247	NOP
248	.align 5
249
// Steady-state loop.  Each group of four instructions computes two results,
// stores them as one pair, and reloads the two b registers it has just consumed
// with the pair that lies four pair-offsets further on in Y.  b1..b8 therefore
// serve elements 0..7 first and elements 8..15 second, and leave the loop
// holding elements 0..7 of the next block.
250.L12:
251
252	MADD	t1, b1, ALPHA, a1
253	MADD	t2, b2, ALPHA, a2
254	gsSQC1(Y_BASE, T2, T1, 0)
255	gsLQC1(Y_BASE,B2,B1,4)
256
257	MADD	t3, b3, ALPHA, a3
258	MADD	t4, b4, ALPHA, a4
259	gsSQC1(Y_BASE, T4, T3, 1)
260	gsLQC1(Y_BASE,B4,B3,5)
261
262	PREFETCHD(PREFETCH_DISTANCE*SIZE(Y))
263	PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(Y))
264
265	MADD	t1, b5, ALPHA, a5
266	MADD	t2, b6, ALPHA, a6
267	gsSQC1(Y_BASE, T2, T1, 2)
268	gsLQC1(Y_BASE,B6,B5,6)
269
270	MADD	t3, b7, ALPHA, a7
271	MADD	t4, b8, ALPHA, a8
272	gsSQC1(Y_BASE, T4, T3, 3)
273	gsLQC1(Y_BASE,B8,B7, 7)
274
275	PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(Y))
276	PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(Y))
277
	// Second half of the block; the reloads now fetch y[16..23], i.e. the first
	// half of the next block.
278	MADD	t1, b1, ALPHA, a9
279	MADD	t2, b2, ALPHA, a10
280	gsSQC1(Y_BASE, T2, T1, 4)
281	gsLQC1(Y_BASE,B2,B1,8)
282
283	MADD	t3, b3, ALPHA, a11
284	MADD	t4, b4, ALPHA, a12
285	gsSQC1(Y_BASE, T4, T3, 5)
286	gsLQC1(Y_BASE,B4,B3,9)
287
288	PREFETCHD(PREFETCH_DISTANCE*SIZE(X))
289	PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(X))
290
291	MADD	t1, b5, ALPHA, a13
292	MADD	t2, b6, ALPHA, a14
293	gsSQC1(Y_BASE, T2, T1, 6)
294	gsLQC1(Y_BASE,B6,B5,10)
295
296	MADD	t3, b7, ALPHA, a15
297	MADD	t4, b8, ALPHA, a16
298	gsSQC1(Y_BASE, T4, T3, 7)
299	gsLQC1(Y_BASE,B8,B7,11)
300
301	PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(X))
302	PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(X))
303
	// All a registers have been consumed: load x[16..31] for the next pass.
304	gsLQC1(X_BASE,A2,A1,8)
305	gsLQC1(X_BASE,A4,A3,9)
306	gsLQC1(X_BASE,A6,A5,10)
307	gsLQC1(X_BASE,A8,A7,11)
308
309	gsLQC1(X_BASE,A10,A9,12)
310	gsLQC1(X_BASE,A12,A11,13)
311	gsLQC1(X_BASE,A14,A13,14)
312	gsLQC1(X_BASE,A16,A15,15)
313
314
315	daddiu	I, I, -1
316	daddiu	Y, Y, 16 * SIZE
317
318	daddiu	X, X, 16 * SIZE
319	bgtz	I, .L12
	NOP			// explicit delay slot; do not rely on the .align padding below
320
321	.align 5
322
// Last block: same arithmetic as .L12, but only y[8..15] of this block is still
// loaded and nothing is read beyond the block.
323.L13:
324
325	MADD	t1, b1, ALPHA, a1
326	MADD	t2, b2, ALPHA, a2
327	gsSQC1(Y_BASE, T2, T1, 0)
328	gsLQC1(Y_BASE,B2,B1,4)
329
330	MADD	t3, b3, ALPHA, a3
331	MADD	t4, b4, ALPHA, a4
332	gsSQC1(Y_BASE, T4, T3, 1)
333	gsLQC1(Y_BASE,B4,B3,5)
334
335
336	MADD	t1, b5, ALPHA, a5
337	MADD	t2, b6, ALPHA, a6
338	gsSQC1(Y_BASE, T2, T1, 2)
339	gsLQC1(Y_BASE,B6,B5,6)
340
341	MADD	t3, b7, ALPHA, a7
342	MADD	t4, b8, ALPHA, a8
343	gsSQC1(Y_BASE, T4, T3, 3)
344	gsLQC1(Y_BASE,B8,B7,7)
345
346
347	MADD	t1, b1, ALPHA, a9
348	MADD	t2, b2, ALPHA, a10
349	gsSQC1(Y_BASE, T2, T1, 4)
350
351
352	MADD	t3, b3, ALPHA, a11
353	MADD	t4, b4, ALPHA, a12
354	gsSQC1(Y_BASE, T4, T3, 5)
355
356
357	MADD	t1, b5, ALPHA, a13
358	MADD	t2, b6, ALPHA, a14
359	gsSQC1(Y_BASE, T2, T1, 6)
360
361
362	MADD	t3, b7, ALPHA, a15
363	MADD	t4, b8, ALPHA, a16
364	gsSQC1(Y_BASE, T4, T3, 7)
365
366
	// Step both pointers past the block just finished; falls through to .L15.
367	daddiu	X, X, 16 * SIZE
368	daddiu	Y, Y, 16 * SIZE
369	.align 5
370
// Unit-stride tail: handles the N mod 16 elements left after the 16-element
// blocks (or all elements when there were fewer than 16), then returns.
371.L15:
372	andi	I,  N, 15	// I = N mod 16
373
374	blez	I, .L999	// no leftover elements
375	NOP
376	.align	5
377
// One element per iteration.
378.L16:
379	LD	a1,  0 * SIZE(X)
380	LD	b1,  0 * SIZE(Y)
381
382	daddiu	X, X, SIZE
383	daddiu	Y, Y, SIZE
384
385	MADD	t1, b1, ALPHA, a1
386	daddiu	I, I, -1
387
388	bgtz	I, .L16
389	ST	t1, -1 * SIZE(Y)	// (delay slot) runs on every pass; Y was already advanced, hence -1
390
391
// Return sequence for the unit-stride paths: restores the FP registers saved at
// entry, releases the stack frame and returns.  It repeats the code at .L999.
392#ifndef __64BIT__
393	ldc1	$f20, 0($sp)
394	ldc1	$f22, 8($sp)
395	ldc1	$f24, 16($sp)
396	ldc1	$f26, 24($sp)
397	ldc1	$f28, 32($sp)
398	daddiu	$sp, $sp, 40
399#else
400	ldc1	$f24, 0($sp)
401	ldc1	$f25, 8($sp)
402	ldc1	$f26, 16($sp)
403	ldc1	$f27, 24($sp)
404	ldc1	$f28, 32($sp)
405	ldc1	$f29, 40($sp)
406	daddiu	$sp, $sp, 48
407#endif
408
409	j	$31
410	NOP			// delay slot of the return jump
411	.align 5
412
// Unit-stride variant for X failing the "address & 8 == 0" test while Y passes
// it.  X is advanced by one element so that it passes the test as well (this
// assumes X is at least SIZE-aligned).  Relative to the start of a 16-element
// block, a1 then comes from a scalar load of element 0, a2..a15 from seven
// paired loads of elements 1..14, and a16 from a scalar load of element 15.
// The b/t register use and the Y accesses are the same as in .L11/.L12/.L13.
413.L30:
414	//Y aligned, X unaligned, INCX==INCY==1
415	//unrolled by 16
416
417	LD	a1,  0 * SIZE(X)	// element 0 of the block
418	daddiu	X, X, SIZE		// X now points at element 1
419	gsLQC1(X_BASE,A3,A2,0)
420	gsLQC1(X_BASE,A5,A4,1)
421	gsLQC1(X_BASE,A7,A6,2)
422	gsLQC1(X_BASE,A9,A8,3)
423
424	gsLQC1(X_BASE,A11,A10,4)
425	gsLQC1(X_BASE,A13,A12,5)
426	gsLQC1(X_BASE,A15,A14,6)
427	LD	a16,  14 * SIZE(X)	// element 15 of the block (X is one element in)
428
429
430	gsLQC1(Y_BASE,B2,B1,0)
431	gsLQC1(Y_BASE,B4,B3,1)
432	gsLQC1(Y_BASE,B6,B5,2)
433	gsLQC1(Y_BASE,B8,B7,3)
434
435	blez	I, .L32		// exactly one block: skip the steady-state loop
436	NOP
437	.align 5
438
// Steady-state loop: processes the current block while loading the next one.
439.L31:
440	MADD	t1, b1, ALPHA, a1
441	MADD	t2, b2, ALPHA, a2
442	gsSQC1(Y_BASE, T2, T1, 0)
443	gsLQC1(Y_BASE,B2,B1,4)
444
445	MADD	t3, b3, ALPHA, a3
446	MADD	t4, b4, ALPHA, a4
447	gsSQC1(Y_BASE, T4, T3, 1)
448	gsLQC1(Y_BASE,B4,B3,5)
449
450	PREFETCHD(PREFETCH_DISTANCE*SIZE(Y))
451	PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(Y))
452
453	MADD	t1, b5, ALPHA, a5
454	MADD	t2, b6, ALPHA, a6
455	gsSQC1(Y_BASE, T2, T1, 2)
456	gsLQC1(Y_BASE,B6,B5,6)
457
458	MADD	t3, b7, ALPHA, a7
459	MADD	t4, b8, ALPHA, a8
460	gsSQC1(Y_BASE, T4, T3, 3)
461	gsLQC1(Y_BASE,B8,B7,7)
462
463	PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(Y))
464	PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(Y))
465
466	MADD	t1, b1, ALPHA, a9
467	MADD	t2, b2, ALPHA, a10
468	gsSQC1(Y_BASE, T2, T1, 4)
469	gsLQC1(Y_BASE,B2,B1,8)
470
471	MADD	t3, b3, ALPHA, a11
472	MADD	t4, b4, ALPHA, a12
473	gsSQC1(Y_BASE, T4, T3, 5)
474	gsLQC1(Y_BASE,B4,B3,9)
475
476	PREFETCHD(PREFETCH_DISTANCE*SIZE(X))
477	PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(X))
478
479	MADD	t1, b5, ALPHA, a13
480	MADD	t2, b6, ALPHA, a14
481	gsSQC1(Y_BASE, T2, T1, 6)
482	gsLQC1(Y_BASE,B6,B5,10)
483
484	MADD	t3, b7, ALPHA, a15
485	MADD	t4, b8, ALPHA, a16
486	gsSQC1(Y_BASE, T4, T3, 7)
487	gsLQC1(Y_BASE,B8,B7,11)
488
489	PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(X))
490	PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(X))
491
	// Next block of X.  X is still one element into the current block, so
	// offset 15 is element 0 of the next block and offset 30 is its element 15.
492	LD	a1,  15 * SIZE(X)
493	gsLQC1(X_BASE,A3,A2,8)
494	gsLQC1(X_BASE,A5,A4,9)
495	gsLQC1(X_BASE,A7,A6,10)
496	gsLQC1(X_BASE,A9,A8,11)
497
498	gsLQC1(X_BASE,A11,A10,12)
499	gsLQC1(X_BASE,A13,A12,13)
500	gsLQC1(X_BASE,A15,A14,14)
501	LD	a16,  30 * SIZE(X)
502
503	daddiu	I, I, -1
504	daddiu	Y, Y, 16 * SIZE
505
506	daddiu	X, X, 16 * SIZE
507	bgtz	I, .L31
	NOP			// explicit delay slot; do not rely on the .align padding below
508
509	.align 5
510//Last block: nothing further is loaded from X and only y[8..15] from Y.
511.L32:
512
513	MADD	t1, b1, ALPHA, a1
514	MADD	t2, b2, ALPHA, a2
515	gsSQC1(Y_BASE, T2, T1, 0)
516	gsLQC1(Y_BASE,B2,B1,4)
517
518	MADD	t3, b3, ALPHA, a3
519	MADD	t4, b4, ALPHA, a4
520	gsSQC1(Y_BASE, T4, T3, 1)
521	gsLQC1(Y_BASE,B4,B3,5)
522
523
524	MADD	t1, b5, ALPHA, a5
525	MADD	t2, b6, ALPHA, a6
526	gsSQC1(Y_BASE, T2, T1, 2)
527	gsLQC1(Y_BASE,B6,B5,6)
528
529	MADD	t3, b7, ALPHA, a7
530	MADD	t4, b8, ALPHA, a8
531	gsSQC1(Y_BASE, T4, T3, 3)
532	gsLQC1(Y_BASE,B8,B7,7)
533
534
535	MADD	t1, b1, ALPHA, a9
536	MADD	t2, b2, ALPHA, a10
537	gsSQC1(Y_BASE, T2, T1, 4)
538
539
540	MADD	t3, b3, ALPHA, a11
541	MADD	t4, b4, ALPHA, a12
542	gsSQC1(Y_BASE, T4, T3, 5)
543
544
545	MADD	t1, b5, ALPHA, a13
546	MADD	t2, b6, ALPHA, a14
547	gsSQC1(Y_BASE, T2, T1, 6)
548
549
550	MADD	t3, b7, ALPHA, a15
551	MADD	t4, b8, ALPHA, a16
552	gsSQC1(Y_BASE, T4, T3, 7)
553
554
	// X is one element into the block, so 15 more elements reach the next block;
	// Y moves by a full block.
555	daddiu	X, X, 15 * SIZE
556	daddiu	Y, Y, 16 * SIZE
557
558	//Jump back to the tail loop for the remaining elements.
559	b	.L15
	NOP			// explicit delay slot; do not rely on the .align padding below
560	.align 5
561
562//INCX != 1 or INCY != 1: strided path, 8 elements per iteration.
// Y is the load pointer and runs one 8-element block ahead; YY is the store
// pointer and follows with the same stride.
563.L20:
564	dsra	I, N, 3		// I = N / 8 = number of 8-element blocks
565	move	YY, Y		// stores start where the loads start
566
567	blez	I, .L25		// fewer than 8 elements: only the tail loop runs
568	daddiu	I, I, -1	// (delay slot) .L22 runs blocks - 1 times, .L23 does the last block
569
	// Preload the first block: a1..a8 from X and b1..b8 from Y, stepping each
	// pointer by its stride after every load.
570	LD	a1,  0 * SIZE(X)
571	daddu	X, X, INCX
572	LD	b1,  0 * SIZE(Y)
573	daddu	Y, Y, INCY
574	LD	a2,  0 * SIZE(X)
575	daddu	X, X, INCX
576	LD	b2,  0 * SIZE(Y)
577	daddu	Y, Y, INCY
578	LD	a3,  0 * SIZE(X)
579	daddu	X, X, INCX
580	LD	b3,  0 * SIZE(Y)
581	daddu	Y, Y, INCY
582	LD	a4,  0 * SIZE(X)
583	daddu	X, X, INCX
584	LD	b4,  0 * SIZE(Y)
585	daddu	Y, Y, INCY
586	LD	a5,  0 * SIZE(X)
587	daddu	X, X, INCX
588	LD	b5,  0 * SIZE(Y)
589	daddu	Y, Y, INCY
590	LD	a6,  0 * SIZE(X)
591	daddu	X, X, INCX
592	LD	b6,  0 * SIZE(Y)
593	daddu	Y, Y, INCY
594	LD	a7,  0 * SIZE(X)
595	daddu	X, X, INCX
596	LD	b7,  0 * SIZE(Y)
597	daddu	Y, Y, INCY
598	LD	a8,  0 * SIZE(X)
599	daddu	X, X, INCX
600	LD	b8,  0 * SIZE(Y)
601	daddu	Y, Y, INCY
602
603	blez	I, .L23		// exactly one block: skip the steady-state loop
604	NOP
605	.align 5
606
// Steady-state loop.  Every MADD is followed by reloading the a/b pair it has
// just consumed with the matching element of the next block.  t1..t4 are each
// used twice per pass, so the first four results are stored before the second
// four MADDs overwrite them.
607.L22:
608	MADD	t1, b1, ALPHA, a1
609	LD	a1,  0 * SIZE(X)
610	LD	b1,  0 * SIZE(Y)
611	daddu	X, X, INCX
612	daddu	Y, Y, INCY
613
614	MADD	t2, b2, ALPHA, a2
615	LD	a2,  0 * SIZE(X)
616	LD	b2,  0 * SIZE(Y)
617	daddu	X, X, INCX
618	daddu	Y, Y, INCY
619
620	MADD	t3, b3, ALPHA, a3
621	LD	a3,  0 * SIZE(X)
622	LD	b3,  0 * SIZE(Y)
623	daddu	X, X, INCX
624	daddu	Y, Y, INCY
625
626	MADD	t4, b4, ALPHA, a4
627	LD	a4,  0 * SIZE(X)
628	LD	b4,  0 * SIZE(Y)
629	daddu	X, X, INCX
630	daddu	Y, Y, INCY
631
632	ST	t1,  0 * SIZE(YY)	// result 0 out before t1 is reused for result 4
633	daddu	YY, YY, INCY
634	MADD	t1, b5, ALPHA, a5
635
636	LD	a5,  0 * SIZE(X)
637	LD	b5,  0 * SIZE(Y)
638	daddu	X, X, INCX
639	daddu	Y, Y, INCY
640
641	ST	t2,  0 * SIZE(YY)
642	daddu	YY, YY, INCY
643	MADD	t2, b6, ALPHA, a6
644
645	LD	a6,  0 * SIZE(X)
646	LD	b6,  0 * SIZE(Y)
647	daddu	X, X, INCX
648	daddu	Y, Y, INCY
649
650	ST	t3,  0 * SIZE(YY)
651	daddu	YY, YY, INCY
652	MADD	t3, b7, ALPHA, a7
653
654	LD	a7,  0 * SIZE(X)
655	LD	b7,  0 * SIZE(Y)
656	daddu	X, X, INCX
657	daddu	Y, Y, INCY
658
659	ST	t4,  0 * SIZE(YY)
660	daddu	YY, YY, INCY
661	MADD	t4, b8, ALPHA, a8
662
663	LD	a8,  0 * SIZE(X)
664	daddu	X, X, INCX
665
666	LD	b8,  0 * SIZE(Y)
667	daddu	Y, Y, INCY
668
	// Store results 4..7 of this block.
669	ST	t1,  0 * SIZE(YY)
670	daddu	YY, YY, INCY
671	ST	t2,  0 * SIZE(YY)
672	daddu	YY, YY, INCY
673	ST	t3,  0 * SIZE(YY)
674	daddu	YY, YY, INCY
675	ST	t4,  0 * SIZE(YY)
676	daddiu	I, I, -1
677
678	bgtz	I, .L22
679	daddu	YY, YY, INCY	// (delay slot) step past the eighth store on every pass
680	.align 5
681
// Last block: same arithmetic and stores as .L22, with no further loads.
682.L23:
683	MADD	t1, b1, ALPHA, a1
684	MADD	t2, b2, ALPHA, a2
685	MADD	t3, b3, ALPHA, a3
686	MADD	t4, b4, ALPHA, a4
687
688	ST	t1,  0 * SIZE(YY)
689	daddu	YY, YY, INCY
690	MADD	t1, b5, ALPHA, a5
691
692	ST	t2,  0 * SIZE(YY)
693	daddu	YY, YY, INCY
694	MADD	t2, b6, ALPHA, a6
695
696	ST	t3,  0 * SIZE(YY)
697	daddu	YY, YY, INCY
698	MADD	t3, b7, ALPHA, a7
699
700	ST	t4,  0 * SIZE(YY)
701	daddu	YY, YY, INCY
702	MADD	t4, b8, ALPHA, a8
703
704	ST	t1,  0 * SIZE(YY)
705	daddu	YY, YY, INCY
706	ST	t2,  0 * SIZE(YY)
707	daddu	YY, YY, INCY
708	ST	t3,  0 * SIZE(YY)
709	daddu	YY, YY, INCY
710	ST	t4,  0 * SIZE(YY)
711	daddu	YY, YY, INCY
712	.align 5
713
// Strided tail: the N mod 8 leftover elements.  X and Y already point at the
// first unprocessed element (Y was advanced by the block loads above), and the
// store goes through Y directly, not YY.
714.L25:
715	andi	I,  N, 7	// I = N mod 8
716
717	blez	I, .L999	// no leftover elements
718	NOP
719	.align	5
720
// One element per iteration.
721.L26:
722	LD	a1,  0 * SIZE(X)
723	LD	b1,  0 * SIZE(Y)
724
725	MADD	t1, b1, ALPHA, a1
726	daddu	X, X, INCX
727
728	ST	t1,  0 * SIZE(Y)
729	daddiu	I, I, -1
730
731	bgtz	I, .L26
732	daddu	Y, Y, INCY	// (delay slot) advance Y after the store, on every pass
733	.align 5
734
// Common exit: restores the FP registers saved at entry, releases the stack
// frame (40 bytes without __64BIT__, 48 bytes with it) and returns.  Reached
// from the early-out branches and by falling through from the strided tail.
735.L999:
736
737#ifndef __64BIT__
738	ldc1	$f20, 0($sp)
739	ldc1	$f22, 8($sp)
740	ldc1	$f24, 16($sp)
741	ldc1	$f26, 24($sp)
742	ldc1	$f28, 32($sp)
743	daddiu	$sp, $sp, 40
744#else
745	ldc1	$f24, 0($sp)
746	ldc1	$f25, 8($sp)
747	ldc1	$f26, 16($sp)
748	ldc1	$f27, 24($sp)
749	ldc1	$f28, 32($sp)
750	ldc1	$f29, 40($sp)
751	daddiu	$sp, $sp, 48
752#endif
753
754	j	$31
755	NOP			// delay slot of the return jump
756
757	EPILOGUE
758