1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
/* Arguments read by the kernel: element count, the X and Y base        */
/* addresses, and their strides.  INCX and INCY are shifted left by     */
/* BASE_SHIFT on entry and are byte strides from then on.               */
#define N	r3
#define X	r6
#define INCX	r7
#define Y	r8
#define INCY	r9

/* YY is the store pointer into Y; it is copied from Y before each loop */
/* and trails the Y load pointer.  INCX2 and INCY2 are twice the byte   */
/* strides and step the paired (two element) accesses.                  */
/* INCY2 is r10, which the kernel also uses as the -16 / +16 step while */
/* saving and restoring f14-f25; it only holds the doubled stride in    */
/* between those two sequences.                                         */
#define	YY	r4
#define INCX2	r5
#define INCY2	r10

/* Scale factor.  fsmfp copies its primary half into the secondary half */
/* on entry so that the paired fpmadd scales both elements.             */
#define ALPHA	f1

/* Values loaded from X.  A9 is only used on the path where X is not    */
/* 2 * SIZE aligned.                                                    */
#define A1	f0
#define A2	f8
#define A3	f2
#define A4	f3
#define A5	f4
#define A6	f5
#define A7	f6
#define A8	f7
#define A9	f25

/* Values loaded from Y. */
#define B1	f9
#define B2	f10
#define B3	f11
#define B4	f12
#define B5	f13
#define B6	f14
#define B7	f15
#define B8	f16

/* Results ALPHA * A + B, stored back to Y through YY. */
#define C1	f17
#define C2	f18
#define C3	f19
#define C4	f20
#define C5	f21
#define C6	f22
#define C7	f23
#define C8	f24
81
82
	/*
	 * AXPY kernel: Y[i] = ALPHA * X[i] + Y[i] for N elements.
	 *
	 * Three code paths, selected once the strides are scaled to bytes:
	 *   - LL(05)-LL(18): unit stride, X is 2 * SIZE aligned once Y is.
	 *     Paired loads, fpmadd and paired stores, 16 elements per pass.
	 *   - LL(20)-LL(28): unit stride, X not 2 * SIZE aligned once Y is.
	 *     Same loop shape, with the X pairs rebuilt from cross loads
	 *     using fsmr / fpmr.
	 *   - LL(100)-LL(118): any other stride.  Scalar fmadd, 8 per pass.
	 * Each main loop is software pipelined: the operands of the first
	 * pass are loaded ahead of the loop and the last pass is drained
	 * after it.  f14-f25 are saved on entry and restored at LL(999).
	 */
	PROLOGUE
	PROFCODE

	/* Push f14-f25 below SP, 16 bytes per register, stepping by r10. */
	li	r10, -16

	stfpdux	f14, SP, r10
	stfpdux	f15, SP, r10
	stfpdux	f16, SP, r10
	stfpdux	f17, SP, r10

	stfpdux	f18, SP, r10
	stfpdux	f19, SP, r10
	stfpdux	f20, SP, r10
	stfpdux	f21, SP, r10

	stfpdux	f22, SP, r10
	stfpdux	f23, SP, r10
	stfpdux	f24, SP, r10
	stfpdux	f25, SP, r10

	/* Copy ALPHA primary into ALPHA secondary for the paired fpmadd. */
	fsmfp	ALPHA, ALPHA

	/* Element strides to byte strides (BASE_SHIFT is defined outside */
	/* this file, presumably log2 of SIZE).                           */
	slwi	INCX,  INCX, BASE_SHIFT
	slwi	INCY,  INCY, BASE_SHIFT

	/* Doubled strides step one pair at a time.  INCY2 is r10, so the */
	/* save step loaded above is overwritten here.                    */
	add	INCX2, INCX, INCX
	add	INCY2, INCY, INCY

	/* Nothing to do for N <= 0. */
	cmpwi	cr0, N, 0
	ble	LL(999)

	/* Anything other than unit stride on both vectors goes to the */
	/* scalar strided path.                                        */
	cmpwi	cr0, INCX, SIZE
	bne	LL(100)
	cmpwi	cr0, INCY, SIZE
	bne	LL(100)

	/* If Y is not 2 * SIZE aligned, process one element with scalar */
	/* operations first and advance both pointers past it.           */
	andi.	r0, Y, 2 * SIZE - 1
	beq	LL(05)

	LFD	A1,   0 * SIZE(X)
	LFD	B1,   0 * SIZE(Y)

	addi	X, X, SIZE
	addi	Y, Y, SIZE

	fmadd	C1, ALPHA, A1, B1
	addi	N, N, -1
	STFD	C1,  -1 * SIZE(Y)

LL(05):
	/* X not 2 * SIZE aligned at this point: use the cross-load path. */
	andi.	r0, X, 2 * SIZE - 1
	bne	LL(20)

	/* Update-form accesses add the step before the access, so bias */
	/* the pointers back by one step.  YY becomes the store pointer. */
	sub	X,  X, INCX2
	sub	Y,  Y, INCY2
	mr	YY, Y

	/* CTR = N / 16; go straight to the remainder code if it is zero. */
	srawi.	r0, N, 4
	mtspr	CTR,  r0
	beq-	LL(15)

	/* Prime the pipeline with 8 pairs from X and 8 pairs from Y. */
	LFPDUX	A1,   X, INCX2
	LFPDUX	B1,   Y, INCY2
	LFPDUX	A2,   X, INCX2
	LFPDUX	B2,   Y, INCY2
	LFPDUX	A3,   X, INCX2
	LFPDUX	B3,   Y, INCY2
	LFPDUX	A4,   X, INCX2
	LFPDUX	B4,   Y, INCY2

	LFPDUX	A5,   X, INCX2
	LFPDUX	B5,   Y, INCY2
	LFPDUX	A6,   X, INCX2
	LFPDUX	B6,   Y, INCY2
	LFPDUX	A7,   X, INCX2
	LFPDUX	B7,   Y, INCY2
	LFPDUX	A8,   X, INCX2
	LFPDUX	B8,   Y, INCY2
	bdz	LL(13)
	.align 4

LL(12):
	/* Steady state: compute the current pass while loading the */
	/* operands of the next one, then store the 8 result pairs.  */
	fpmadd	C1, ALPHA, A1, B1
	LFPDUX	A1,   X, INCX2
	LFPDUX	B1,   Y, INCY2
	fpmadd	C2, ALPHA, A2, B2
	LFPDUX	A2,   X, INCX2
	LFPDUX	B2,   Y, INCY2

	fpmadd	C3, ALPHA, A3, B3
	LFPDUX	A3,   X, INCX2
	LFPDUX	B3,   Y, INCY2
	fpmadd	C4, ALPHA, A4, B4
	LFPDUX	A4,   X, INCX2
	LFPDUX	B4,   Y, INCY2

	fpmadd	C5, ALPHA, A5, B5
	LFPDUX	A5,   X, INCX2
	LFPDUX	B5,   Y, INCY2
	fpmadd	C6, ALPHA, A6, B6
	LFPDUX	A6,   X, INCX2
	LFPDUX	B6,   Y, INCY2

	fpmadd	C7, ALPHA, A7, B7
	LFPDUX	A7,   X, INCX2
	LFPDUX	B7,   Y, INCY2
	fpmadd	C8, ALPHA, A8, B8
	LFPDUX	A8,   X, INCX2
	LFPDUX	B8,   Y, INCY2

	STFPDUX	C1,  YY, INCY2
	STFPDUX	C2,  YY, INCY2
	STFPDUX	C3,  YY, INCY2
	STFPDUX	C4,  YY, INCY2

	STFPDUX	C5,  YY, INCY2
	STFPDUX	C6,  YY, INCY2
	STFPDUX	C7,  YY, INCY2
	STFPDUX	C8,  YY, INCY2
	bdnz	LL(12)
	.align 4

LL(13):
	/* Drain: compute and store the last loaded pass, no new loads. */
	fpmadd	C1, ALPHA, A1, B1
	fpmadd	C2, ALPHA, A2, B2
	fpmadd	C3, ALPHA, A3, B3
	fpmadd	C4, ALPHA, A4, B4

	fpmadd	C5, ALPHA, A5, B5
	fpmadd	C6, ALPHA, A6, B6
	STFPDUX	C1,  YY, INCY2
	fpmadd	C7, ALPHA, A7, B7
	STFPDUX	C2,  YY, INCY2
	fpmadd	C8, ALPHA, A8, B8
	STFPDUX	C3,  YY, INCY2
	STFPDUX	C4,  YY, INCY2

	STFPDUX	C5,  YY, INCY2
	STFPDUX	C6,  YY, INCY2
	STFPDUX	C7,  YY, INCY2
	STFPDUX	C8,  YY, INCY2
	.align 4

LL(15):
	/* Remainder N & 15, handled as blocks of 8, 4, 2 and 1 elements. */
	andi.	r0,  N, 15
	beq	LL(999)

	andi.	r0,  N, 8
	beq	LL(16)

	LFPDUX	A1,   X, INCX2
	LFPDUX	B1,   Y, INCY2
	LFPDUX	A2,   X, INCX2
	LFPDUX	B2,   Y, INCY2
	LFPDUX	A3,   X, INCX2
	LFPDUX	B3,   Y, INCY2
	LFPDUX	A4,   X, INCX2
	LFPDUX	B4,   Y, INCY2

	fpmadd	C1, ALPHA, A1, B1
	fpmadd	C2, ALPHA, A2, B2
	fpmadd	C3, ALPHA, A3, B3
	fpmadd	C4, ALPHA, A4, B4

	STFPDUX	C1,  YY, INCY2
	STFPDUX	C2,  YY, INCY2
	STFPDUX	C3,  YY, INCY2
	STFPDUX	C4,  YY, INCY2
	.align 4

LL(16):
	andi.	r0,  N, 4
	beq	LL(17)

	LFPDUX	A1,   X, INCX2
	LFPDUX	B1,   Y, INCY2
	LFPDUX	A2,   X, INCX2
	LFPDUX	B2,   Y, INCY2

	fpmadd	C1, ALPHA, A1, B1
	fpmadd	C2, ALPHA, A2, B2

	STFPDUX	C1,  YY, INCY2
	STFPDUX	C2,  YY, INCY2
	.align 4

LL(17):
	andi.	r0,  N, 2
	beq	LL(18)

	LFPDUX	A1,   X, INCX2
	LFPDUX	B1,   Y, INCY2

	fpmadd	C1, ALPHA, A1, B1

	STFPDUX	C1,  YY, INCY2
	.align 4

LL(18):
	/* Last odd element: scalar load, fmadd and store.  The pointers */
	/* still carry the one-pair bias, so the step remains INCX2 and  */
	/* INCY2.                                                        */
	andi.	r0,  N, 1
	beq	LL(999)

	LFDUX	A1,   X, INCX2
	LFDUX	B1,   Y, INCY2

	fmadd	C1, ALPHA, A1, B1
	STFDUX	C1,  YY, INCY2
	b	LL(999)
	.align 4

/* X is unaligned */

LL(20):
	/* Load the first X element into A1 with a scalar load, then move */
	/* X forward by one element before applying the same one-pair     */
	/* bias as for Y.  The cross loads below therefore start one      */
	/* element later than the Y pairs.                                */
	LFD	A1,  0 * SIZE(X)
	addi	X,  X, SIZE
	sub	X,  X, INCX2
	sub	Y,  Y, INCY2
	mr	YY, Y

	/* CTR = N / 16; go straight to the remainder code if it is zero. */
	srawi.	r0, N, 4
	mtspr	CTR,  r0
	beq-	LL(25)

	/* Prime the pipeline.  X comes in through LFXDUX into A2-A9 and  */
	/* each "fsmr Ak, Ak+1" copies the secondary half of the next     */
	/* register into Ak, so that A1-A8 end up holding the pairs that  */
	/* line up with B1-B8 (see the FP2 instruction reference for the  */
	/* exact LFXDUX / fsmr semantics).  Only the first four fsmr are  */
	/* issued here; the rest are interleaved into LL(22) / LL(23).    */
	LFXDUX	A2,   X, INCX2
	LFPDUX	B1,   Y, INCY2
	LFXDUX	A3,   X, INCX2
	LFPDUX	B2,   Y, INCY2
	LFXDUX	A4,   X, INCX2
	LFPDUX	B3,   Y, INCY2
	LFXDUX	A5,   X, INCX2
	LFPDUX	B4,   Y, INCY2

	LFXDUX	A6,   X, INCX2
	LFPDUX	B5,   Y, INCY2
	LFXDUX	A7,   X, INCX2
	LFPDUX	B6,   Y, INCY2
	fsmr	A1, A2
	LFXDUX	A8,   X, INCX2
  	fsmr	A2, A3
	LFPDUX	B7,   Y, INCY2
	fsmr	A3, A4
	LFXDUX	A9,   X, INCX2
	fsmr	A4, A5
	LFPDUX	B8,   Y, INCY2
	bdz	LL(23)
	.align 4

LL(22):
	/* Steady state.  The fsmr for A5-A8 finish the pairs of the      */
	/* current pass just before each register is reloaded for the     */
	/* next one.  "fpmr A1, A9" then carries the last loaded register */
	/* over as the start of the next pass before A9 is reloaded.      */
	fpmadd	C1, ALPHA, A1, B1
	fsmr	A5, A6
	LFPDUX	B1,   Y, INCY2
	fpmadd	C2, ALPHA, A2, B2
	LFXDUX	A2,   X, INCX2
	fsmr	A6, A7
	LFPDUX	B2,   Y, INCY2
	fpmadd	C3, ALPHA, A3, B3
	LFXDUX	A3,   X, INCX2
	fsmr	A7, A8
	LFPDUX	B3,   Y, INCY2
	fpmadd	C4, ALPHA, A4, B4
	LFXDUX	A4,   X, INCX2
	fsmr	A8, A9
	LFPDUX	B4,   Y, INCY2

	fpmadd	C5, ALPHA, A5, B5
	LFXDUX	A5,   X, INCX2
	LFPDUX	B5,   Y, INCY2
	fpmadd	C6, ALPHA, A6, B6
	LFXDUX	A6,   X, INCX2
	LFPDUX	B6,   Y, INCY2

	fpmadd	C7, ALPHA, A7, B7
	LFXDUX	A7,   X, INCX2
	LFPDUX	B7,   Y, INCY2
	fpmadd	C8, ALPHA, A8, B8
	LFXDUX	A8,   X, INCX2
	LFPDUX	B8,   Y, INCY2

	fpmr	A1, A9
	LFXDUX	A9,   X, INCX2

	STFPDUX	C1,  YY, INCY2
	STFPDUX	C2,  YY, INCY2
	STFPDUX	C3,  YY, INCY2
	STFPDUX	C4,  YY, INCY2
	fsmr	A1, A2

	STFPDUX	C5,  YY, INCY2
	fsmr	A2, A3
	STFPDUX	C6,  YY, INCY2
	fsmr	A3, A4
	STFPDUX	C7,  YY, INCY2
	fsmr	A4, A5
	STFPDUX	C8,  YY, INCY2
	bdnz	LL(22)
	.align 4

LL(23):
	/* Drain the last loaded pass.  A1 is refreshed from A9 at the */
	/* end so the remainder code below can continue the chain.     */
	fpmadd	C1, ALPHA, A1, B1
	fsmr	A5, A6
	fpmadd	C2, ALPHA, A2, B2
	fsmr	A6, A7
	fpmadd	C3, ALPHA, A3, B3
	fsmr	A7, A8
	fpmadd	C4, ALPHA, A4, B4
	fsmr	A8, A9

	fpmadd	C5, ALPHA, A5, B5
	fpmadd	C6, ALPHA, A6, B6
	fpmadd	C7, ALPHA, A7, B7
	fpmadd	C8, ALPHA, A8, B8
	fpmr	A1, A9

	STFPDUX	C1,  YY, INCY2
	STFPDUX	C2,  YY, INCY2
	STFPDUX	C3,  YY, INCY2
	STFPDUX	C4,  YY, INCY2

	STFPDUX	C5,  YY, INCY2
	STFPDUX	C6,  YY, INCY2
	STFPDUX	C7,  YY, INCY2
	STFPDUX	C8,  YY, INCY2
	.align 4

LL(25):
	/* Remainder N & 15 in blocks of 8, 4, 2 and 1 elements.  Each */
	/* block ends by copying its last loaded register into A1 so   */
	/* the following block starts from the carried element.        */
	andi.	r0,  N, 15
	beq	LL(999)

	andi.	r0,  N, 8
	beq	LL(26)

	LFXDUX	A2,   X, INCX2
	LFPDUX	B1,   Y, INCY2
	LFXDUX	A3,   X, INCX2
	LFPDUX	B2,   Y, INCY2
	LFXDUX	A4,   X, INCX2
	LFPDUX	B3,   Y, INCY2
	LFXDUX	A5,   X, INCX2
	LFPDUX	B4,   Y, INCY2

	fsmr	A1, A2
	fsmr	A2, A3
	fsmr	A3, A4
	fsmr	A4, A5

	fpmadd	C1, ALPHA, A1, B1
	fpmadd	C2, ALPHA, A2, B2
	fpmadd	C3, ALPHA, A3, B3
	fpmadd	C4, ALPHA, A4, B4
	fpmr	A1, A5

	STFPDUX	C1,  YY, INCY2
	STFPDUX	C2,  YY, INCY2
	STFPDUX	C3,  YY, INCY2
	STFPDUX	C4,  YY, INCY2
	.align 4

LL(26):
	andi.	r0,  N, 4
	beq	LL(27)

	LFXDUX	A2,   X, INCX2
	LFPDUX	B1,   Y, INCY2
	LFXDUX	A3,   X, INCX2
	LFPDUX	B2,   Y, INCY2

	fsmr	A1, A2
	fsmr	A2, A3
	fpmadd	C1, ALPHA, A1, B1
	fpmadd	C2, ALPHA, A2, B2
	fpmr	A1, A3

	STFPDUX	C1,  YY, INCY2
	STFPDUX	C2,  YY, INCY2
	.align 4

LL(27):
	andi.	r0,  N, 2
	beq	LL(28)

	LFXDUX	A2,   X, INCX2
	LFPDUX	B1,   Y, INCY2

	fsmr	A1, A2
	fpmadd	C1, ALPHA, A1, B1
	fpmr	A1, A2

	STFPDUX	C1,  YY, INCY2
	.align 4

LL(28):
	/* Last odd element: A1 already holds the X value (from the   */
	/* initial LFD or carried over by fpmr), so only Y is loaded. */
	andi.	r0,  N, 1
	beq	LL(999)

	LFDUX	B1,   Y, INCY2

	fmadd	C1, ALPHA, A1, B1
	STFDUX	C1,  YY, INCY2
	b	LL(999)
	.align 4
####


LL(100):
	/* Strided path: scalar accesses stepping by INCX / INCY.  Bias */
	/* the pointers by one stride for the update-form accesses.     */
	sub	X,  X, INCX
	sub	Y,  Y, INCY
	mr	YY, Y

	/* CTR = N / 8; go straight to the remainder code if it is zero. */
	srawi.	r0, N, 3
	mtspr	CTR,  r0
	beq-	LL(115)

	/* Prime the pipeline with 8 elements from X and from Y. */
	LFDUX	A1,   X, INCX
	LFDUX	B1,   Y, INCY
	LFDUX	A2,   X, INCX
	LFDUX	B2,   Y, INCY

	LFDUX	A3,   X, INCX
	LFDUX	B3,   Y, INCY
	LFDUX	A4,   X, INCX
	LFDUX	B4,   Y, INCY

	LFDUX	A5,   X, INCX
	LFDUX	B5,   Y, INCY
	LFDUX	A6,   X, INCX
	LFDUX	B6,   Y, INCY

	LFDUX	A7,   X, INCX
	LFDUX	B7,   Y, INCY
	LFDUX	A8,   X, INCX
	LFDUX	B8,   Y, INCY
	bdz	LL(113)
	.align 4

LL(112):
	/* Steady state: compute the current 8 elements while loading */
	/* the next 8, then store the results.                        */
	fmadd	C1, ALPHA, A1, B1
	LFDUX	A1,   X, INCX
	LFDUX	B1,   Y, INCY

	fmadd	C2, ALPHA, A2, B2
	LFDUX	A2,   X, INCX
	LFDUX	B2,   Y, INCY

	fmadd	C3, ALPHA, A3, B3
	LFDUX	A3,   X, INCX
	LFDUX	B3,   Y, INCY

	fmadd	C4, ALPHA, A4, B4
	LFDUX	A4,   X, INCX
	LFDUX	B4,   Y, INCY

	fmadd	C5, ALPHA, A5, B5
	LFDUX	A5,   X, INCX
	LFDUX	B5,   Y, INCY
	fmadd	C6, ALPHA, A6, B6
	LFDUX	A6,   X, INCX
	LFDUX	B6,   Y, INCY
	fmadd	C7, ALPHA, A7, B7
	LFDUX	A7,   X, INCX
	LFDUX	B7,   Y, INCY
	fmadd	C8, ALPHA, A8, B8
	LFDUX	A8,   X, INCX
	LFDUX	B8,   Y, INCY

	STFDUX	C1,  YY, INCY
	STFDUX	C2,  YY, INCY
	STFDUX	C3,  YY, INCY
	STFDUX	C4,  YY, INCY

	STFDUX	C5,  YY, INCY
	STFDUX	C6,  YY, INCY
	STFDUX	C7,  YY, INCY
	STFDUX	C8,  YY, INCY
	bdnz	LL(112)
	.align 4

LL(113):
	/* Drain: compute and store the last loaded 8 elements. */
	fmadd	C1, ALPHA, A1, B1
	fmadd	C2, ALPHA, A2, B2
	fmadd	C3, ALPHA, A3, B3
	fmadd	C4, ALPHA, A4, B4

	fmadd	C5, ALPHA, A5, B5
	fmadd	C6, ALPHA, A6, B6
	STFDUX	C1,  YY, INCY
	fmadd	C7, ALPHA, A7, B7
	STFDUX	C2,  YY, INCY
	fmadd	C8, ALPHA, A8, B8
	STFDUX	C3,  YY, INCY

	STFDUX	C4,  YY, INCY
	STFDUX	C5,  YY, INCY
	STFDUX	C6,  YY, INCY
	STFDUX	C7,  YY, INCY
	STFDUX	C8,  YY, INCY
	.align 4

LL(115):
	/* Remainder N & 7, handled as blocks of 4, 2 and 1 elements. */
	andi.	r0,  N, 7
	beq	LL(999)
	andi.	r0,  N, 4
	beq	LL(117)

	LFDUX	A1,   X, INCX
	LFDUX	B1,   Y, INCY
	LFDUX	A2,   X, INCX
	LFDUX	B2,   Y, INCY

	LFDUX	A3,   X, INCX
	LFDUX	B3,   Y, INCY
	LFDUX	A4,   X, INCX
	LFDUX	B4,   Y, INCY

	fmadd	C1, ALPHA, A1, B1
	fmadd	C2, ALPHA, A2, B2
	fmadd	C3, ALPHA, A3, B3
	fmadd	C4, ALPHA, A4, B4

	STFDUX	C1,  YY, INCY
	STFDUX	C2,  YY, INCY
	STFDUX	C3,  YY, INCY
	STFDUX	C4,  YY, INCY
	.align 4

LL(117):
	andi.	r0,  N, 2
	beq	LL(118)

	LFDUX	A1,   X, INCX
	LFDUX	B1,   Y, INCY
	LFDUX	A2,   X, INCX
	LFDUX	B2,   Y, INCY

	fmadd	C1, ALPHA, A1, B1
	fmadd	C2, ALPHA, A2, B2

	STFDUX	C1,  YY, INCY
	STFDUX	C2,  YY, INCY
	.align 4

LL(118):
	andi.	r0,  N, 1
	beq	LL(999)

	LFDUX	A1,   X, INCX
	LFDUX	B1,   Y, INCY

	fmadd	C1, ALPHA, A1, B1
	STFDUX	C1,  YY, INCY
	.align 4

LL(999):
	/* Common exit: reload f25-f14 in the reverse of the save order. */
	/* SP is lowered by 16 first because each update-form load adds  */
	/* r10 = 16 before its access; the final addi steps past the     */
	/* last slot so SP returns to its entry value.                   */
	li	r10, 16
	subi	SP, SP, 16

	lfpdux	f25, SP, r10
	lfpdux	f24, SP, r10
	lfpdux	f23, SP, r10
	lfpdux	f22, SP, r10

	lfpdux	f21, SP, r10
	lfpdux	f20, SP, r10
	lfpdux	f19, SP, r10
	lfpdux	f18, SP, r10

	lfpdux	f17, SP, r10
	lfpdux	f16, SP, r10
	lfpdux	f15, SP, r10
	lfpdux	f14, SP, r10

	addi	SP, SP,  16
	blr

	EPILOGUE
657