/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/*
 * Vector copy kernel:  y[i] <- x[i]  for i = 0 .. N-1, where every logical
 * element is a pair of consecutive SIZE-byte values.  Below, lo(k) is the
 * value of element k at the lower address and hi(k) the value SIZE bytes
 * above it.  (Presumably lo/hi are the real and imaginary parts of a
 * complex number -- the pair layout is what the code shows, the "complex"
 * reading is an inference to be confirmed against the build rules.)
 *
 * Inputs (read on entry, never initialised here):
 *   N    (r3)  element count; N <= 0 returns without touching X or Y
 *   X    (r4)  source address
 *   INCX (r5)  source stride, counted in elements
 *   Y    (r6)  destination address
 *   INCY (r7)  destination stride, counted in elements
 *
 * No return value is set up; N (r3) is decremented on two of the paths.
 * Registers written: r0, r3 - r11, CTR, cr0, f0 - f15.  f14 and f15 are
 * saved below SP on entry and restored before returning.
 *
 * SIZE and BASE_SHIFT come from common.h; the code assumes
 * SIZE == 1 << BASE_SHIFT is the byte size of one value -- TODO confirm.
 *
 * Paired-FPU operations used below.  Each FP register is treated as a
 * (primary, secondary) pair.  The upper-case load/store names are macros
 * from common.h (presumably precision-dependent spellings of the
 * lower-case instructions).  The semantics listed are the ones this code
 * relies on -- confirm against the ISA reference before modifying:
 *   LFPDUX  f, b, i   b += i;  f.primary   <- [b],  f.secondary <- [b + SIZE]
 *   LFXDUX  f, b, i   b += i;  f.secondary <- [b],  f.primary   <- [b + SIZE]
 *   STFPDUX f, b, i   b += i;  [b] <- f.primary,    [b + SIZE] <- f.secondary
 *   LFDUX / STFDUX    b += i;  primary half only
 *   LFDX  / STFDX     primary half only, at b + i, b unchanged
 *   STFSDX            secondary half only, at b + i, b unchanged
 *   fpmr d, s         d <- s (both halves)
 *   fsmr d, s         d.secondary <- s.secondary (d.primary kept)
 *   fxmr d, s         d.primary <- s.secondary,  d.secondary <- s.primary
 */

/* Inputs. */
#define N	r3
#define X	r4
#define INCX	r5
#define Y	r6
#define INCY	r7

/* Derived strides (bytes per element) and second-value pointers. */
#define INCX2	r8
#define INCY2	r9
#define X2	r10
#define Y2	r11

/* Data registers loaded straight from X. */
#define A1	f0
#define A2	f1
#define A3	f2
#define A4	f3
#define A5	f4
#define A6	f5
#define A7	f6
#define A8	f7
#define A9	f8

/* Staging registers for the re-paired data on the misaligned paths. */
#define T1	f9
#define T2	f10
#define T3	f11
#define T4	f12
#define T5	f13
#define T6	f14
#define T7	f15

	PROLOGUE
	PROFCODE

	/* Save f14 / f15 (T6 / T7) in two 16-byte slots below SP.  r10 is
	   only a scratch offset here; it becomes X2 later, in LL(100). */
	li	r10, -16

	stfpdux	f14, SP, r10
	stfpdux	f15, SP, r10

	/* INCX / INCY  -> stride scaled by the size of ONE value,
	   INCX2 / INCY2 -> twice that, i.e. the byte stride between elements. */
	slwi	INCX,  INCX, BASE_SHIFT
	slwi	INCY,  INCY, BASE_SHIFT
	add	INCX2, INCX, INCX
	add	INCY2, INCY, INCY

	cmpwi	cr0, N, 0
	ble	LL(999)			/* nothing to copy */

	/* Every access below is an "add the stride, then access" form, so
	   bias both pointers back by one element stride. */
	sub	X, X, INCX2
	sub	Y, Y, INCY2

	/* Anything but unit stride on both sides takes the general path. */
	cmpwi	cr0, INCX, SIZE
	bne	LL(100)
	cmpwi	cr0, INCY, SIZE
	bne	LL(100)

	/* Unit stride: choose a path by whether X and Y lie on a 2 * SIZE
	   boundary (the footprint of one paired load / store). */
	andi.	r0, X, 2 * SIZE - 1
	bne	LL(30)
	andi.	r0, Y, 2 * SIZE - 1
	bne	LL(20)
	.align 4

LL(10):  /* X : aligned     Y : aligned */
	/* Main loop moves 8 elements per trip, CTR = N / 8.  The loads of
	   the next batch are interleaved with the stores of the current one;
	   LL(13) drains the last batch. */
	srawi.	r0, N, 3
	mtspr	CTR,  r0
	beq-	LL(15)

	LFPDUX	A1,   X, INCX2
	LFPDUX	A2,   X, INCX2
	LFPDUX	A3,   X, INCX2
	LFPDUX	A4,   X, INCX2
	LFPDUX	A5,   X, INCX2
	LFPDUX	A6,   X, INCX2
	LFPDUX	A7,   X, INCX2
	LFPDUX	A8,   X, INCX2
	bdz	LL(13)
	.align 4

LL(12):
	STFPDUX	A1,   Y, INCY2
	LFPDUX	A1,   X, INCX2
	STFPDUX	A2,   Y, INCY2
	LFPDUX	A2,   X, INCX2
	STFPDUX	A3,   Y, INCY2
	LFPDUX	A3,   X, INCX2
	STFPDUX	A4,   Y, INCY2
	LFPDUX	A4,   X, INCX2

	STFPDUX	A5,   Y, INCY2
	LFPDUX	A5,   X, INCX2
	STFPDUX	A6,   Y, INCY2
	LFPDUX	A6,   X, INCX2
	STFPDUX	A7,   Y, INCY2
	LFPDUX	A7,   X, INCX2
	STFPDUX	A8,   Y, INCY2
	LFPDUX	A8,   X, INCX2
	bdnz	LL(12)
	.align 4

LL(13):
	STFPDUX	A1,   Y, INCY2
	STFPDUX	A2,   Y, INCY2
	STFPDUX	A3,   Y, INCY2
	STFPDUX	A4,   Y, INCY2
	STFPDUX	A5,   Y, INCY2
	STFPDUX	A6,   Y, INCY2
	STFPDUX	A7,   Y, INCY2
	STFPDUX	A8,   Y, INCY2
	.align 4

LL(15):
	/* Remainder N & 7, handled as 4 + 2 + 1 elements. */
	andi.	r0,  N, 7
	beq	LL(999)

	andi.	r0,  N, 4
	beq	LL(16)

	LFPDUX	A1,    X, INCX2
	LFPDUX	A2,    X, INCX2
	LFPDUX	A3,    X, INCX2
	LFPDUX	A4,    X, INCX2

	STFPDUX	A1,    Y, INCY2
	STFPDUX	A2,    Y, INCY2
	STFPDUX	A3,    Y, INCY2
	STFPDUX	A4,    Y, INCY2
	.align 4

LL(16):
	andi.	r0,  N, 2
	beq	LL(17)

	LFPDUX	A1,    X, INCX2
	LFPDUX	A2,    X, INCX2

	STFPDUX	A1,    Y, INCY2
	STFPDUX	A2,    Y, INCY2
	.align 4

LL(17):
	andi.	r0,  N, 1
	beq	LL(999)

	LFPDUX	A1,    X, INCX2
	STFPDUX	A1,    Y, INCY2
	b  LL(999)
	.align 4

LL(20):  /* X : aligned     Y : unaligned */
	/* Y is one value off a pair boundary.  lo(0) is stored on its own,
	   Y is then advanced by one value so that paired stores land on a
	   boundary, and every pair stored afterwards is (hi(k), lo(k+1)).
	   The still-unstored hi value is carried in a primary half and the
	   final one is written alone at LL(29). */

	LFXDUX	A1, X, INCX2		/* A1 = (hi(0), lo(0)) */
	addi	N, N, -1		/* N = elements after the first */
	cmpwi	cr0, N, 0
	STFSDX	A1, Y, INCY2		/* y: lo(0) */
	add	Y, Y, INCY
	ble	LL(29)			/* single element: only hi(0) is left */
	.align 4

	srawi.	r0, N, 3
	mtspr	CTR,  r0
	beq-	LL(25)

	/* Prime the pipeline with 8 elements: four crossed (hi, lo) loads
	   and four plain (lo, hi) loads that are crossed later by fxmr. */
	LFXDUX	T1,   X, INCX2
	LFXDUX	T2,   X, INCX2
	LFXDUX	T3,   X, INCX2
	LFXDUX	T4,   X, INCX2

	LFPDUX	A6,   X, INCX2
	fsmr	A1, T1			/* A1 = (hi(k), lo(k+1)) */
	LFPDUX	A7,   X, INCX2
	fsmr	T1, T2
	LFPDUX	A8,   X, INCX2
	fsmr	T2, T3
	LFPDUX	A9,   X, INCX2
	fsmr	T3, T4			/* T4 keeps its hi in primary, pending */
	bdz	LL(23)
	.align 4

LL(22):
	/* On entry: A1, T1, T2, T3 are ready to store, T4.primary holds a
	   pending hi, and A6..A9 hold the next four elements as loaded.
	   The same state is re-established before bdnz. */
	STFPDUX	A1,   Y, INCY2
	fxmr	T5, A6
	STFPDUX	T1,   Y, INCY2
	fxmr	T6, A7
	STFPDUX	T2,   Y, INCY2
	fxmr	T7, A8
	STFPDUX	T3,   Y, INCY2
	fxmr	A1, A9

	fsmr	T4, T5
	LFPDUX	A2,   X, INCX2
	fsmr	T5, T6
	LFPDUX	A3,   X, INCX2
	fsmr	T6, T7
	LFPDUX	A4,   X, INCX2
	fsmr	T7, A1
	LFPDUX	A5,   X, INCX2

	STFPDUX	T4,   Y, INCY2
	fxmr	T1, A2
	STFPDUX	T5,   Y, INCY2
	fxmr	T2, A3
	STFPDUX	T6,   Y, INCY2
	fxmr	T3, A4
	STFPDUX	T7,   Y, INCY2
	fxmr	T4, A5

	LFPDUX	A6,   X, INCX2
	fsmr	A1, T1
	LFPDUX	A7,   X, INCX2
	fsmr	T1, T2
	LFPDUX	A8,   X, INCX2
	fsmr	T2, T3
	LFPDUX	A9,   X, INCX2
	fsmr	T3, T4
	bdnz	LL(22)
	.align 4

LL(23):
	/* Drain: store the last 8 pairs without loading any more. */
	STFPDUX	A1,   Y, INCY2
	fxmr	T5, A6
	STFPDUX	T1,   Y, INCY2
	fxmr	T6, A7
	STFPDUX	T2,   Y, INCY2
	fxmr	T7, A8
	STFPDUX	T3,   Y, INCY2
	fxmr	A1, A9			/* A1.primary = hi carried onwards */

	fsmr	T4, T5
	fsmr	T5, T6
	fsmr	T6, T7
	fsmr	T7, A1

	STFPDUX	T4,   Y, INCY2
	STFPDUX	T5,   Y, INCY2
	STFPDUX	T6,   Y, INCY2
	STFPDUX	T7,   Y, INCY2
	.align 4

LL(25):
	/* Remainder (N & 7) as 4 + 2 + 1.  Each step ends by moving the last
	   loaded register into A1 so that A1.primary is the pending hi. */
	andi.	r0,  N, 7
	beq	LL(29)

	andi.	r0,  N, 4
	beq	LL(26)

	LFXDUX	A2,    X, INCX2
	LFXDUX	A3,    X, INCX2
	LFXDUX	A4,    X, INCX2
	LFXDUX	A5,    X, INCX2

	fsmr	A1, A2
	fsmr	A2, A3
	fsmr	A3, A4
	fsmr	A4, A5

	STFPDUX	A1,    Y, INCY2
	STFPDUX	A2,    Y, INCY2
	STFPDUX	A3,    Y, INCY2
	STFPDUX	A4,    Y, INCY2
	fpmr	A1, A5
	.align 4

LL(26):
	andi.	r0,  N, 2
	beq	LL(27)

	LFXDUX	A2,    X, INCX2
	LFXDUX	A3,    X, INCX2
	fsmr	A1, A2
	fsmr	A2, A3
	STFPDUX	A1,    Y, INCY2
	STFPDUX	A2,    Y, INCY2
	fpmr	A1, A3
	.align 4

LL(27):
	andi.	r0,  N, 1
	beq	LL(29)

	LFXDUX	A2,    X, INCX2
	fsmr	A1, A2
	STFPDUX	A1,    Y, INCY2
	fpmr	A1, A2
	.align 4

LL(29):
	STFDUX	A1,    Y, INCY2		/* y: hi of the last element */
	b  LL(999)
	.align 4

LL(30):  /* X : unaligned   Y : aligned */
	andi.	r0, Y, 2 * SIZE - 1
	bne	LL(40)

	/* X is one value off a pair boundary.  lo(0) is loaded on its own,
	   X is advanced by one value, and every paired load from then on
	   fetches (hi(k), lo(k+1)) in memory order.  Crossed loads / fxmr put
	   lo(k+1) in the primary half, and fsmr pulls hi(k) out of the
	   following register to complete (lo(k), hi(k)) for the store.
	   NOTE(review): N paired loads are issued although only 2N - 1 values
	   remain after the first LFDX, so the last paired load also fetches
	   the value that follows hi(N-1).  That value is never stored. */
	LFDX	A1, X, INCX2		/* A1.primary = lo(0) */
	add	X, X, INCX

	srawi.	r0, N, 3
	mtspr	CTR,  r0
	beq-	LL(35)

	LFXDUX	T1,   X, INCX2
	LFXDUX	T2,   X, INCX2
	LFXDUX	T3,   X, INCX2
	LFXDUX	T4,   X, INCX2

	LFPDUX	A6,   X, INCX2
	fsmr	A1, T1			/* A1 = (lo(k), hi(k)) */
	LFPDUX	A7,   X, INCX2
	fsmr	T1, T2
	LFPDUX	A8,   X, INCX2
	fsmr	T2, T3
	LFPDUX	A9,   X, INCX2
	fsmr	T3, T4			/* T4 keeps its lo in primary, pending */
	bdz	LL(33)
	.align 4

LL(32):
	/* Same pipeline shape as LL(22): A1, T1, T2, T3 are ready to store,
	   T4.primary holds a pending lo, A6..A9 hold the next four loads. */
	fxmr	T5, A6
	STFPDUX	A1,   Y, INCY2
	fxmr	T6, A7
	STFPDUX	T1,   Y, INCY2
	fxmr	T7, A8
	STFPDUX	T2,   Y, INCY2
	fxmr	A1, A9
	STFPDUX	T3,   Y, INCY2

	LFPDUX	A2,   X, INCX2
	fsmr	T4, T5
	LFPDUX	A3,   X, INCX2
	fsmr	T5, T6
	LFPDUX	A4,   X, INCX2
	fsmr	T6, T7
	LFPDUX	A5,   X, INCX2
	fsmr	T7, A1

	fxmr	T1, A2
	STFPDUX	T4,   Y, INCY2
	fxmr	T2, A3
	STFPDUX	T5,   Y, INCY2
	fxmr	T3, A4
	STFPDUX	T6,   Y, INCY2
	fxmr	T4, A5
	STFPDUX	T7,   Y, INCY2

	fsmr	A1, T1
	LFPDUX	A6,   X, INCX2
	fsmr	T1, T2
	LFPDUX	A7,   X, INCX2
	fsmr	T2, T3
	LFPDUX	A8,   X, INCX2
	fsmr	T3, T4
	LFPDUX	A9,   X, INCX2
	bdnz	LL(32)
	.align 4

LL(33):
	/* Drain: store the last 8 pairs without loading any more. */
	STFPDUX	A1,   Y, INCY2
	fxmr	T5, A6
	STFPDUX	T1,   Y, INCY2
	fxmr	T6, A7
	STFPDUX	T2,   Y, INCY2
	fxmr	T7, A8
	STFPDUX	T3,   Y, INCY2
	fxmr	A1, A9			/* A1.primary = lo carried onwards */

	fsmr	T4, T5
	fsmr	T5, T6
	fsmr	T6, T7
	fsmr	T7, A1

	STFPDUX	T4,   Y, INCY2
	STFPDUX	T5,   Y, INCY2
	STFPDUX	T6,   Y, INCY2
	STFPDUX	T7,   Y, INCY2
	.align 4

LL(35):
	/* Remainder (N & 7) as 4 + 2 + 1; A1.primary carries the pending lo.
	   Nothing is left to store once the remainder is done. */
	andi.	r0,  N, 7
	beq	LL(999)

	andi.	r0,  N, 4
	beq	LL(36)

	LFXDUX	A2,    X, INCX2
	LFXDUX	A3,    X, INCX2
	LFXDUX	A4,    X, INCX2
	LFXDUX	A5,    X, INCX2

	fsmr	A1, A2
	fsmr	A2, A3
	fsmr	A3, A4
	fsmr	A4, A5

	STFPDUX	A1,    Y, INCY2
	STFPDUX	A2,    Y, INCY2
	STFPDUX	A3,    Y, INCY2
	STFPDUX	A4,    Y, INCY2
	fpmr	A1, A5
	.align 4

LL(36):
	andi.	r0,  N, 2
	beq	LL(37)

	LFXDUX	A2,    X, INCX2
	LFXDUX	A3,    X, INCX2
	fsmr	A1, A2
	fsmr	A2, A3
	STFPDUX	A1,    Y, INCY2
	STFPDUX	A2,    Y, INCY2
	fpmr	A1, A3
	.align 4

LL(37):
	andi.	r0,  N, 1
	beq	LL(999)

	LFXDUX	A2,    X, INCX2
	fsmr	A1, A2
	STFPDUX	A1,    Y, INCY2
	b  LL(999)
	.align 4

LL(40):  /* X : unaligned   Y : unaligned */
	/* Both sides are one value off a pair boundary.  Copy lo(0) alone,
	   advance both pointers by one value, copy N - 1 boundary-aligned
	   pairs (hi(k), lo(k+1)) unchanged, then copy the last hi alone at
	   LL(49). */

	LFDX	A1, X, INCX2
	add	X, X, INCX

	addi	N, N, -1		/* N = number of aligned pairs to copy */
	cmpwi	cr0, N, 0
	STFDX	A1, Y, INCY2		/* y: lo(0) */
	add	Y, Y, INCY
	ble	LL(49)

	srawi.	r0, N, 3
	mtspr	CTR,  r0
	beq-	LL(45)

	LFPDUX	A1,   X, INCX2
	LFPDUX	A2,   X, INCX2
	LFPDUX	A3,   X, INCX2
	LFPDUX	A4,   X, INCX2
	LFPDUX	A5,   X, INCX2
	LFPDUX	A6,   X, INCX2
	LFPDUX	A7,   X, INCX2
	LFPDUX	A8,   X, INCX2
	bdz	LL(43)
	.align 4

LL(42):
	STFPDUX	A1,   Y, INCY2
	LFPDUX	A1,   X, INCX2
	STFPDUX	A2,   Y, INCY2
	LFPDUX	A2,   X, INCX2
	STFPDUX	A3,   Y, INCY2
	LFPDUX	A3,   X, INCX2
	STFPDUX	A4,   Y, INCY2
	LFPDUX	A4,   X, INCX2

	STFPDUX	A5,   Y, INCY2
	LFPDUX	A5,   X, INCX2
	STFPDUX	A6,   Y, INCY2
	LFPDUX	A6,   X, INCX2
	STFPDUX	A7,   Y, INCY2
	LFPDUX	A7,   X, INCX2
	STFPDUX	A8,   Y, INCY2
	LFPDUX	A8,   X, INCX2
	bdnz	LL(42)
	.align 4

LL(43):
	STFPDUX	A1,   Y, INCY2
	STFPDUX	A2,   Y, INCY2
	STFPDUX	A3,   Y, INCY2
	STFPDUX	A4,   Y, INCY2
	STFPDUX	A5,   Y, INCY2
	STFPDUX	A6,   Y, INCY2
	STFPDUX	A7,   Y, INCY2
	STFPDUX	A8,   Y, INCY2
	.align 4

LL(45):
	/* Remainder (N & 7) pairs as 4 + 2 + 1, then the trailing value. */
	andi.	r0,  N, 7
	beq	LL(49)

	andi.	r0,  N, 4
	beq	LL(46)

	LFPDUX	A1,    X, INCX2
	LFPDUX	A2,    X, INCX2
	LFPDUX	A3,    X, INCX2
	LFPDUX	A4,    X, INCX2

	STFPDUX	A1,    Y, INCY2
	STFPDUX	A2,    Y, INCY2
	STFPDUX	A3,    Y, INCY2
	STFPDUX	A4,    Y, INCY2
	.align 4

LL(46):
	andi.	r0,  N, 2
	beq	LL(47)

	LFPDUX	A1,    X, INCX2
	LFPDUX	A2,    X, INCX2

	STFPDUX	A1,    Y, INCY2
	STFPDUX	A2,    Y, INCY2
	.align 4

LL(47):
	andi.	r0,  N, 1
	beq	LL(49)

	LFPDUX	A1,    X, INCX2
	STFPDUX	A1,    Y, INCY2

LL(49):
	/* Trailing single value: hi of the last element. */
	LFDUX	A1,    X, INCX2
	STFDUX	A1,    Y, INCY2
	b  LL(999)
	.align 4

LL(100):
	/* General strides: scalar copies only.  X / Y step through the lo
	   values and X2 / Y2 (one value higher) through the hi values; all
	   four advance by a full element stride per access.  The main loop
	   moves 4 elements (8 values) per trip, CTR = N / 4. */
	addi	X2, X, SIZE
	addi	Y2, Y, SIZE

	srawi.	r0, N, 2
	mtspr	CTR,  r0
	beq-	LL(115)

	LFDUX	A1,   X,  INCX2
	LFDUX	A2,   X2, INCX2
	LFDUX	A3,   X,  INCX2
	LFDUX	A4,   X2, INCX2
	LFDUX	A5,   X,  INCX2
	LFDUX	A6,   X2, INCX2
	LFDUX	A7,   X,  INCX2
	LFDUX	A8,   X2, INCX2
	bdz	LL(113)
	.align 4

LL(112):
	STFDUX	A1,   Y,  INCY2
	LFDUX	A1,   X,  INCX2
	STFDUX	A2,   Y2, INCY2
	LFDUX	A2,   X2, INCX2
	STFDUX	A3,   Y,  INCY2
	LFDUX	A3,   X,  INCX2
	STFDUX	A4,   Y2, INCY2
	LFDUX	A4,   X2, INCX2

	STFDUX	A5,   Y,  INCY2
	LFDUX	A5,   X,  INCX2
	STFDUX	A6,   Y2, INCY2
	LFDUX	A6,   X2, INCX2
	STFDUX	A7,   Y,  INCY2
	LFDUX	A7,   X,  INCX2
	STFDUX	A8,   Y2, INCY2
	LFDUX	A8,   X2, INCX2
	bdnz	LL(112)
	.align 4

LL(113):
	STFDUX	A1,   Y,  INCY2
	STFDUX	A2,   Y2, INCY2
	STFDUX	A3,   Y,  INCY2
	STFDUX	A4,   Y2, INCY2
	STFDUX	A5,   Y,  INCY2
	STFDUX	A6,   Y2, INCY2
	STFDUX	A7,   Y,  INCY2
	STFDUX	A8,   Y2, INCY2
	.align 4

LL(115):
	/* Remainder N & 3, handled as 2 + 1 elements. */
	andi.	r0,  N, 3
	beq	LL(999)
	andi.	r0,  N, 2
	beq	LL(117)

	LFDUX	A1,    X,  INCX2
	LFDUX	A2,    X2, INCX2
	LFDUX	A3,    X,  INCX2
	LFDUX	A4,    X2, INCX2

	STFDUX	A1,    Y,  INCY2
	STFDUX	A2,    Y2, INCY2
	STFDUX	A3,    Y,  INCY2
	STFDUX	A4,    Y2, INCY2
	.align 4

LL(117):
	andi.	r0,  N, 1
	beq	LL(999)

	LFDUX	A1,    X,  INCX2
	LFDUX	A2,    X2, INCX2

	STFDUX	A1,    Y,  INCY2
	STFDUX	A2,    Y2, INCY2
	.align 4

LL(999):
	/* Common exit: restore f15 / f14 and put SP back.  The restoring
	   loads add 16 to SP before accessing memory, so SP is first moved
	   one slot down; the closing addi releases the second slot. */
	li	r10, 16
	addi	SP, SP,  -16

	lfpdux	f15, SP, r10
	lfpdux	f14, SP, r10

	addi	SP, SP,  16
	blr

	EPILOGUE