/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* Integer inputs: element count, vector base address, and stride.
   INCX is rescaled to a byte stride at the start of the routine. */
#define N	r3
#define X	r6
#define INCX	r7

/* Integer scratch: INCX2 is set to INCX + INCX; X2 is the store pointer
   that follows the load pointer X in the multiply paths. */
#define INCX2	r4
#define X2	r5

/* Scale factor. */
#define ALPHA	f1

/* Load targets.  A1 also holds the zero that ALPHA is compared against
   and that the zero-fill paths store.  A2 is f16 rather than f1 because
   f1 is ALPHA. */
#define A1	f0
#define A2	f16
#define A3	f2
#define A4	f3
#define A5	f4
#define A6	f5
#define A7	f6
#define A8	f7

/* Product registers: Bk receives ALPHA * Ak before it is stored. */
#define B1	f8
#define B2	f9
#define B3	f10
#define B4	f11
#define B5	f12
#define B6	f13
#define B7	f14
#define B8	f15

/*
 * In-place vector scale:  x[i] := ALPHA * x[i]  for i = 0 .. N-1.
 *
 * Inputs (register aliases defined at the top of this file):
 *   N     (r3)  element count; nothing is touched when N <= 0
 *   X     (r6)  address of the first element
 *   INCX  (r7)  stride between elements; shifted left by BASE_SHIFT on
 *               entry (presumably log2(SIZE), turning it into bytes)
 *   ALPHA (f1)  scale factor
 *
 * Scratch: r0, r10, INCX2 (r4), X2 (r5), CTR, cr0, cr7, f0, f2-f13.
 * f14, f15 and f16 are pushed on entry and restored at LL(999).
 *
 * Code paths:
 *   LL(11)..LL(18)    stride == SIZE, ALPHA == 0 : store zeros, paired
 *   LL(50)..LL(58)    stride == SIZE, ALPHA != 0 : paired load/mul/store
 *   LL(112)..LL(118)  other stride,   ALPHA == 0 : store zeros, scalar
 *   LL(200)..LL(218)  other stride,   ALPHA != 0 : scalar load/mul/store
 *
 * The upper-case load/store mnemonics (LFPDUX, STFPDUX, LFDX, STFDX,
 * LFDUX, STFDUX) are macros from common.h; they are assumed to be the
 * precision-appropriate forms of the lower-case instructions.
 */
	PROLOGUE
	PROFCODE

	/* Push f14, f15, f16 (B7, B8, A2): each store moves SP down 16. */
	li	r10, -16

	stfpdux	f14, SP, r10
	stfpdux	f15, SP, r10
	stfpdux	f16, SP, r10

	/* Push four zero words; SP is now 64 bytes below its entry value. */
	li	r10,   0
	stwu	r10,   -4(SP)
	stwu	r10,   -4(SP)
	stwu	r10,   -4(SP)
	stwu	r10,   -4(SP)

	lfpdx	A1, SP, r10		# Zero clear
	/* Copy ALPHA's primary half into its secondary half so the paired
	   multiply scales both elements by the same factor. */
	fsmfp	ALPHA, ALPHA

	slwi	INCX,  INCX, BASE_SHIFT
	add	INCX2, INCX, INCX

	cmpwi	cr0, N, 0
	ble	LL(999)

	/* Anything other than a one-element stride takes the scalar paths. */
	cmpwi	cr0, INCX, SIZE
	bne	LL(100)

	/* Unit stride: ALPHA != 0 goes to the multiply path. */
	fcmpu	cr7, ALPHA, A1
	bne	cr7, LL(50)

	/* ---- Unit stride, ALPHA == 0: zero fill ---- */

	/* Bias X so the first update-form store lands on x[0]. */
	sub	X,  X, INCX2

	/* If X is not aligned to a pair, store one element first. */
	andi.	r0, X, 2 * SIZE - 1
	beq	LL(11)

	STFDX	A1, X, INCX2
	addi	X, X, 1 * SIZE
	addi	N, N, -1
	cmpwi	cr0, N, 0
	ble	LL(999)
	.align 4

LL(11):
	/* Main loop count: 16 elements (8 paired stores) per iteration. */
	srawi.	r0, N, 4
	mtspr	CTR,  r0
	beq-	LL(15)
	.align 4

LL(12):
	STFPDUX	A1,   X, INCX2
	STFPDUX	A1,   X, INCX2
	STFPDUX	A1,   X, INCX2
	STFPDUX	A1,   X, INCX2
	STFPDUX	A1,   X, INCX2
	STFPDUX	A1,   X, INCX2
	STFPDUX	A1,   X, INCX2
	STFPDUX	A1,   X, INCX2
	bdnz	LL(12)
	.align 4

LL(15):
	/* Tail: remaining N mod 16 elements, handled as 8 / 4 / 2 / 1. */
	andi.	r0,  N, 15
	beq	LL(999)
	andi.	r0,  N, 8
	beq	LL(16)

	STFPDUX	A1,   X, INCX2
	STFPDUX	A1,   X, INCX2
	STFPDUX	A1,   X, INCX2
	STFPDUX	A1,   X, INCX2
	.align 4

LL(16):
	andi.	r0,  N, 4
	beq	LL(17)

	STFPDUX	A1,   X, INCX2
	STFPDUX	A1,   X, INCX2
	.align 4

LL(17):
	andi.	r0,  N, 2
	beq	LL(18)

	STFPDUX	A1,   X, INCX2
	.align 4

LL(18):
	/* Last odd element: a single (non-paired) store. */
	andi.	r0,  N, 1
	beq	LL(999)
	STFDUX	A1,   X, INCX2
	b	LL(999)
	.align 4

	/* ---- Unit stride, ALPHA != 0: scale in place ---- */

LL(50):
	/* X is the load pointer, X2 the store pointer; both are biased by
	   one pair so the first update-form access lands on x[0]. */
	sub	X2, X, INCX2
	sub	X,  X, INCX2

	/* If X is not aligned to a pair, scale one element first. */
	andi.	r0, X, 2 * SIZE - 1
	beq	LL(51)

	LFDX	A1, X,  INCX2
	addi	X,  X,  1 * SIZE

	fmul	B1, ALPHA, A1
	addi	N, N, -1
	cmpwi	cr0, N, 0

	STFDX	B1, X2, INCX2
	addi	X2, X2, 1 * SIZE
	ble	LL(999)
	.align 4

LL(51):
	/* Main loop count: 16 elements per iteration. */
	srawi.	r0, N, 4
	mtspr	CTR,  r0
	beq-	LL(55)

	/* Preload the first 8 pairs; the loop body multiplies the pairs
	   already loaded while fetching the next 8. */
	LFPDUX	A1, X,  INCX2
	LFPDUX	A2, X,  INCX2
	LFPDUX	A3, X,  INCX2
	LFPDUX	A4, X,  INCX2
	LFPDUX	A5, X,  INCX2
	LFPDUX	A6, X,  INCX2
	LFPDUX	A7, X,  INCX2
	LFPDUX	A8, X,  INCX2
	bdz	LL(53)
	.align 4

LL(52):
	fpmul	B1, ALPHA, A1
	LFPDUX	A1, X,  INCX2
	fpmul	B2, ALPHA, A2
	LFPDUX	A2, X,  INCX2
	fpmul	B3, ALPHA, A3
	LFPDUX	A3, X,  INCX2
	fpmul	B4, ALPHA, A4
	LFPDUX	A4, X,  INCX2
	fpmul	B5, ALPHA, A5
	LFPDUX	A5, X,  INCX2
	fpmul	B6, ALPHA, A6
	LFPDUX	A6, X,  INCX2
	fpmul	B7, ALPHA, A7
	LFPDUX	A7, X,  INCX2
	fpmul	B8, ALPHA, A8
	LFPDUX	A8, X,  INCX2

	STFPDUX	B1, X2, INCX2
	STFPDUX	B2, X2, INCX2
	STFPDUX	B3, X2, INCX2
	STFPDUX	B4, X2, INCX2
	STFPDUX	B5, X2, INCX2
	STFPDUX	B6, X2, INCX2
	STFPDUX	B7, X2, INCX2
	STFPDUX	B8, X2, INCX2
	bdnz	LL(52)
	.align 4

LL(53):
	/* Drain: scale and store the last preloaded 8 pairs, no new loads. */
	fpmul	B1, ALPHA, A1
	fpmul	B2, ALPHA, A2
	fpmul	B3, ALPHA, A3
	fpmul	B4, ALPHA, A4
	fpmul	B5, ALPHA, A5
	fpmul	B6, ALPHA, A6
	STFPDUX	B1, X2, INCX2
	fpmul	B7, ALPHA, A7
	STFPDUX	B2, X2, INCX2
	fpmul	B8, ALPHA, A8
	STFPDUX	B3, X2, INCX2

	STFPDUX	B4, X2, INCX2
	STFPDUX	B5, X2, INCX2
	STFPDUX	B6, X2, INCX2
	STFPDUX	B7, X2, INCX2
	STFPDUX	B8, X2, INCX2
	.align 4

LL(55):
	/* Tail: remaining N mod 16 elements, handled as 8 / 4 / 2 / 1. */
	andi.	r0,  N, 15
	beq	LL(999)
	andi.	r0,  N, 8
	beq	LL(56)

	LFPDUX	A1, X,  INCX2
	LFPDUX	A2, X,  INCX2
	LFPDUX	A3, X,  INCX2
	LFPDUX	A4, X,  INCX2

	fpmul	B1, ALPHA, A1
	fpmul	B2, ALPHA, A2
	fpmul	B3, ALPHA, A3
	fpmul	B4, ALPHA, A4

	STFPDUX	B1, X2, INCX2
	STFPDUX	B2, X2, INCX2
	STFPDUX	B3, X2, INCX2
	STFPDUX	B4, X2, INCX2
	.align 4

LL(56):
	andi.	r0,  N, 4
	beq	LL(57)

	LFPDUX	A1, X,  INCX2
	LFPDUX	A2, X,  INCX2
	fpmul	B1, ALPHA, A1
	fpmul	B2, ALPHA, A2
	STFPDUX	B1, X2, INCX2
	STFPDUX	B2, X2, INCX2
	.align 4

LL(57):
	andi.	r0,  N, 2
	beq	LL(58)

	LFPDUX	A1, X,  INCX2
	fpmul	B1, ALPHA, A1
	STFPDUX	B1, X2, INCX2
	.align 4

LL(58):
	/* Last odd element: scalar load, multiply, store. */
	andi.	r0,  N, 1
	beq	LL(999)

	LFDX	A1, X,  INCX2
	fmul	B1, ALPHA, A1
	STFDX	B1, X2, INCX2
	b	LL(999)
	.align 4


	/* ---- Non-unit stride ---- */

LL(100):
	/* ALPHA != 0 goes to the strided multiply path. */
	fcmpu	cr7, ALPHA, A1
	bne	cr7, LL(200)

	/* ALPHA == 0: strided zero fill.  Bias X by one stride so the first
	   update-form store lands on x[0]. */
	sub	X,  X, INCX

	/* Main loop count: 8 elements per iteration. */
	srawi.	r0, N, 3
	mtspr	CTR,  r0
	beq-	LL(115)
	.align 4

LL(112):
	STFDUX	A1,   X, INCX
	STFDUX	A1,   X, INCX
	STFDUX	A1,   X, INCX
	STFDUX	A1,   X, INCX
	STFDUX	A1,   X, INCX
	STFDUX	A1,   X, INCX
	STFDUX	A1,   X, INCX
	STFDUX	A1,   X, INCX
	bdnz	LL(112)
	.align 4

LL(115):
	/* Tail: remaining N mod 8 elements, handled as 4 / 2 / 1. */
	andi.	r0,  N, 7
	beq	LL(999)
	andi.	r0,  N, 4
	beq	LL(117)

	STFDUX	A1,   X, INCX
	STFDUX	A1,   X, INCX
	STFDUX	A1,   X, INCX
	STFDUX	A1,   X, INCX
	.align 4

LL(117):
	andi.	r0,  N, 2
	beq	LL(118)

	STFDUX	A1,   X, INCX
	STFDUX	A1,   X, INCX
	.align 4

LL(118):
	andi.	r0,  N, 1
	beq	LL(999)
	STFDUX	A1,   X, INCX
	b	LL(999)
	.align 4

	/* ---- Non-unit stride, ALPHA != 0: scale in place ---- */

LL(200):
	/* X is the load pointer, X2 the store pointer; both are biased by
	   one stride for the update-form accesses. */
	sub	X2, X, INCX
	sub	X,  X, INCX

	/* Main loop count: 8 elements per iteration. */
	srawi.	r0, N, 3
	mtspr	CTR,  r0
	beq-	LL(215)

	/* Preload the first 8 elements; the loop body multiplies those
	   while fetching the next 8. */
	LFDUX	A1,   X, INCX
	LFDUX	A2,   X, INCX
	LFDUX	A3,   X, INCX
	LFDUX	A4,   X, INCX
	LFDUX	A5,   X, INCX
	LFDUX	A6,   X, INCX
	LFDUX	A7,   X, INCX
	LFDUX	A8,   X, INCX
	bdz	LL(213)
	.align 4

LL(212):
	fmul	B1, ALPHA, A1
	LFDUX	A1,   X, INCX
	fmul	B2, ALPHA, A2
	LFDUX	A2,   X, INCX

	fmul	B3, ALPHA, A3
	LFDUX	A3,   X, INCX
	fmul	B4, ALPHA, A4
	LFDUX	A4,   X, INCX

	fmul	B5, ALPHA, A5
	LFDUX	A5,   X, INCX
	fmul	B6, ALPHA, A6
	LFDUX	A6,   X, INCX

	fmul	B7, ALPHA, A7
	LFDUX	A7,   X, INCX
	fmul	B8, ALPHA, A8
	LFDUX	A8,   X, INCX

	STFDUX	B1,   X2, INCX
	STFDUX	B2,   X2, INCX
	STFDUX	B3,   X2, INCX
	STFDUX	B4,   X2, INCX
	STFDUX	B5,   X2, INCX
	STFDUX	B6,   X2, INCX
	STFDUX	B7,   X2, INCX
	STFDUX	B8,   X2, INCX
	bdnz	LL(212)
	.align 4

LL(213):
	/* Drain: scale and store the last preloaded 8 elements. */
	fmul	B1, ALPHA, A1
	fmul	B2, ALPHA, A2
	fmul	B3, ALPHA, A3
	fmul	B4, ALPHA, A4
	fmul	B5, ALPHA, A5

	fmul	B6, ALPHA, A6
	STFDUX	B1,   X2, INCX
	fmul	B7, ALPHA, A7
	STFDUX	B2,   X2, INCX
	fmul	B8, ALPHA, A8
	STFDUX	B3,   X2, INCX
	STFDUX	B4,   X2, INCX
	STFDUX	B5,   X2, INCX
	STFDUX	B6,   X2, INCX
	STFDUX	B7,   X2, INCX
	STFDUX	B8,   X2, INCX
	.align 4

LL(215):
	/* Tail: remaining N mod 8 elements, handled as 4 / 2 / 1. */
	andi.	r0,  N, 7
	beq	LL(999)
	andi.	r0,  N, 4
	beq	LL(217)

	LFDUX	A1,   X, INCX
	LFDUX	A2,   X, INCX
	LFDUX	A3,   X, INCX
	LFDUX	A4,   X, INCX

	fmul	B1, ALPHA, A1
	fmul	B2, ALPHA, A2
	fmul	B3, ALPHA, A3
	fmul	B4, ALPHA, A4

	STFDUX	B1,   X2, INCX
	STFDUX	B2,   X2, INCX
	STFDUX	B3,   X2, INCX
	STFDUX	B4,   X2, INCX
	.align 4

LL(217):
	andi.	r0,  N, 2
	beq	LL(218)

	LFDUX	A1,   X, INCX
	LFDUX	A2,   X, INCX

	fmul	B1, ALPHA, A1
	fmul	B2, ALPHA, A2

	STFDUX	B1,   X2, INCX
	STFDUX	B2,   X2, INCX
	.align 4

LL(218):
	andi.	r0,  N, 1
	beq	LL(999)

	LFDUX	A1,   X, INCX
	fmul	B1, ALPHA, A1
	STFDUX	B1,   X2, INCX
	.align 4

LL(999):
	/* Common exit.  SP points at the 16-byte zero block; each load
	   first advances SP by 16, so the saved registers come back in
	   reverse push order. */
	li	r10, 16

	lfpdux	f16, SP, r10
	lfpdux	f15, SP, r10
	lfpdux	f14, SP, r10

	/* Step over the f14 slot: SP is back at its entry value. */
	addi	SP, SP,  16
	blr

	EPILOGUE
