/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/* ASSEMBLER is defined before common.h is pulled in; presumably it   */
/* selects the assembler-side definitions of that header (confirm).   */
#define ASSEMBLER
#include "common.h"

/* Arguments as they arrive in integer registers. */
#define N	r3	/* element count                              */
#define X	r4	/* pointer to the vector                      */
#define INCX	r5	/* stride; scaled to bytes inside the routine */

/* Derived addressing registers. */
#define INCX2	r6	/* set to INCX + INCX (two strides, in bytes) */
#define X2	r7	/* second running pointer, strided path only  */

/* Partial-sum accumulators. C1 is f1 and holds the final result. */
#define C1	f1
#define C2	f0
#define C3	f2
#define C4	f3

/* Registers that receive the values loaded from the vector. */
#define A1	f4
#define A2	f5
#define A3	f6
#define A4	f7
#define A5	f8
#define A6	f9
#define A7	f10
#define A8	f11

/* Absolute values waiting to be added into C1..C4.             */
/* T3/T4 are f14/f15, which the routine saves on entry and      */
/* restores before returning.                                   */
#define T1	f12
#define T2	f13
#define T3	f14
#define T4	f15

/*
 * Sum of absolute values of a strided vector, written for the paired
 * ("double hummer") FPU: the fp*, lfp* and stfp* instructions below
 * work on a primary/secondary pair of doubles per register.
 *
 * Inputs (register aliases are #defined at the top of this file):
 *   N     element count
 *   X     pointer to the first element
 *   INCX  stride in elements; shifted left by BASE_SHIFT to get bytes
 *   When F_INTERFACE is defined, N and INCX hold addresses and the
 *   values are fetched through LDINT first.
 *
 * Result: the sum is left in C1 (f1). It is 0 when N <= 0 or when
 * INCX <= 0.
 *
 * Stack: 48 bytes are pushed below SP on entry (f14 pair, f15 pair,
 * 16 bytes of zero) and released again at LL(999).
 *
 * Layout:
 *   LL(05)..LL(18)    INCX == SIZE: contiguous data, paired loads
 *   LL(100)..LL(118)  any other positive stride: scalar loads through
 *                     two interleaved pointers X and X2
 *   LL(999)           final reduction, register restore, return
 *
 * NOTE(review): PROLOGUE, PROFCODE, EPILOGUE, LL(), SP, SIZE,
 * BASE_SHIFT, LDINT and the upper-case load mnemonics (LFD, LFDX,
 * LFDUX, LFPDUX, LFSDUX) are not defined in this file; presumably
 * they come from common.h - confirm their expansions there.
 *
 * The instruction order and the nop padding inside the loops are
 * kept exactly as written; they look hand-scheduled.
 */
	PROLOGUE
	PROFCODE

	/* Save f14/f15 (used as T3/T4). Each store moves SP down by 16. */
	li	r10, -16

	stfpdux	f14, SP, r10
	stfpdux	f15, SP, r10

	/* Push four zero words: a 16-byte block of zeros at SP. */
	li	r10,   0
	stwu	r10,   -4(SP)
	stwu	r10,   -4(SP)
	stwu	r10,   -4(SP)
	stwu	r10,   -4(SP)

#ifdef F_INTERFACE
	LDINT	N,    0(N)
	LDINT	INCX, 0(INCX)
#endif

	lfpdx	C1, SP, r10		# Zero clear

	/* INCX becomes a byte stride; INCX2 is two strides. */
	slwi	INCX,  INCX, BASE_SHIFT
	add	INCX2, INCX, INCX

	/* Clear the remaining accumulators from C1. */
	fpmr	C2, C1
	fpmr	C3, C1
	fpmr	C4, C1

	/* Nothing to sum for N <= 0 or INCX <= 0: return the zero in C1. */
	cmpwi	cr0, N, 0
	ble	LL(999)
	cmpwi	cr0, INCX, 0
	ble	LL(999)

	/* Only a stride of exactly one element takes the paired-load path. */
	cmpwi	cr0, INCX, SIZE
	bne	LL(100)

	/* If X is not a multiple of 2 * SIZE, take one element with a   */
	/* scalar load so that the paired loads below start on a         */
	/* 2 * SIZE boundary.                                            */
	andi.	r0, X, 2 * SIZE - 1
	beq	LL(05)

	LFD	C1, 0(X)
	addi	X, X, 1 * SIZE
	addi	N, N, -1
	cmpwi	cr0, N, 0
	fabs	C1, C1
	ble	LL(999)
	.align 4

/* ---- Contiguous path: 16 elements (8 pairs) per iteration ---- */
LL(05):
	srawi.	r0, N, 4		/* r0 = N / 16 = number of full blocks */
	sub	X, X, INCX2		/* pre-bias X for the update-form loads */
	mtspr	CTR,  r0
	beq-	LL(15)

	/* Preload the first block. T1..T4 are cleared from C2, which is */
	/* still zero here (C1 may already hold the head element).       */
	LFPDUX	A1,   X, INCX2
	fpmr	T1, C2
	LFPDUX	A2,   X, INCX2
	fpmr	T2, C2
	LFPDUX	A3,   X, INCX2
	fpmr	T3, C2
	LFPDUX	A4,   X, INCX2
	fpmr	T4, C2
	LFPDUX	A5,   X, INCX2
	LFPDUX	A6,   X, INCX2
	LFPDUX	A7,   X, INCX2
	LFPDUX	A8,   X, INCX2
	bdz	LL(13)			/* single block: nothing more to load */
	.align 4

	/* Steady state: each group adds the |x| pair produced by the    */
	/* previous group into its accumulator, takes fpabs of the pair  */
	/* already loaded, and loads the matching pair of the next block.*/
LL(12):
	fpadd	C1, C1, T1
	nop
	fpabs	T1, A1
	LFPDUX	A1,   X, INCX2

	fpadd	C2, C2, T2
	nop
	fpabs	T2, A2
	LFPDUX	A2,   X, INCX2

	fpadd	C3, C3, T3
	nop
	fpabs	T3, A3
	LFPDUX	A3,   X, INCX2

	fpadd	C4, C4, T4
	nop
	fpabs	T4, A4
	LFPDUX	A4,   X, INCX2

	fpadd	C1, C1, T1
	nop
	fpabs	T1, A5
	LFPDUX	A5,   X, INCX2

	fpadd	C2, C2, T2
	nop
	fpabs	T2, A6
	LFPDUX	A6,   X, INCX2

	fpadd	C3, C3, T3
	nop
	fpabs	T3, A7
	LFPDUX	A7,   X, INCX2

	fpadd	C4, C4, T4
	fpabs	T4, A8
	LFPDUX	A8,   X, INCX2
	bdnz	LL(12)
	.align 4

	/* Drain: the last block is already in A1..A8, so no more loads. */
LL(13):
	fpadd	C1, C1, T1
	fpabs	T1, A1
	fpadd	C2, C2, T2
	fpabs	T2, A2
	fpadd	C3, C3, T3
	fpabs	T3, A3
	fpadd	C4, C4, T4
	fpabs	T4, A4

	fpadd	C1, C1, T1
	fpabs	T1, A5
	fpadd	C2, C2, T2
	fpabs	T2, A6
	fpadd	C3, C3, T3
	fpabs	T3, A7
	fpadd	C4, C4, T4
	fpabs	T4, A8

	fpadd	C1, C1, T1
	fpadd	C2, C2, T2
	fpadd	C3, C3, T3
	fpadd	C4, C4, T4
	.align 4

	/* Remainder of N mod 16, handled by testing bits 8, 4, 2 and 1. */
LL(15):
	andi.	r0,  N, 15
	beq	LL(999)
	andi.	r0,  N, 8
	beq	LL(16)

	/* 8 elements = 4 pairs */
	LFPDUX	A1,    X, INCX2
	LFPDUX	A2,    X, INCX2
	LFPDUX	A3,    X, INCX2
	LFPDUX	A4,    X, INCX2

	fpabs	T1, A1
	fpabs	T2, A2
	fpabs	T3, A3
	fpabs	T4, A4

	fpadd	C1, C1, T1
	fpadd	C2, C2, T2
	fpadd	C3, C3, T3
	fpadd	C4, C4, T4
	.align 4

LL(16):
	andi.	r0,  N, 4
	beq	LL(17)

	/* 4 elements = 2 pairs */
	LFPDUX	A1,    X, INCX2
	LFPDUX	A2,    X, INCX2
	fpabs	T1, A1
	fpabs	T2, A2

	fpadd	C1, C1, T1
	fpadd	C2, C2, T2
	.align 4

LL(17):
	andi.	r0,  N, 2
	beq	LL(18)

	/* 2 elements = 1 pair */
	LFPDUX	A1,    X, INCX2
	fpabs	T1, A1
	fpadd	C1, C1, T1
	.align 4

LL(18):
	andi.	r0,  N, 1
	beq	LL(999)

	/* Last single element: scalar load at X + INCX2, no update. */
	LFDX	A1,    X, INCX2
	fabs	T1, A1
	fadd	C1, C1, T1
	b LL(999)
	.align 4

/* ---- Strided path: X and X2 each advance by INCX2, one INCX apart ---- */
LL(100):
	/* After pre-biasing, the first update-form load through X hits  */
	/* element 0 and the first through X2 hits element 1.            */
	sub	X2, X, INCX
	sub	X,  X, INCX2

	srawi.	r0, N, 4		/* r0 = N / 16 = number of full blocks */
	mtspr	CTR,  r0
	beq-	LL(115)


	/* Preload: LFDUX through X and LFSDUX through X2 target the     */
	/* same A register, so each A ends up holding two neighbouring   */
	/* elements (presumably primary/secondary halves - confirm the   */
	/* macro expansions). A5..A8 get their X2 halves later.          */
	LFDUX	A1,   X, INCX2
	fpmr	T1, C2
	LFDUX	A2,   X, INCX2
	fpmr	T2, C2
	LFDUX	A3,   X, INCX2
	fpmr	T3, C2
	LFDUX	A4,   X, INCX2
	fpmr	T4, C2

	LFDUX	A5,   X, INCX2
	LFSDUX	A1,   X2, INCX2

	LFDUX	A6,   X, INCX2
	LFSDUX	A2,   X2, INCX2

	LFDUX	A7,   X, INCX2
	LFSDUX	A3,   X2, INCX2

	LFDUX	A8,   X, INCX2
	LFSDUX	A4,   X2, INCX2
	bdz	LL(113)
	.align 4

	/* Steady state: same add / fpabs / reload rotation as LL(12),   */
	/* with the X2 loads completing A5..A8 for the current block in  */
	/* the first half and refilling A1..A4 for the next block in the */
	/* second half.                                                  */
LL(112):
	fpadd	C1, C1, T1
	LFSDUX	A5,   X2, INCX2
	fpabs	T1, A1
	LFDUX	A1,   X, INCX2

	fpadd	C2, C2, T2
	LFSDUX	A6,   X2, INCX2
	fpabs	T2, A2
	LFDUX	A2,   X, INCX2

	fpadd	C3, C3, T3
	LFSDUX	A7,   X2, INCX2
	fpabs	T3, A3
	LFDUX	A3,   X, INCX2

	fpadd	C4, C4, T4
	LFSDUX	A8,   X2, INCX2
	fpabs	T4, A4
	LFDUX	A4,   X, INCX2

	fpadd	C1, C1, T1
	LFSDUX	A1,   X2, INCX2
	fpabs	T1, A5
	LFDUX	A5,   X, INCX2
	fpadd	C2, C2, T2
	LFSDUX	A2,   X2, INCX2
	fpabs	T2, A6
	LFDUX	A6,   X, INCX2

	fpadd	C3, C3, T3
	LFSDUX	A3,   X2, INCX2
	fpabs	T3, A7
	LFDUX	A7,   X, INCX2
	fpadd	C4, C4, T4
	LFSDUX	A4,   X2, INCX2
	fpabs	T4, A8
	LFDUX	A8,   X, INCX2

	bdnz	LL(112)
	.align 4

	/* Drain: only the X2 halves of A5..A8 are still outstanding. */
LL(113):
	fpadd	C1, C1, T1
	nop
	fpabs	T1, A1
	LFSDUX	A5,   X2, INCX2
	fpadd	C2, C2, T2
	nop
	fpabs	T2, A2
	LFSDUX	A6,   X2, INCX2
	fpadd	C3, C3, T3

	nop
	fpabs	T3, A3
	LFSDUX	A7,   X2, INCX2
	fpadd	C4, C4, T4
	nop
	fpabs	T4, A4
	LFSDUX	A8,   X2, INCX2

	fpadd	C1, C1, T1
	fpabs	T1, A5
	fpadd	C2, C2, T2
	fpabs	T2, A6
	fpadd	C3, C3, T3
	fpabs	T3, A7
	fpadd	C4, C4, T4
	fpabs	T4, A8

	fpadd	C1, C1, T1
	fpadd	C2, C2, T2
	fpadd	C3, C3, T3
	fpadd	C4, C4, T4
	.align 4

	/* Remainder of N mod 16. From here on every element is loaded   */
	/* with a scalar LFDUX, alternating between X and X2, and        */
	/* summed with scalar fabs / fadd.                               */
LL(115):
	andi.	r0,  N, 15
	beq	LL(999)
	andi.	r0,  N, 8
	beq	LL(116)

	/* 8 elements */
	LFDUX	A1,    X,  INCX2
	LFDUX	A2,    X2, INCX2
	LFDUX	A3,    X,  INCX2
	LFDUX	A4,    X2, INCX2

	fabs	T1, A1
	LFDUX	A5,    X,  INCX2
	fabs	T2, A2
	LFDUX	A6,    X2, INCX2
	fabs	T3, A3
	LFDUX	A7,    X,  INCX2
	fabs	T4, A4
	LFDUX	A8,    X2, INCX2

	fadd	C1, C1, T1
	fabs	T1, A5
	fadd	C2, C2, T2
	fabs	T2, A6

	fadd	C3, C3, T3
	fabs	T3, A7
	fadd	C4, C4, T4
	fabs	T4, A8

	fadd	C1, C1, T1
	fadd	C2, C2, T2
	fadd	C3, C3, T3
	fadd	C4, C4, T4
	.align 4

LL(116):
	andi.	r0,  N, 4
	beq	LL(117)

	/* 4 elements */
	LFDUX	A1,    X,  INCX2
	LFDUX	A2,    X2, INCX2
	LFDUX	A3,    X,  INCX2
	LFDUX	A4,    X2, INCX2

	fabs	T1, A1
	fabs	T2, A2
	fabs	T3, A3
	fabs	T4, A4

	fadd	C1, C1, T1
	fadd	C2, C2, T2
	fadd	C3, C3, T3
	fadd	C4, C4, T4
	.align 4

LL(117):
	andi.	r0,  N, 2
	beq	LL(118)

	/* 2 elements */
	LFDUX	A1,    X,  INCX2
	LFDUX	A2,    X2, INCX2

	fabs	T1, A1
	fabs	T2, A2
	fadd	C1, C1, T1
	fadd	C2, C2, T2
	.align 4

LL(118):
	andi.	r0,  N, 1
	beq	LL(999)

	/* Last single element sits one double stride past X. */
	LFDX	A1,    X, INCX2
	fabs	T1, A1
	fadd	C1, C1, T1
	.align 4

/* ---- Reduce, restore f14/f15, release the stack, return ---- */
LL(999):
	fpadd	C1,  C1,  C2		/* fold the four pair accumulators ... */
	li	r10, 16
	fpadd	C3,  C3,  C4
	fpadd	C1,  C1,  C3		/* ... into C1 */
	lfpdux	f15, SP, r10		/* step over the zero block, reload f15 */
	fsmtp	C2, C1			/* C2 primary <- C1 secondary */
	lfpdux	f14, SP, r10		/* reload f14 */
	addi	SP, SP,  16		/* SP is back at its entry value */
	fadd	C1, C2, C1		/* primary + secondary = final sum */
	blr

	EPILOGUE
439