/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* Argument registers: element count, vector pointer, stride. */
#define N	r3
#define X	r4
#define INCX	r5

/* Defined but not referenced by any instruction below. */
#define PREA	r8

/* Loaded with 0.0 on entry.  It shares f1 with the result register, so the
   early-exit paths return 0.0 without any extra move. */
#define FZERO	f1

/* Local frame: bytes 0..143 hold f14-f31, byte 144 is a scratch word used
   to build the 0.0 constant. */
#define STACKSIZE 160

/*
 * Minimum-element kernel: scans N elements of X, INCX elements apart, and
 * leaves the smallest value in f1.
 *
 *   In:   N (r3) element count, X (r4) vector base, INCX (r5) stride in
 *         elements.  When F_INTERFACE is defined, N and INCX arrive as
 *         pointers and are dereferenced with LDINT.
 *   Out:  f1 = minimum; 0.0 when N <= 0 or INCX <= 0.
 *   Regs: r0 and CTR are scratch.  f0-f7 are eight running minima, f8-f15
 *         hold differences, f16-f31 hold loaded elements.  f14-f31 are
 *         saved on entry and restored on exit.
 *
 * Every update has the form
 *         d   = acc - x
 *         acc = fsel(d, x, acc)
 * so acc becomes x when acc - x >= 0 and is kept otherwise.  fsel also
 * takes its last operand when the selector is a NaN, so a NaN difference
 * leaves acc unchanged.
 *
 * The element width (BASE_SHIFT) and the opcode behind LFDUX are supplied
 * by common.h and are not visible in this file.
 */
	PROLOGUE
	PROFCODE

	addi	SP, SP, -STACKSIZE	/* open the local frame */
	li	r0,   0

	/* Save f14-f31; all of them are overwritten below. */
	stfd	f14,    0(SP)
	stfd	f15,    8(SP)
	stfd	f16,   16(SP)
	stfd	f17,   24(SP)

	stfd	f18,   32(SP)
	stfd	f19,   40(SP)
	stfd	f20,   48(SP)
	stfd	f21,   56(SP)

	stfd	f22,   64(SP)
	stfd	f23,   72(SP)
	stfd	f24,   80(SP)
	stfd	f25,   88(SP)

	stfd	f26,   96(SP)
	stfd	f27,  104(SP)
	stfd	f28,  112(SP)
	stfd	f29,  120(SP)

	stfd	f30,  128(SP)
	stfd	f31,  136(SP)

	/* Build 0.0 in FZERO by bouncing a zero word through the frame. */
	stw	r0,   144(SP)
	lfs	FZERO,144(SP)

#ifdef F_INTERFACE
	LDINT	N,    0(N)
	LDINT	INCX, 0(INCX)
#endif

	slwi	INCX, INCX, BASE_SHIFT	/* stride: elements -> bytes */

	/* Step X back by one stride.  Every load below is LFDUX with (X, INCX),
	   presumably an update-form load that advances X before reading
	   (confirm against common.h). */
	sub	X, X, INCX

	/* Nothing to scan: leave with f1 still holding 0.0. */
	cmpwi	cr0, N, 0
	ble-	LL(9999)
	cmpwi	cr0, INCX, 0
	ble-	LL(9999)

	LFDUX	f1, X, INCX		/* first element seeds every accumulator */

	fmr	f0, f1
	subi	N, N, 1			/* N = elements still to be read */
	fmr	f2, f1
	fmr	f3, f1
	fmr	f4, f1
	fmr	f5, f1
	srawi.	r0, N, 4		/* r0 = number of 16-element groups */
	fmr	f6, f1
	mtspr	CTR,  r0
	fmr	f7, f1
	beq-	LL(150)			/* no full group: tail only */

	/* Pipeline fill: fetch the first group into f16-f31 and start the
	   differences for its first eight elements. */
	LFDUX	f16,   X, INCX
	LFDUX	f17,   X, INCX
	LFDUX	f18,   X, INCX
	LFDUX	f19,   X, INCX
	LFDUX	f20,   X, INCX
	LFDUX	f21,   X, INCX
	LFDUX	f22,   X, INCX
	LFDUX	f23,   X, INCX

	LFDUX	f24,   X, INCX
	fsub	f8,  f0,  f16
	LFDUX	f25,   X, INCX
	fsub	f9,  f1,  f17
	LFDUX	f26,   X, INCX
	fsub	f10, f2,  f18
	LFDUX	f27,   X, INCX
	fsub	f11, f3,  f19
	LFDUX	f28,   X, INCX
	fsub	f12, f4,  f20
	LFDUX	f29,   X, INCX
	fsub	f13, f5,  f21
	LFDUX	f30,   X, INCX
	fsub	f14, f6,  f22
	LFDUX	f31,   X, INCX
	fsub	f15, f7,  f23
	bdz	LL(120)			/* only one group: go straight to the drain */
	.align 4

	/* Steady state: each pass folds the 16 elements already held in
	   f16-f31 into f0-f7 while loading the next 16 into the same registers.
	   A register is reloaded only after the fsel that consumed it. */
LL(110):
	fsel	f0,  f8,  f16,  f0
	LFDUX	f16,   X, INCX
	fsub	f8,  f0,  f24
	fsel	f1,  f9,  f17,  f1
	LFDUX	f17,   X, INCX
	fsub	f9,  f1,  f25
	fsel	f2,  f10, f18,  f2
	LFDUX	f18,   X, INCX
	fsub	f10, f2,  f26
	fsel	f3,  f11, f19,  f3
	LFDUX	f19,   X, INCX
	fsub	f11, f3,  f27

	fsel	f4,  f12, f20, f4
	LFDUX	f20,   X, INCX
	fsub	f12, f4,  f28
	fsel	f5,  f13, f21, f5
	LFDUX	f21,   X, INCX
	fsub	f13, f5,  f29
	fsel	f6,  f14, f22, f6
	LFDUX	f22,   X, INCX
	fsub	f14, f6,  f30
	fsel	f7,  f15, f23, f7
	LFDUX	f23,   X, INCX
	fsub	f15, f7,  f31

	fsel	f0,  f8,  f24, f0
	LFDUX	f24,   X, INCX
	fsub	f8,  f0,  f16
	fsel	f1,  f9,  f25, f1
	LFDUX	f25,   X, INCX
	fsub	f9,  f1,  f17
	fsel	f2,  f10, f26, f2
	LFDUX	f26,   X, INCX
	fsub	f10, f2,  f18
	fsel	f3,  f11, f27, f3
	LFDUX	f27,   X, INCX
	fsub	f11, f3,  f19

	fsel	f4,  f12, f28, f4
	LFDUX	f28,   X, INCX
	fsub	f12, f4,  f20
	fsel	f5,  f13, f29, f5
	LFDUX	f29,   X, INCX
	fsub	f13, f5,  f21
	fsel	f6,  f14, f30, f6
	LFDUX	f30,   X, INCX
	fsub	f14, f6,  f22
	fsel	f7,  f15, f31, f7
	LFDUX	f31,   X, INCX
	fsub	f15, f7,  f23
	bdnz	LL(110)
	.align 4

	/* Drain: fold the last loaded group into f0-f7; no further loads. */
LL(120):
	fsel	f0,  f8,  f16, f0
	fsub	f8,  f0,  f24
	fsel	f1,  f9,  f17, f1
	fsub	f9,  f1,  f25
	fsel	f2,  f10, f18, f2
	fsub	f10, f2,  f26
	fsel	f3,  f11, f19, f3
	fsub	f11, f3,  f27

	fsel	f4,  f12, f20, f4
	fsub	f12, f4,  f28
	fsel	f5,  f13, f21, f5
	fsub	f13, f5,  f29
	fsel	f6,  f14, f22, f6
	fsub	f14, f6,  f30
	fsel	f7,  f15, f23, f7
	fsub	f15, f7,  f31

	fsel	f0,  f8,  f24, f0
	fsel	f1,  f9,  f25, f1
	fsel	f2,  f10, f26, f2
	fsel	f3,  f11, f27, f3
	fsel	f4,  f12, f28, f4
	fsel	f5,  f13, f29, f5
	fsel	f6,  f14, f30, f6
	fsel	f7,  f15, f31, f7
	.align 4

	/* Tail: the remaining N & 15 elements, folded one at a time into f1. */
LL(150):
	andi.	r0,  N, 15
	mtspr	CTR, r0
	beq	LL(999)			/* tests the andi. result: no leftovers */
	.align 4

LL(160):
	LFDUX	f8,    X, INCX
	fsub	f16, f1, f8
	fsel	f1, f16, f8, f1
	bdnz	LL(160)
	.align 4

	/* Reduce the eight accumulators pairwise (8 -> 4 -> 2 -> 1); the final
	   fsel writes the overall minimum to f1. */
LL(999):
	fsub	f8,  f0,  f1
	fsub	f9,  f2,  f3
	fsub	f10, f4,  f5
	fsub	f11, f6,  f7

	fsel	f0,  f8,  f1,  f0
	fsel	f2,  f9,  f3,  f2
	fsel	f4,  f10, f5,  f4
	fsel	f6,  f11, f7,  f6

	fsub	f8,  f0,  f2
	fsub	f9,  f4,  f6
	fsel	f0,  f8,  f2,  f0
	fsel	f4,  f9,  f6,  f4

	fsub	f8,  f0,  f4
	fsel	f1,  f8,  f4,  f0
	.align 4

	/* Common exit: restore f14-f31, release the frame, return. */
LL(9999):
	lfd	f14,    0(SP)
	lfd	f15,    8(SP)
	lfd	f16,   16(SP)
	lfd	f17,   24(SP)

	lfd	f18,   32(SP)
	lfd	f19,   40(SP)
	lfd	f20,   48(SP)
	lfd	f21,   56(SP)

	lfd	f22,   64(SP)
	lfd	f23,   72(SP)
	lfd	f24,   80(SP)
	lfd	f25,   88(SP)

	lfd	f26,   96(SP)
	lfd	f27,  104(SP)
	lfd	f28,  112(SP)
	lfd	f29,  120(SP)

	lfd	f30,  128(SP)
	lfd	f31,  136(SP)

	addi	SP, SP, STACKSIZE
	blr

	EPILOGUE