/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* Prefetch lead: PRE1 starts PREFETCHSIZE * SIZE bytes past the start
   of the current column. */
#define PREFETCHSIZE 140

/* Column pointers.  In the BETA == 0 path they are store pointers; in
   the BETA != 0 path they are load pointers.  CO1 and CO2 also serve
   as temporary addresses while C and LDC are loaded at entry. */
#define CO1	r14
#define CO2	r15
#define CO3	r16
/* Store pointers, used only by the BETA != 0 path. */
#define DO1	r17
#define DO2	r18
#define DO3	r19

/* I        = (M >> 4) - 1, the value loaded into ar.lc for each column.
   I_AND_15 = M & 15, the element count left after the 16-wide loop.
   PRE1     = address handed to lfetch. */
#define I	r22
#define I_AND_15 r23
#define PRE1	r24

/* Copies of pr and ar.lc taken at entry. */
#define PR	r30
#define ARLC	r31

/* M and N are read from r32 and r33.  r34 and r35 are not read as
   arguments: they are overwritten with the C pointer and LDC loaded
   from memory at entry.  r36 is overwritten with the column counter J. */
#define M	r32
#define N	r33
#define C	r34
#define LDC	r35
#define J	r36

/* Scale factor. */
#define BETA	f8

/*
 * Scale an M x N block of C in place by BETA, one column at a time.
 *
 * Inputs as read by this code:
 *   M    (r32)  elements per column
 *   N    (r33)  number of columns; the routine returns at once if N <= 0
 *   BETA (f8)   scale factor
 *   C, LDC      loaded from memory at r12 + 16 and r12 + 24
 *               (r12 + 32 and r12 + 40 when XDOUBLE is defined)
 *
 * LDC is shifted left by BASE_SHIFT before being used as the byte step
 * between columns (BASE_SHIFT comes from common.h; presumably log2 of
 * the element size -- confirm there).
 *
 * Two code paths follow:
 *   .L60   BETA compares equal to f0: every element is overwritten
 *          with f0 and C is never read.
 *   .L100  otherwise: every element is loaded, multiplied by BETA and
 *          stored back.
 *
 * ar.lc is saved in ARLC and pr in PR before either is modified.
 */
	PROLOGUE
	.prologue
	PROFCODE

	{ .mmi
#ifndef XDOUBLE
	adds	CO1 = 16, r12		// address the C pointer is loaded from
	adds	CO2 = 24, r12		// address LDC is loaded from
#else
	adds	CO1 = 32, r12
	adds	CO2 = 40, r12
#endif
	.save	ar.lc, ARLC
	mov	ARLC = ar.lc		// keep the incoming ar.lc for the exits
	}
	{ .mfb
	cmp.ge	p6, p0 = 0, N		// p6 = (N <= 0)
	fcmp.eq	p0, p15 = BETA, f0	// p15 = !(BETA == f0)
	(p6) br.ret.sptk.many b0	// nothing to do
	}
	;;
	.body
	{ .mmi
	ld8	C = [CO1], 8		// post-incremented CO1 is not used again
	ld8	LDC = [CO2]
	mov	PR = pr			// keep the incoming predicates
	}
	{ .mmi
	mov	J = N			// J = columns still to process
	shr	I = M, 4		// number of full 16-element blocks
	}
	;;
	{ .mmb
	shladd LDC = LDC, BASE_SHIFT, r0	// LDC = LDC << BASE_SHIFT
	adds	I = -1, I		// loop count form for ar.lc; -1 when M < 16
	(p15) br.cond.dpnt .L100		// if (beta != 0) goto L100
	}
	;;
	.align 32

/*
 * BETA == 0 path: overwrite each column with f0.
 *
 * .L60  per-column setup
 * .L70  main loop, 16 elements per iteration (runs ar.lc + 1 times)
 * .L80  remainder of M & 15 elements, selected by bits 3..0 of M
 * .L99  column loop control and return
 */
.L60:
	{ .mmi
	mov	CO1 = C			// CO1 covers offsets 0-3 and 8-11 of each 16
	mov	CO3 = C			// CO3 tracks the block start for the tail
	add	CO2 = 4 * SIZE, C	// CO2 covers offsets 4-7 and 12-15
	}
	{ .mmi
	adds	PRE1 = PREFETCHSIZE * SIZE, C
	add	C = C, LDC		// C now points at the next column
	tbit.nz	p12, p0 = M, 3		// p12: tail holds a group of 8
	}
	;;
	{ .mmi
	and	I_AND_15 = 15, M
	mov	ar.lc = I
	}
	{ .mib
	cmp.gt	p8, p0 = 0, I		// I < 0: no full 16-element block
	(p8) br.cond.dpnt .L80
	}
	;;
	.align 32

	// Each pointer stores four consecutive elements, skips the four
	// written by the other pointer (the 5 * SIZE step), then repeats.
.L70:
	{ .mmi
	STFD	[CO1] = f0, 1 * SIZE
	STFD	[CO2] = f0, 1 * SIZE
	}
	{ .mmi
	lfetch.excl.nt1	[PRE1]
	nop.m 0
	adds	PRE1 = 16 * SIZE, PRE1
	}
	;;
	{ .mmi
	STFD	[CO1] = f0, 1 * SIZE
	STFD	[CO2] = f0, 1 * SIZE
	adds	CO3 = 16 * SIZE, CO3
	}
	;;
	{ .mmi
	STFD	[CO1] = f0, 1 * SIZE
	STFD	[CO2] = f0, 1 * SIZE
	}
	;;
	{ .mmi
	STFD	[CO1] = f0, 5 * SIZE
	STFD	[CO2] = f0, 5 * SIZE
	}
	;;
	{ .mmi
	STFD	[CO1] = f0, 1 * SIZE
	STFD	[CO2] = f0, 1 * SIZE
	}
	;;
	{ .mmi
	STFD	[CO1] = f0, 1 * SIZE
	STFD	[CO2] = f0, 1 * SIZE
	}
	;;
	{ .mmi
	STFD	[CO1] = f0, 1 * SIZE
	STFD	[CO2] = f0, 1 * SIZE
	}
	;;
	{ .mmb
	STFD	[CO1] = f0, 5 * SIZE
	STFD	[CO2] = f0, 5 * SIZE
	br.cloop.sptk.few .L70
	}
	;;
	.align 32

	// Tail: p12 = 8 elements (CO1 and CO2, four each), p13 = 4 (CO1),
	// p14 = 2 and p15 = 1 (both through CO3, which is first advanced
	// past the groups of 8 and 4).
.L80:
	{ .mmi
	(p12) STFD [CO1] = f0, 1 * SIZE
	(p12) STFD [CO2] = f0, 1 * SIZE
	tbit.nz	p13, p0 = M, 2
	}
	{ .mmb
	cmp.eq	p9, p0 = 0, I_AND_15
	adds	J = -1, J		// one column consumed
	(p9) br.cond.dptk .L99		// M is a multiple of 16: no tail
	}
	;;
	{ .mmi
	(p12) STFD [CO1] = f0, 1 * SIZE
	(p12) STFD [CO2] = f0, 1 * SIZE
	tbit.nz	p14, p0 = M, 1
	}
	;;
	{ .mmi
	(p12) STFD [CO1] = f0, 1 * SIZE
	(p12) STFD [CO2] = f0, 1 * SIZE
	(p12) adds CO3 = 8 * SIZE, CO3
	}
	;;
	{ .mmi
	(p12) STFD [CO1] = f0, 5 * SIZE	// leaves CO1 just past the group of 8
	(p12) STFD [CO2] = f0
	(p13) adds CO3 = 4 * SIZE, CO3
	}
	;;
	{ .mmi
	(p13) STFD [CO1] = f0, 1 * SIZE
	(p14) STFD [CO3] = f0, 1 * SIZE
	}
	;;
	{ .mmi
	(p13) STFD [CO1] = f0, 1 * SIZE
	(p14) STFD [CO3] = f0, 1 * SIZE
	tbit.nz	p15, p0 = M, 0
	}
	;;
	{ .mmi
	(p13) STFD [CO1] = f0, 1 * SIZE
	(p15) STFD [CO3] = f0
	}
	;;
	{ .mmi
	(p13) STFD [CO1] = f0
	}
	;;
	.align 32

	// pr is not restored on this exit: this path writes only p6, p8,
	// p9 and p12-p15, which the IA-64 software conventions treat as
	// scratch predicates.
.L99:
	{ .mib
	cmp.lt	p6, p0 = 0, J		// more columns left?
	mov ar.lc = ARLC		// .L60 reloads ar.lc if we loop
	}
	{ .mbb
	(p6) br.cond.dptk .L60
	br.ret.sptk.many b0
	}
	;;
	.align 32

/*
 * BETA != 0 path: load, multiply by BETA, store back.
 *
 * .L100  per-column setup
 * .L170  software-pipelined main loop, 16 elements per iteration
 * .L180  remainder of M & 15 elements, selected by bits 3..0 of M
 * .L199  column loop control, predicate restore and return
 *
 * CO1/CO2/CO3 are the load pointers and DO1/DO2/DO3 the matching store
 * pointers; both sets start at the same column address.
 */
.L100:
	{ .mmi
	mov	CO1 = C
	mov	CO3 = C
	mov	pr.rot = 0		// clear the rotating predicates
	}
	{ .mmi
	adds	PRE1 = PREFETCHSIZE * SIZE, C
	add	CO2 = 4 * SIZE, C
	mov	DO1 = C
	}
	;;
	{ .mmi
	mov	ar.ec = 6		// six stages, p16 through p21
	}
	{ .mmi
	adds	DO2 = 4 * SIZE, C
	mov	DO3 = C
	add	C = C, LDC		// C now points at the next column
	}
	;;
	{ .mmi
	and	I_AND_15 = 15, M
	cmp.eq	p16, p0 = r0, r0	// p16 = 1: enable the first stage
	mov	ar.lc = I
	}
	{ .mib
	cmp.gt	p8, p0 = 0, I		// I < 0: no full 16-element block
	tbit.nz	p12, p0 = M, 3		// p12: tail holds a group of 8
	(p8) br.cond.dpnt .L180
	}
	;;
	.align 32

	// Pipeline stages:
	//   p16  load 16 elements into f32, f38, ..., f122 (six apart)
	//   p20  multiply the first eight of them into f6, f7, f10-f15
	//   p21  store those eight, multiply the other eight into the
	//        same registers, then store them too
	// br.ctop rotates the register file every iteration, so a value
	// loaded into fN under p16 is named fN+4 under p20 and fN+5 under
	// p21.  Each store shares an instruction group with the FMPY that
	// rewrites its source register, so it writes out the previous
	// product.
.L170:
	{ .mmf
	(p21) STFD [DO1] = f6, 1 * SIZE
	(p21) STFD [DO2] = f7, 1 * SIZE
	(p21) FMPY f6  = BETA, f85
	}
	{ .mmf
	(p16) lfetch.excl.nt1	[PRE1]
	(p16) adds CO3 = 16 * SIZE, CO3
	(p21) FMPY f7  = BETA, f91
	}
	;;
	{ .mmf
	(p21) STFD [DO1] = f10, 1 * SIZE
	(p21) STFD [DO2] = f11, 1 * SIZE
	(p21) FMPY f10 = BETA, f97
	}
	{ .mmf
	(p16) LDFD f32 = [CO1], 1 * SIZE
	(p16) LDFD f38 = [CO2], 1 * SIZE
	(p21) FMPY f11 = BETA, f103
	}
	;;
	{ .mmf
	(p21) STFD [DO1] = f12, 1 * SIZE
	(p21) STFD [DO2] = f13, 1 * SIZE
	(p21) FMPY f12 = BETA, f109
	}
	{ .mmf
	(p16) LDFD f44 = [CO1], 1 * SIZE
	(p16) LDFD f50 = [CO2], 1 * SIZE
	(p21) FMPY f13 = BETA, f115
	}
	;;
	{ .mmf
	(p21) STFD [DO1] = f14, 5 * SIZE
	(p21) STFD [DO2] = f15, 5 * SIZE
	(p21) FMPY f14 = BETA, f121
	}
	{ .mmf
	(p16) LDFD f56 = [CO1], 1 * SIZE
	(p16) LDFD f62 = [CO2], 1 * SIZE
	(p21) FMPY f15 = BETA, f127
	}
	;;
	{ .mmf
	(p21) STFD [DO1] = f6, 1 * SIZE
	(p21) STFD [DO2] = f7, 1 * SIZE
	(p20) FMPY f6  = BETA, f36
	}
	{ .mmf
	(p16) LDFD f68 = [CO1], 5 * SIZE
	(p16) LDFD f74 = [CO2], 5 * SIZE
	(p20) FMPY f7  = BETA, f42
	}
	;;
	{ .mmf
	(p21) STFD [DO1] = f10, 1 * SIZE
	(p21) STFD [DO2] = f11, 1 * SIZE
	(p20) FMPY f10 = BETA, f48
	}
	{ .mmf
	(p16) LDFD f80 = [CO1], 1 * SIZE
	(p16) LDFD f86 = [CO2], 1 * SIZE
	(p20) FMPY f11 = BETA, f54
	}
	;;
	{ .mmf
	(p21) STFD [DO1] = f12, 1 * SIZE
	(p21) STFD [DO2] = f13, 1 * SIZE
	(p20) FMPY f12 = BETA, f60
	}
	{ .mmf
	(p16) LDFD f92 = [CO1], 1 * SIZE
	(p16) LDFD f98 = [CO2], 1 * SIZE
	(p20) FMPY f13 = BETA, f66
	}
	;;
	{ .mmf
	(p21) STFD [DO1] = f14, 5 * SIZE
	(p21) STFD [DO2] = f15, 5 * SIZE
	(p20) FMPY f14 = BETA, f72
	}
	{ .mmf
	(p16) LDFD f104 = [CO1], 1 * SIZE
	(p16) LDFD f110 = [CO2], 1 * SIZE
	(p20) FMPY f15 = BETA, f78
	}
	;;
	{ .mmi
	(p16) LDFD f116 = [CO1], 5 * SIZE
	(p16) LDFD f122 = [CO2], 5 * SIZE
	adds	PRE1 = 16 * SIZE, PRE1
	}
	{ .mmb
	(p16) adds DO3 = 16 * SIZE, DO3
	nop.m 0
	br.ctop.sptk.few .L170
	}
	;;
	.align 32

	// Tail: p12 = 8 elements (f32-f35 through CO1/DO1, f36-f39 through
	// CO2/DO2), p13 = 4 (f40-f43 through CO1/DO1), p14 = 2 (f44, f45)
	// and p15 = 1 (f46), both through CO3/DO3 after those pointers
	// have been advanced past the groups of 8 and 4.
.L180:
	{ .mmi
	(p12) LDFD f32 = [CO1], 1 * SIZE
	(p12) LDFD f36 = [CO2], 1 * SIZE
	tbit.nz	p13, p0 = M, 2
	}
	{ .mmb
	cmp.eq	p9, p0 = 0, I_AND_15
	adds	J = -1, J		// one column consumed
	(p9) br.cond.dptk .L199		// M is a multiple of 16: no tail
	}
	;;
	{ .mmi
	(p12) LDFD f33 = [CO1], 1 * SIZE
	(p12) LDFD f37 = [CO2], 1 * SIZE
	tbit.nz	p14, p0 = M, 1
	}
	;;
	{ .mmi
	(p12) LDFD f34 = [CO1], 1 * SIZE
	(p12) LDFD f38 = [CO2], 1 * SIZE
	(p12) adds CO3 = 8 * SIZE, CO3
	}
	;;
	{ .mmi
	(p12) LDFD f35 = [CO1], 5 * SIZE	// leaves CO1 just past the group of 8
	(p12) LDFD f39 = [CO2]
	(p13) adds CO3 = 4 * SIZE, CO3
	}
	;;
	{ .mmi
	(p13) LDFD f40 = [CO1], 1 * SIZE
	(p14) LDFD f44 = [CO3], 1 * SIZE
	}
	;;
	{ .mmi
	(p13) LDFD f41 = [CO1], 1 * SIZE
	(p14) LDFD f45 = [CO3], 1 * SIZE
	tbit.nz	p15, p0 = M, 0
	}
	;;
	{ .mmf
	(p13) LDFD f42 = [CO1], 1 * SIZE
	(p15) LDFD f46 = [CO3]
	(p12) FMPY f32 = BETA, f32
	}
	{ .mmf
	(p12) FMPY f36 = BETA, f36
	}
	;;
	{ .mmf
	(p13) LDFD f43 = [CO1]
	(p12) FMPY f33 = BETA, f33
	}
	{ .mmf
	(p12) FMPY f37 = BETA, f37
	}
	;;
	// These four multiplies are written outside explicit bundle braces.
	(p12) FMPY f34 = BETA, f34
	(p12) FMPY f38 = BETA, f38
	(p12) FMPY f35 = BETA, f35
	(p12) FMPY f39 = BETA, f39

	;;
	{ .mmf
	(p12) STFD [DO1] = f32, 1 * SIZE
	(p12) STFD [DO2] = f36, 1 * SIZE
	(p13) FMPY f40 = BETA, f40
	}
	{ .mmf
	(p12) adds DO3 = 8 * SIZE, DO3
	(p14) FMPY f44 = BETA, f44
	}
	;;
	{ .mmf
	(p12) STFD [DO1] = f33, 1 * SIZE
	(p12) STFD [DO2] = f37, 1 * SIZE
	(p13) FMPY f41 = BETA, f41
	}
	{ .mmf
	(p13) adds DO3 = 4 * SIZE, DO3
	(p14) FMPY f45 = BETA, f45
	}
	;;
	{ .mmf
	(p12) STFD [DO1] = f34, 1 * SIZE
	(p12) STFD [DO2] = f38, 1 * SIZE
	(p13) FMPY f42 = BETA, f42
	}
	{ .mmf
	(p15) FMPY f46 = BETA, f46
	}
	;;
	{ .mmf
	(p12) STFD [DO1] = f35, 5 * SIZE	// leaves DO1 just past the group of 8
	(p12) STFD [DO2] = f39
	(p13) FMPY f43 = BETA, f43
	}
	;;
	{ .mmi
	(p13) STFD [DO1] = f40, 1 * SIZE
	(p14) STFD [DO3] = f44, 1 * SIZE
	}
	;;
	{ .mmi
	(p13) STFD [DO1] = f41, 1 * SIZE
	(p14) STFD [DO3] = f45, 1 * SIZE
	}
	;;
	{ .mmi
	(p13) STFD [DO1] = f42, 1 * SIZE
	(p15) STFD [DO3] = f46
	}
	;;
	{ .mmi
	(p13) STFD [DO1] = f43
	}
	;;
	.align 32

.L199:
	{ .mib
	cmp.lt	p6, p0 = 0, J		// more columns left?
	mov ar.lc = ARLC		// .L100 reloads ar.lc if we loop
	(p6) br.cond.dptk .L100
	}
	;;
	{ .mib
	mov	pr = PR, -1		// this path wrote p16 and up: restore all
	br.ret.sptk.many b0
	}
	;;
	EPILOGUE

