1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
/* Look-ahead distances, in units of SIZE, added to the source / destination */
/* address when the loops below seed their prefetch pointers:                */
/* PREFETCHSIZE for PREA (lfetch), WPREFETCHSIZE for PREB (lfetch.excl).     */
42#define PREFETCHSIZE   64
43#define WPREFETCHSIZE  32
44
/* LD / ST are the load and store mnemonics used for every element moved.    */
/* XDOUBLE selects LDFD / STFD_NTA, otherwise LDF8 / STF8_NTA. None of these */
/* four names is defined in this file (presumably they come from common.h).  */
45#ifndef XDOUBLE
46#define LD	LDF8
47#define ST	STF8_NTA
48#else
49#define LD	LDFD
50#define ST	STFD_NTA
51#endif
52
/* J: number of 8-column panels still to process (N >> 3, counted down).     */
/* PREB / PREA: running prefetch addresses for the destination / source.     */
53#define J	r15
54#define PREB	r17
55#define PREA	r18
56
/* A1..A8: read pointers, one per column of the current panel                */
/* (A, A + LDA, ..., A + 7 * LDA); narrower panels use only the first few.   */
/* B1: second write pointer, used for the second row of each loop pass.      */
/* B2: defined here but not referenced anywhere in this file.                */
57#define A1	r19
58#define A2	r20
59#define A3	r21
60#define A4	r22
61#define A5	r23
62#define A6	r24
63#define A7	r25
64#define A8	r26
65#define B1	r27
66#define B2	r28
67
/* COUNT: pass counter used to re-base PREA every 8 (or 4) passes.           */
/* I:  scratch for the loop-count computation, then rows left to load.       */
/* II: rows left to store (lags I because stores run five stages later).     */
68#define COUNT	r9
69#define I	r10
70#define II	r11
71
/* Save slots for ar.lc and the predicate registers; both are restored at    */
/* .L999 before returning.                                                   */
72#define ARLC	r29
73#define PR	r30
74
/* r32..r36 hold M, N, A, LDA and B. NOTE(review): these look like the       */
/* routine's five incoming arguments in that order - confirm against the     */
/* C prototype of the caller.                                                */
75#define M	r32
76#define N	r33
77#define A	r34
78#define LDA	r35
79#define B	r36
80
/*
 * Packing copy: reads the source matrix at A and writes it, reordered, to
 * the contiguous buffer at B.
 *
 * Terminology used in the comments below: a "column" is a run of M
 * consecutive elements of A; consecutive columns start LDA apart. A "row"
 * is the set of elements at the same offset in every column.
 *
 * Inputs (register aliases):
 *   M   - number of rows (elements copied from each column)
 *   N   - number of columns
 *   A   - address of the first source column
 *   LDA - column stride; shifted left by BASE_SHIFT on entry (BASE_SHIFT is
 *         defined outside this file - presumably it converts an element
 *         count into a byte count; confirm)
 *   B   - destination address
 *
 * Output layout: the columns are consumed in panels of 8 (N >> 3 of them),
 * then one panel of 4, of 2 and of 1 if the corresponding bit of N is set.
 * Within a panel of width w, B receives for each row r = 0 .. M-1 the w
 * elements of that row, one from each column in column order, so a panel
 * occupies w * M consecutive elements of B. Panels follow each other in B
 * without gaps.
 *
 * Loop structure: every panel is handled by a modulo-scheduled br.ctop loop
 * with ar.ec = 6. One pass loads two rows (stage predicate p16) into the
 * rotating registers f32.. and, five rotations later (p21), stores them,
 * which is why a value loaded into fK is stored from fK+5. ar.lc is set to
 * ((M + 1) >> 1) - 1, i.e. one pass per pair of rows. When M is odd the
 * second row of the last pass is suppressed: p13 gates the second-row loads
 * and p14 the second-row stores.
 *
 * State: pr and ar.lc are saved on entry and restored at .L999. ar.ec is
 * written but neither saved nor restored here. Scratch: the general
 * registers aliased above, f32-f127 and the predicates.
 *
 * NOTE(review): with M == 0 the value moved into ar.lc is -1; callers
 * presumably guarantee M >= 1 - confirm.
 *
 * "nop __LINE__" is bundle filler; its immediate is simply the preprocessor
 * line number, so it changes whenever lines are added above it.
 */
81	PROLOGUE
82	.prologue
83	PROFCODE
84
85	.body
86	{ .mii
87	shladd	LDA = LDA, BASE_SHIFT, r0	/* LDA = LDA << BASE_SHIFT */
88	mov	PR = pr				/* save predicates for .L999 */
89	shr	J = N, 3			/* J = number of 8-column panels */
90	}
91	;;
92	{ .mib
93	cmp.eq	p8,  p0 = 0, J
94	mov	ARLC  = ar.lc			/* save loop counter for .L999 */
95	(p8)  br.cond.dpnt .L20			/* no 8-wide panel: go to the 4-wide case */
96	}
97	;;
98	.align 32
99

/* ---- .L11: set up one 8-column panel (repeated J times) ---------------- */
100.L11:
101	{ .mmi
102	mov	A1 = A				/* column 0 */
103	add	A2 = A, LDA			/* column 1 */
104	mov	pr.rot = 0			/* clear the rotating (stage) predicates */
105	}
106	{ .mmi
107	shladd	A3 = LDA, 1, A			/* column 2 = A + 2 * LDA */
108	shladd	A5 = LDA, 2, A			/* column 4 = A + 4 * LDA */
109	adds	I = 1, M
110	}
111	;;
112	{ .mmi
113	shladd	A4 = LDA, 1, A2			/* column 3 = A2 + 2 * LDA */
114	shladd	A6 = LDA, 2, A2			/* column 5 = A2 + 4 * LDA */
115	mov	ar.ec  = 6			/* six pipeline stages: load at p16, store at p21 */
116	}
117	{ .mmi
118	cmp.eq	p16, p0 = r0, r0		/* p16 = 1: enable the first stage */
119	shladd	A7 = LDA, 2, A3			/* column 6 = A3 + 4 * LDA */
120	shr	I = I, 1			/* I = (M + 1) >> 1 passes, two rows each */
121	}
122	;;
123	{ .mmi
124	adds	B1 = 8 * SIZE, B		/* B1 writes the second row of each pass */
125	shladd	A8 = LDA, 2, A4			/* column 7 = A4 + 4 * LDA */
126	shladd	A = LDA, 3, A			/* A now points at the next panel */
127	}
128	{ .mmi
129	adds	I = -1, I			/* ar.lc wants passes - 1 */
130	mov	COUNT = 0
131	adds	J = -1, J			/* one panel fewer to go */
132	}
133	;;
134	{ .mmi
135	adds	PREA =  PREFETCHSIZE * SIZE, A	/* uses the already advanced A */
136	adds	PREB = WPREFETCHSIZE * SIZE, B
137	mov	ar.lc = I
138	}
139	{ .mmi
140	mov	I  = M				/* rows left to load */
141	mov	II = M				/* rows left to store */
142	cmp.ne p14, p0 = r0, r0			/* p14 = 0: no second-row store yet */
143	}
144	;;
145	.align 32
146

/* ---- .L12: pipelined copy of an 8-column panel, two rows per pass ------ */
/* First-row values go through B (f37, f43, f61, f67, f85, f91, f109, f115), */
/* second-row values through B1 (f49, f55, f73, f79, f97, f103, f121, f127). */
147.L12:
148	{ .mmi
149	(p21) ST	[B ] = f37,  1 * SIZE
150	(p14) ST	[B1] = f49,  1 * SIZE
151	(p16) cmp.ne.unc p13, p0 = 1, I		/* p13 = a second row exists to load (I != 1) */
152	}
153	{ .mmi
154	lfetch.nt1	[PREA], LDA		/* prefetch source, then step PREA by LDA */
155	lfetch.excl.nt1 [PREB]			/* prefetch destination for writing */
156	adds	PREB = 16 * SIZE, PREB		/* a full pass writes 16 elements */
157	}
158	;;
159	{ .mmi
160	(p21) ST	[B ] = f43,  1 * SIZE
161	(p14) ST	[B1] = f55,  1 * SIZE
162	cmp.eq	p9, p0 = 8, COUNT		/* p9 = eight passes since the last PREA re-base */
163	}
164	{ .mmi
165	(p16) LD	f32  = [A1], SIZE
166	(p16) LD	f38  = [A2], SIZE
167	(p16) adds	I  = -2, I		/* two rows consumed on the load side */
168	}
169	;;
170	{ .mmi
171	(p21) ST	[B ] = f61,  1 * SIZE
172	(p14) ST	[B1] = f73,  1 * SIZE
173	(p9) mov COUNT = 0
174	}
175	{ .mmi
176	(p13) LD	f44  = [A1], SIZE
177	(p13) LD	f50  = [A2], SIZE
178	(p21) adds	II = -2, II		/* two rows consumed on the store side */
179	}
180	;;
181	{ .mmb
182	(p21) ST	[B ] = f67,  1 * SIZE
183	(p14) ST	[B1] = f79,  1 * SIZE
184	nop   __LINE__
185	}
186	{ .mmb
187	(p16) LD	f56  = [A3], SIZE
188	(p16) LD	f62  = [A4], SIZE
189	nop   __LINE__
190	}
191	;;
192	{ .mmi
193	(p21) ST	[B ] = f85,  1 * SIZE
194	(p14) ST	[B1] = f97,  1 * SIZE
195	(p9) adds PREA =  (PREFETCHSIZE - 2)* SIZE, A1	/* re-base PREA on column 0's read pointer */
196	}
197	{ .mmb
198	(p13) LD	f68  = [A3], SIZE
199	(p13) LD	f74  = [A4], SIZE
200	nop   __LINE__
201	}
202	;;
203	{ .mmb
204	(p21) ST	[B ] = f91,  1 * SIZE
205	(p14) ST	[B1] = f103, 1 * SIZE
206	nop   __LINE__
207	}
208	{ .mmb
209	(p16) LD	f80  = [A5], SIZE
210	(p16) LD	f86  = [A6], SIZE
211	nop   __LINE__
212	}
213	;;
214	{ .mmb
215	(p21) ST	[B ] = f109, 1 * SIZE
216	(p14) ST	[B1] = f121, 1 * SIZE
217	nop   __LINE__
218	}
219	{ .mmb
220	(p13) LD	f92  = [A5], SIZE
221	(p13) LD	f98  = [A6], SIZE
222	nop   __LINE__
223	}
224	;;
225	{ .mmi
226	(p21) ST	[B ] = f115, 1 * SIZE
227	(p14) ST	[B1] = f127, 9 * SIZE	/* 7 + 9: B1 moves 16 elements per pass */
228	(p16) adds	COUNT = 1, COUNT
229	}
230	{ .mmb
231	(p16) LD	f104 = [A7], SIZE
232	(p16) LD	f110 = [A8], SIZE
233	nop   __LINE__
234	}
235	;;
236	{ .mmi
237	(p13) LD	f116 = [A7], SIZE
238	(p13) LD	f122 = [A8], SIZE
239	(p14) adds	B = 8 * SIZE, B		/* skip the second row that B1 just wrote */
240	}
241	{ .mmb
242	(p20) cmp.ne.unc p14, p0 = 1, II	/* p14 for the next pass: second row to store (II != 1) */
243	nop   __LINE__
244	br.ctop.sptk.few .L12
245	}
246	;;
247	{ .mmb
248	cmp.ne	p6, p0 = 0, J
249	nop   __LINE__
250	(p6) br.cond.dptk .L11			/* more 8-column panels remain */
251	}
252	;;
253	.align 32
254

/* ---- .L20: optional 4-column panel (bit 2 of N) ------------------------ */
255.L20:
256	{ .mmi
257	adds	I = 1, M
258	mov	A1 = A				/* column 0 */
259	mov	pr.rot = 0			/* clear the rotating (stage) predicates */
260	}
261	{ .mmi
262	add	A2 = A, LDA			/* column 1 */
263	shladd	A3 = LDA, 1, A			/* column 2 */
264	tbit.z	p6, p0 = N, 2			/* p6 = bit 2 of N is clear */
265	}
266	;;
267	{ .mmi
268	shladd	A4 = LDA, 1, A2			/* column 3 */
269	adds	B1 = 4 * SIZE, B		/* second-row write pointer */
270	mov	ar.ec  = 6
271	}
272	{ .mib
273	cmp.eq	p16, p0 = r0, r0		/* p16 = 1: enable the first stage */
274	shr	I = I, 1			/* I = (M + 1) >> 1 */
275	(p6)  br.cond.dpnt .L30			/* no 4-wide panel */
276	}
277	;;
278	{ .mmi
279	shladd	A = LDA, 2, A			/* A now points past these 4 columns */
280	nop	__LINE__
281	nop	__LINE__
282	}
283	{ .mmi
284	adds	I = -1, I
285	mov	COUNT = 0
286	adds	J = -1, J			/* NOTE(review): J is not read again after this point */
287	}
288	;;
289	{ .mmi
290	adds	PREA =  PREFETCHSIZE * SIZE, A
291	adds	PREB = WPREFETCHSIZE * SIZE, B
292	mov	ar.lc = I
293	}
294	{ .mmi
295	mov	I  = M				/* rows left to load */
296	mov	II = M				/* rows left to store */
297	cmp.ne p14, p0 = r0, r0			/* p14 = 0 */
298	}
299	;;
300	.align 32
301

/* ---- .L22: pipelined copy of a 4-column panel, two rows per pass ------- */
302.L22:
303	{ .mmi
304	(p21) ST	[B ] = f37,  1 * SIZE
305	(p14) ST	[B1] = f49,  1 * SIZE
306	(p16) cmp.ne.unc p13, p0 = 1, I		/* p13 = a second row exists to load */
307	}
308	{ .mmi
309	lfetch.nt1	[PREA], LDA
310	lfetch.excl.nt1 [PREB], 8 * SIZE	/* a full pass writes 8 elements */
311	cmp.eq	p9, p0 = 4, COUNT		/* p9 = four passes since the last PREA re-base */
312	}
313	;;
314	{ .mmi
315	(p21) ST	[B ] = f43,  1 * SIZE
316	(p14) ST	[B1] = f55,  1 * SIZE
317	(p16) adds	I  = -2, I
318	}
319	{ .mmi
320	(p16) LD	f32  = [A1], SIZE
321	(p16) LD	f38  = [A2], SIZE
322	(p21) adds	II = -2, II
323	}
324	;;
325	{ .mmi
326	(p21) ST	[B ] = f61,  1 * SIZE
327	(p14) ST	[B1] = f73,  1 * SIZE
328	(p9) mov COUNT = 0
329	}
330	{ .mmi
331	(p13) LD	f44  = [A1], SIZE
332	(p13) LD	f50  = [A2], SIZE
333	nop   __LINE__
334	}
335	;;
336	{ .mmi
337	(p21) ST	[B ] = f67,  1 * SIZE
338	(p14) ST	[B1] = f79,  5 * SIZE	/* 3 + 5: B1 moves 8 elements per pass */
339	(p9) adds PREA =  PREFETCHSIZE * SIZE, A1	/* re-base PREA on column 0's read pointer */
340	}
341	{ .mmb
342	(p16) LD	f56  = [A3], SIZE
343	(p16) LD	f62  = [A4], SIZE
344	nop   __LINE__
345	}
346	;;
347	{ .mmi
348	(p13) LD	f68  = [A3], SIZE
349	(p13) LD	f74  = [A4], SIZE
350	(p16) adds	COUNT = 1, COUNT
351	}
352	{ .mmb
353	(p14) adds	B = 4 * SIZE, B		/* skip the second row that B1 just wrote */
354	(p20) cmp.ne.unc p14, p0 = 1, II	/* p14 for the next pass */
355	br.ctop.sptk.few .L22
356	}
357	;;
358	.align 32
359

/* ---- .L30: optional 2-column panel (bit 1 of N); no prefetching -------- */
360.L30:
361	{ .mmi
362	adds	I = 1, M
363	mov	A1 = A				/* column 0 */
364	mov	pr.rot = 0			/* clear the rotating (stage) predicates */
365	}
366	{ .mmi
367	add	A2 = A, LDA			/* column 1 */
368	adds	B1 = 2 * SIZE, B		/* second-row write pointer */
369	tbit.z	p6, p0 = N, 1			/* p6 = bit 1 of N is clear */
370	}
371	;;
372	{ .mmi
373	nop	__LINE__
374	nop	__LINE__
375	mov	ar.ec  = 6
376	}
377	{ .mib
378	cmp.eq	p16, p0 = r0, r0		/* p16 = 1: enable the first stage */
379	shr	I = I, 1			/* I = (M + 1) >> 1 */
380	(p6)  br.cond.dpnt .L40			/* no 2-wide panel */
381	}
382	;;
383	{ .mmi
384	adds	I = -1, I
385	;;
386	shladd	A = LDA, 1, A			/* A now points past these 2 columns */
387	mov	ar.lc = I
388	}
389	{ .mmi
390	mov	I  = M				/* rows left to load */
391	mov	II = M				/* rows left to store */
392	cmp.ne p14, p0 = r0, r0			/* p14 = 0 */
393	}
394	;;
395	.align 32
396

/* ---- .L32: pipelined copy of a 2-column panel, two rows per pass ------- */
397.L32:
398	{ .mmi
399	(p21) ST	[B ] = f37,  1 * SIZE
400	(p14) ST	[B1] = f49,  1 * SIZE
401	(p16) cmp.ne.unc p13, p0 = 1, I		/* p13 = a second row exists to load */
402	}
403	{ .mmi
404	nop	__LINE__
405	nop	__LINE__
406	(p21) adds	II = -2, II
407	}
408	;;
409	{ .mmi
410	(p21) ST	[B ] = f43,  1 * SIZE
411	(p14) ST	[B1] = f55,  3 * SIZE	/* 1 + 3: B1 moves 4 elements per pass */
412	nop	__LINE__
413	}
414	{ .mmi
415	(p16) LD	f32  = [A1], SIZE
416	(p16) LD	f38  = [A2], SIZE
417	nop	__LINE__
418	}
419	;;
420	{ .mmi
421	(p13) LD	f44  = [A1], SIZE
422	(p13) LD	f50  = [A2], SIZE
423	(p16) adds	I  = -2, I
424	}
425	{ .mmb
426	(p14) adds	B = 2 * SIZE, B		/* skip the second row that B1 just wrote */
427	(p20) cmp.ne.unc p14, p0 = 1, II	/* p14 for the next pass */
428	br.ctop.sptk.few .L32
429	}
430	;;
431	.align 32
432

/* ---- .L40: optional single column (bit 0 of N) ------------------------- */
433.L40:
434	{ .mmi
435	adds	I = 1, M
436	mov	A1 = A				/* the one remaining column */
437	mov	pr.rot = 0			/* clear the rotating (stage) predicates */
438	}
439	{ .mmi
440	tbit.z	p6, p0 = N, 0			/* p6 = bit 0 of N is clear */
441	}
442	;;
443	{ .mmi
444	nop	__LINE__
445	nop	__LINE__
446	mov	ar.ec  = 6
447	}
448	{ .mib
449	cmp.eq	p16, p0 = r0, r0		/* p16 = 1: enable the first stage */
450	shr	I = I, 1			/* I = (M + 1) >> 1 */
451	(p6)  br.cond.dpnt .L999		/* nothing left to copy */
452	}
453	;;
454	{ .mmi
455	adds	I = -1, I
456	;;
457	mov	ar.lc = I
458	}
459	{ .mmi
460	mov	I  = M				/* rows left to load */
461	mov	II = M				/* rows left to store */
462	cmp.ne p14, p0 = r0, r0			/* p14 = 0 */
463	}
464	;;
465	.align 32
466

/* ---- .L42: pipelined copy of one column, two elements per pass --------- */
/* Both stores go through B here, one after the other; B1 is not used.      */
467.L42:
468	{ .mmi
469	(p21) ST	[B ] = f37,  1 * SIZE
470	(p16) cmp.ne.unc p13, p0 = 1, I		/* p13 = a second element exists to load */
471	(p21) adds	II = -2, II
472	}
473	;;
474	{ .mmi
475	(p14) ST	[B ] = f49,  1 * SIZE
476	(p16) LD	f32  = [A1], SIZE
477	(p16) adds	I  = -2, I
478	}
479	;;
480	{ .mmb
481	(p13) LD	f44  = [A1], SIZE
482	(p20) cmp.ne.unc p14, p0 = 1, II	/* p14 for the next pass */
483	br.ctop.sptk.few .L42
484	}
485	;;
486	.align 32
487

/* ---- .L999: restore the saved predicates and loop counter, then return - */
488.L999:
489	mov pr    = PR, -1			/* mask -1: restore every predicate */
490	mov	 ar.lc = ARLC
491	br.ret.sptk.many b0
492	EPILOGUE
493
494