1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
/* Incoming arguments, read from the SPARC "in" registers %i0-%i4.      */
/* The roles below are taken from how the routine uses each register:   */
/*   M    - per-column element count (loops run M >> 2 and M & 3 times) */
/*   N    - column count (processed as N >> 2 groups, then N & 2, N & 1) */
/*   A    - source pointer; advanced past each group of columns         */
/*   LDA  - distance between consecutive columns, in elements on entry; */
/*          shifted left by BASE_SHIFT once at the start of the routine */
/*   B    - destination pointer; advanced as elements are stored        */
/* NOTE(review): the C-level prototype is not visible here - confirm    */
/* argument order and types against the caller.                         */
42#define M	%i0
43#define N	%i1
44#define A	%i2
45#define LDA	%i3
46#define B	%i4
47
/* Read pointers for up to four source columns of the current group.    */
48#define A1	%l0
49#define A2	%l1
50#define A3	%l2
51#define A4	%l3
52
/* Loop counters: I counts along a column, J counts column groups.      */
53#define I	%l4
54#define J	%l5
55
/* Sixteen floating-point staging registers.                            */
/* With DOUBLE defined they map to the even-numbered registers          */
/* %f0..%f30; otherwise to the consecutive registers %f0..%f15.         */
/* In the four-column loop c01-c04 receive elements loaded via A1,      */
/* c05-c08 via A2, c09-c12 via A3 and c13-c16 via A4.                   */
56#ifdef DOUBLE
57#define c01	%f0
58#define c02	%f2
59#define c03	%f4
60#define c04	%f6
61#define c05	%f8
62#define c06	%f10
63#define c07	%f12
64#define c08	%f14
65#define c09	%f16
66#define c10	%f18
67#define c11	%f20
68#define c12	%f22
69#define c13	%f24
70#define c14	%f26
71#define c15	%f28
72#define c16	%f30
73#else
74#define c01	%f0
75#define c02	%f1
76#define c03	%f2
77#define c04	%f3
78#define c05	%f4
79#define c06	%f5
80#define c07	%f6
81#define c08	%f7
82#define c09	%f8
83#define c10	%f9
84#define c11	%f10
85#define c12	%f11
86#define c13	%f12
87#define c14	%f13
88#define c15	%f14
89#define c16	%f15
90#endif
91
/*
 * Copy routine: reads elements from A, whose columns are LDA elements
 * apart, and writes them to B contiguously so that elements with the
 * same index taken from the columns of one group end up adjacent.
 *
 * Order of the data written to B:
 *   1. For each of the (N >> 2) groups of four columns, for every
 *      element index i: col0[i], col1[i], col2[i], col3[i].
 *   2. If (N & 2) is set, for the next two columns, for every i:
 *      col0[i], col1[i].
 *   3. If (N & 1) is set, for the last column, for every i: col0[i].
 *
 * Within each group the M elements are handled as (M >> 2) unrolled
 * iterations of four elements followed by (M & 3) single-element
 * iterations.  Every loop is guarded by a "count <= 0" test, so
 * non-positive counts copy nothing.
 *
 * Zero is placed in %o0 in the delay slot of the final return.
 *
 * PROLOGUE, SAVESP, EPILOGUE, LDF, STF, SIZE and BASE_SHIFT come from
 * common.h and are not visible here; presumably LDF/STF are the
 * precision-appropriate FP load/store, SIZE is the element size in
 * bytes and BASE_SHIFT is log2(SIZE) - TODO confirm.
 *
 * NOTE(review): counts are derived with sra and tested through %icc,
 * both of which work on the low 32 bits - confirm M and N never exceed
 * that range.
 *
 * Conditional branches here are written without the annul flag, so the
 * instruction following each branch (its delay slot) executes whether
 * or not the branch is taken.
 */
92	PROLOGUE
93	SAVESP
94
/* J = number of four-column groups; skip the section if there are none. */
95	sra	N, 2, J
96	cmp	J, 0
97	ble,pn	%icc, .LL100
/* Delay slot, always executed: scale LDA by BASE_SHIFT once for the    */
/* whole routine (used below as a byte offset between columns).         */
98	sll	LDA, BASE_SHIFT, LDA
99
/* ---- Four columns at a time ---------------------------------------- */
/* Set A1..A4 to the four columns of this group and I = M >> 2.         */
100.LL11:
101	add	A,  LDA, A2
102	mov	A,  A1
103	add	A2, LDA, A3
104	sra	M, 2, I
105	add	A3, LDA, A4
106	cmp	I, 0
107
108	ble,pn	%icc, .LL15
/* Delay slot: advance A past the four columns, ready for the next group. */
109	add	A4, LDA, A
110
/* Prefetch distances, in elements, ahead of the read and write pointers. */
111#define  PREFETCHSIZE 36
112#define WPREFETCHSIZE 20
113
/* Unrolled body: load a 4x4 tile (four elements from each column),     */
/* prefetching each source column with function code 0 (read).          */
114.LL12:
115	prefetch [A1 + (PREFETCHSIZE + 0) * SIZE], 0
116	LDF	[A1 +  0 * SIZE], c01
117	LDF	[A2 +  0 * SIZE], c05
118	LDF	[A3 +  0 * SIZE], c09
119	LDF	[A4 +  0 * SIZE], c13
120
121	prefetch [A2 + (PREFETCHSIZE + 0) * SIZE], 0
122	LDF	[A1 +  1 * SIZE], c02
123	LDF	[A2 +  1 * SIZE], c06
124	LDF	[A3 +  1 * SIZE], c10
125	LDF	[A4 +  1 * SIZE], c14
126
127	prefetch [A3 + (PREFETCHSIZE + 0) * SIZE], 0
128	LDF	[A1 +  2 * SIZE], c03
129	LDF	[A2 +  2 * SIZE], c07
130	LDF	[A3 +  2 * SIZE], c11
131	LDF	[A4 +  2 * SIZE], c15
132
133	prefetch [A4 + (PREFETCHSIZE + 0) * SIZE], 0
134	LDF	[A1 +  3 * SIZE], c04
135	LDF	[A2 +  3 * SIZE], c08
136	LDF	[A3 +  3 * SIZE], c12
137	LDF	[A4 +  3 * SIZE], c16
138
/* Store the tile to B with the four columns interleaved per element    */
/* index.  Pointer bumps, the counter decrement and the compare are     */
/* interleaved with the stores; the condition codes set by cmp are      */
/* consumed by the bg at the bottom of the loop.                        */
/* Destination prefetch uses function code 2 (write).                   */
139	prefetch [B + (WPREFETCHSIZE + 0) * SIZE], 2
140	STF	c01, [B +  0 * SIZE]
141	add	A1,  4 * SIZE, A1
142	STF	c05, [B +  1 * SIZE]
143	add	A2,  4 * SIZE, A2
144	STF	c09, [B +  2 * SIZE]
145	add	A3,  4 * SIZE, A3
146	STF	c13, [B +  3 * SIZE]
147	add	A4,  4 * SIZE, A4
148	STF	c02, [B +  4 * SIZE]
149	add	I, -1, I
150	STF	c06, [B +  5 * SIZE]
151	cmp	I, 0
152	STF	c10, [B +  6 * SIZE]
153	STF	c14, [B +  7 * SIZE]
/* With DOUBLE, a second destination prefetch is issued 8 elements      */
/* further on (presumably because the 16-element block spans more       */
/* bytes than in single precision - TODO confirm).                      */
154#ifdef DOUBLE
155	prefetch [B + (WPREFETCHSIZE + 8) * SIZE], 2
156#endif
157	STF	c03, [B +  8 * SIZE]
158	STF	c07, [B +  9 * SIZE]
159	STF	c11, [B + 10 * SIZE]
160	STF	c15, [B + 11 * SIZE]
161	STF	c04, [B + 12 * SIZE]
162	STF	c08, [B + 13 * SIZE]
163	STF	c12, [B + 14 * SIZE]
164	STF	c16, [B + 15 * SIZE]
165	bg,pt	%icc, .LL12
/* Delay slot: B advances by the 16 elements just written.              */
166	add	B, 16 * SIZE, B
167
/* Tail of the four-column group: the remaining M & 3 elements.         */
168.LL15:
169	and	M, 3, I
170	cmp	I, 0
171	ble,pn	%icc, .LL99
172	nop
173
/* One element from each of the four columns per iteration.             */
174.LL16:
175	LDF	[A1 +  0 * SIZE], c01
176	add	A1,  1 * SIZE, A1
177	LDF	[A2 +  0 * SIZE], c05
178	add	A2,  1 * SIZE, A2
179	LDF	[A3 +  0 * SIZE], c09
180	add	A3,  1 * SIZE, A3
181	LDF	[A4 +  0 * SIZE], c13
182	add	A4,  1 * SIZE, A4
183
184	STF	c01, [B +  0 * SIZE]
185	add	I, -1, I
186	STF	c05, [B +  1 * SIZE]
187	cmp	I, 0
188	STF	c09, [B +  2 * SIZE]
189	STF	c13, [B +  3 * SIZE]
190	bg,pt	%icc, .LL16
/* Delay slot: B advances by the 4 elements just written.               */
191	add	B,   4 * SIZE, B
192
/* Next group of four columns, if any remain.                           */
193.LL99:
194	add	J, -1, J
195	cmp	J, 0
196	bg,pt	%icc, .LL11
197	nop
198
/* ---- Two columns (taken only when bit 1 of N is set) --------------- */
199.LL100:
200	and	N, 2, J
201	cmp	J, 0
202	ble,pn	%icc, .LL200
203	nop
204
/* Set A1/A2 to the two columns and I = M >> 2.                         */
205.LL111:
206	sra	M, 2, I
207	add	A,  LDA, A2
208	cmp	I, 0
209	mov	A,  A1
210
211	ble,pn	%icc, .LL115
/* Delay slot: advance A past the two columns.                          */
212	add	A2, LDA, A
213
/* Unrolled body: four elements from each of the two columns, stored    */
/* to B as alternating A1/A2 elements.                                  */
214.LL112:
215	LDF	[A1 +  0 * SIZE], c01
216	LDF	[A2 +  0 * SIZE], c05
217	LDF	[A1 +  1 * SIZE], c02
218	LDF	[A2 +  1 * SIZE], c06
219
220	LDF	[A1 +  2 * SIZE], c03
221	LDF	[A2 +  2 * SIZE], c07
222	LDF	[A1 +  3 * SIZE], c04
223	LDF	[A2 +  3 * SIZE], c08
224
225	STF	c01, [B +  0 * SIZE]
226	add	A1,  4 * SIZE, A1
227	STF	c05, [B +  1 * SIZE]
228	add	A2,  4 * SIZE, A2
229	STF	c02, [B +  2 * SIZE]
230	add	I, -1, I
231	STF	c06, [B +  3 * SIZE]
232	cmp	I, 0
233	STF	c03, [B +  4 * SIZE]
234	STF	c07, [B +  5 * SIZE]
235	STF	c04, [B +  6 * SIZE]
236	STF	c08, [B +  7 * SIZE]
237
238	bg,pt	%icc, .LL112
/* Delay slot: B advances by the 8 elements just written.               */
239	add	B,   8 * SIZE, B
240
/* Tail of the two-column section: the remaining M & 3 elements.        */
241.LL115:
242	and	M, 3, I
243	cmp	I, 0
244	ble,pn	%icc, .LL200
245	nop
246
/* One element from each of the two columns per iteration.              */
247.LL116:
248	LDF	[A1 +  0 * SIZE], c01
249	add	A1,  1 * SIZE, A1
250	add	I, -1, I
251	LDF	[A2 +  0 * SIZE], c05
252	add	A2,  1 * SIZE, A2
253	cmp	I, 0
254
255	STF	c01, [B +  0 * SIZE]
256	STF	c05, [B +  1 * SIZE]
257	bg,pt	%icc, .LL116
/* Delay slot: B advances by the 2 elements just written.               */
258	add	B,   2 * SIZE, B
259
/* ---- Single column (taken only when bit 0 of N is set) ------------- */
260.LL200:
261	and	N, 1, J
262	cmp	J, 0
263	ble,pn	%icc, .LL999
264	nop
265
266.LL211:
267	sra	M, 2, I
268	cmp	I, 0
269	ble,pn	%icc, .LL215
/* Delay slot: A1 = start of the last column (needed on both paths).    */
270	mov	A,  A1
271
/* Unrolled body: straight copy of four consecutive elements.           */
272.LL212:
273	LDF	[A1 +  0 * SIZE], c01
274	LDF	[A1 +  1 * SIZE], c02
275	LDF	[A1 +  2 * SIZE], c03
276	LDF	[A1 +  3 * SIZE], c04
277
278	STF	c01, [B +  0 * SIZE]
279	add	I, -1, I
280	STF	c02, [B +  1 * SIZE]
281	cmp	I, 0
282	STF	c03, [B +  2 * SIZE]
283	add	A1,  4 * SIZE, A1
284	STF	c04, [B +  3 * SIZE]
285
286	bg,pt	%icc, .LL212
/* Delay slot: B advances by the 4 elements just written.               */
287	add	B,   4 * SIZE, B
288
/* Tail of the single-column section: the remaining M & 3 elements.     */
289.LL215:
290	and	M, 3, I
291	cmp	I, 0
292	ble,pn	%icc, .LL999
293	nop
294
/* One element per iteration.                                           */
295.LL216:
296	LDF	[A1 +  0 * SIZE], c01
297	add	A1,  1 * SIZE, A1
298	add	I, -1, I
299	cmp	I, 0
300
301	STF	c01, [B +  0 * SIZE]
302	bg,pt	%icc, .LL216
/* Delay slot: B advances by the 1 element just written.                */
303	add	B,   1 * SIZE, B
304
/* Return to the caller; the clr in the delay slot sets %o0 to zero.    */
305.LL999:
306	return	%i7 + 8
307	clr	%o0
308
309	EPILOGUE
310