1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
#define ASSEMBLER
#include "common.h"

/*
 * GEMM operand packing (copy) kernel, 2-way unrolled, AVX encodings.
 * Packs an M x N panel of A (leading dimension LDA) into buffer B,
 * interleaving two LDA-strided rows at a time as 2x2-element tiles;
 * when N is odd, the leftover column is streamed into a tail region
 * starting (N & ~1) * M elements into B (pointer BO1).
 *
 * Register roles are assigned symbolically below, once per ABI, so the
 * body of the routine is ABI-independent.
 */
#ifndef WINDOWS_ABI

/* System V AMD64: all five arguments arrive in registers. */
#define M	ARG1	/* rdi */
#define N	ARG2	/* rsi */
#define A	ARG3	/* rdx */
#define LDA	ARG4	/* rcx */
#define B	ARG5	/* r8  */

#define I	%r10	/* inner loop counter (elements along a row)  */
#define J	%rbp	/* outer loop counter (row pairs, M / 2)      */

#define AO1	%r9	/* pointer into first row of current pair     */
#define AO2	%r15	/* pointer into second row of current pair    */
#define AO3	%r11	/* defined but not referenced in this file    */
#define AO4	%r14	/* defined but not referenced in this file    */
#define BO1	%r13	/* tail-area pointer for the odd last column  */
#define M8	%rbx	/* M * SIZE: one B row stride in bytes        */
#define BO	%rax	/* current destination pointer within B       */

#else

#define STACKSIZE 256

/* Microsoft x64: only four register arguments; the fifth (B) lives on
 * the caller's stack.  OLD_B offset = 32 (shadow space) + 8 (return
 * address) + 64 (eight pushes in the prologue) + STACKSIZE bytes. */
#define M	ARG1	/* rcx */
#define N	ARG2	/* rdx */
#define A	ARG3	/* r8  */
#define LDA	ARG4	/* r9  */
#define OLD_B		40 + 64 + STACKSIZE(%rsp)

#define B	%rdi

#define I	%r10	/* inner loop counter (elements along a row)  */
#define J	%r11	/* outer loop counter (row pairs, M / 2)      */

#define AO1	%r12	/* pointer into first row of current pair     */
#define AO2	%r13	/* pointer into second row of current pair    */
#define AO3	%r14	/* defined but not referenced in this file    */
#define AO4	%r15	/* defined but not referenced in this file    */

#define BO1	%rsi	/* tail-area pointer for the odd last column  */
#define M8	%rbp	/* M * SIZE: one B row stride in bytes        */
#define BO	%rax	/* current destination pointer within B       */

#endif
86
	PROLOGUE
	PROFCODE

#ifdef WINDOWS_ABI
	/* rdi/rsi are callee-saved under the Microsoft x64 ABI. */
	pushq	%rdi
	pushq	%rsi
#endif
	/* Callee-saved GPRs clobbered below (both ABIs). */
	pushq	%r15
	pushq	%r14
	pushq	%r13
	pushq	%r12
	pushq	%rbp
	pushq	%rbx

#ifdef WINDOWS_ABI
	subq	$STACKSIZE, %rsp

	/* xmm6-xmm15 are callee-saved under the Microsoft x64 ABI. */
	vmovups	%xmm6,    0(%rsp)
	vmovups	%xmm7,   16(%rsp)
	vmovups	%xmm8,   32(%rsp)
	vmovups	%xmm9,   48(%rsp)
	vmovups	%xmm10,  64(%rsp)
	vmovups	%xmm11,  80(%rsp)
	vmovups	%xmm12,  96(%rsp)
	vmovups	%xmm13, 112(%rsp)
	vmovups	%xmm14, 128(%rsp)
	vmovups	%xmm15, 144(%rsp)

	/* Fifth argument (B) comes from the caller's stack on Win64. */
	movq	OLD_B,     B
#endif

	/* BO1 = B + (N & ~1) * M elements: start of the tail area that
	   receives the odd last column when N is odd (.L14 / .L33). */
	movq	N,    %rax
	andq	$-2,  %rax
	imulq	M,    %rax

	leaq	(B, %rax, SIZE), BO1

	/* Convert strides to bytes: LDA *= SIZE, M8 = M * SIZE. */
	leaq	(,   LDA, SIZE), LDA
	leaq	(,   M,   SIZE), M8

	/* Outer loop runs over pairs of LDA-strided rows: J = M / 2. */
	movq	M,  J
	sarq	$1, J
	jle	.L20			/* M < 2: only the single-row tail */
	ALIGN_4
.L01:
	/* AO1/AO2 -> two consecutive LDA-strided rows; advance A by two. */
	movq	A, AO1
	leaq	(A,   LDA   ), AO2
	leaq	(A,   LDA, 2), A

	/* BO walks B in steps of 2 * M8 bytes; each row pair owns a
	   4-element slot per step, so the next pair's output region
	   starts 4 elements further into B. */
	movq	B, BO
	addq	$4 * SIZE, B

	/* Main unrolled loop: 8 elements of each row per iteration. */
	movq	N,  I
	sarq	$3, I
	jle	.L10
	ALIGN_4


.L08:
#ifndef DOUBLE

	/* Single precision: each vmovsd moves 8 bytes = two consecutive
	   floats (SIZE presumably 4 here, per common.h — confirm). */
	vmovsd	0 * SIZE(AO1), %xmm0
	vmovsd	2 * SIZE(AO1), %xmm2
	vmovsd	4 * SIZE(AO1), %xmm4
	vmovsd	6 * SIZE(AO1), %xmm6
	vmovsd	0 * SIZE(AO2), %xmm1
	vmovsd	2 * SIZE(AO2), %xmm3
	vmovsd	4 * SIZE(AO2), %xmm5
	vmovsd	6 * SIZE(AO2), %xmm7

	/* Store one 2x2 tile (2 elems of row 0, then 2 of row 1), then
	   hop 2 * M elements ahead to the tile for the next pair of
	   elements along the rows. */
	vmovsd	%xmm0,    0 * SIZE(BO)
	vmovsd	%xmm1,    2 * SIZE(BO)
	leaq	(BO, M8, 2), BO

	vmovsd	%xmm2,    0 * SIZE(BO)
	vmovsd	%xmm3,    2 * SIZE(BO)
	leaq	(BO, M8, 2), BO

	vmovsd	%xmm4,    0 * SIZE(BO)
	vmovsd	%xmm5,    2 * SIZE(BO)
	leaq	(BO, M8, 2), BO

	vmovsd	%xmm6,    0 * SIZE(BO)
	vmovsd	%xmm7,    2 * SIZE(BO)
	leaq	(BO, M8, 2), BO


#else

	/* Double precision: each vmovups moves 16 bytes = two doubles.
	   Unaligned forms are used throughout; no alignment assumed. */
	prefetchnta	256(AO1)
	prefetchnta	256(AO2)
	vmovups	0 * SIZE(AO1), %xmm0
	vmovups	2 * SIZE(AO1), %xmm2
	vmovups	4 * SIZE(AO1), %xmm4
	vmovups	6 * SIZE(AO1), %xmm6
	vmovups	0 * SIZE(AO2), %xmm1
	vmovups	2 * SIZE(AO2), %xmm3
	vmovups	4 * SIZE(AO2), %xmm5
	vmovups	6 * SIZE(AO2), %xmm7

	/* Same 2x2-tile store pattern as the single-precision path. */
	vmovups	%xmm0,    0 * SIZE(BO)
	vmovups	%xmm1,    2 * SIZE(BO)
	leaq	(BO, M8, 2), BO

	vmovups	%xmm2,    0 * SIZE(BO)
	vmovups	%xmm3,    2 * SIZE(BO)
	leaq	(BO, M8, 2), BO

	vmovups	%xmm4,    0 * SIZE(BO)
	vmovups	%xmm5,    2 * SIZE(BO)
	leaq	(BO, M8, 2), BO

	vmovups	%xmm6,    0 * SIZE(BO)
	vmovups	%xmm7,    2 * SIZE(BO)
	leaq	(BO, M8, 2), BO

#endif

	addq	$8 * SIZE, AO1
	addq	$8 * SIZE, AO2
	decq	I
	jg	.L08
	ALIGN_4


.L10:
	/* Remainder: 4 elements of each row, if bit 2 of N is set. */
	testq	$4, N
	jle	.L12
#ifndef DOUBLE

	vmovsd	0 * SIZE(AO1), %xmm0
	vmovsd	2 * SIZE(AO1), %xmm2
	vmovsd	0 * SIZE(AO2), %xmm1
	vmovsd	2 * SIZE(AO2), %xmm3

	vmovsd	%xmm0,    0 * SIZE(BO)
	vmovsd	%xmm1,    2 * SIZE(BO)
	leaq	(BO, M8, 2), BO

	vmovsd	%xmm2,    0 * SIZE(BO)
	vmovsd	%xmm3,    2 * SIZE(BO)
	leaq	(BO, M8, 2), BO


#else

	vmovups	0 * SIZE(AO1), %xmm0
	vmovups	2 * SIZE(AO1), %xmm2
	vmovups	0 * SIZE(AO2), %xmm1
	vmovups	2 * SIZE(AO2), %xmm3

	vmovups	%xmm0,    0 * SIZE(BO)
	vmovups	%xmm1,    2 * SIZE(BO)
	leaq	(BO, M8, 2), BO

	vmovups	%xmm2,    0 * SIZE(BO)
	vmovups	%xmm3,    2 * SIZE(BO)
	leaq	(BO, M8, 2), BO

#endif

	addq	$4 * SIZE, AO1
	addq	$4 * SIZE, AO2
	ALIGN_4


.L12:
	/* Remainder: 2 elements of each row, if bit 1 of N is set. */
	testq	$2, N
	jle	.L14
#ifndef DOUBLE
	vmovsd	0 * SIZE(AO1), %xmm0
	vmovsd	0 * SIZE(AO2), %xmm1

	vmovsd	%xmm0,    0 * SIZE(BO)
	vmovsd	%xmm1,    2 * SIZE(BO)
#else
	vmovups	0 * SIZE(AO1), %xmm0
	vmovups	0 * SIZE(AO2), %xmm1

	vmovups	%xmm0,    0 * SIZE(BO)
	vmovups	%xmm1,    2 * SIZE(BO)
#endif

	leaq	(BO, M8, 2), BO
	addq	$2 * SIZE, AO1
	addq	$2 * SIZE, AO2
	ALIGN_4

.L14:
	/* N odd: the last element of each of the two rows goes to the
	   BO1 tail area, packed contiguously (two elements total). */
	testq	$1, N
	jle	.L19

#ifndef DOUBLE
	vmovss	0 * SIZE(AO1), %xmm0
	vmovss	0 * SIZE(AO2), %xmm1

	vmovss	%xmm0,    0 * SIZE(BO1)
	vmovss	%xmm1,    1 * SIZE(BO1)
#else
	/* Merge the two doubles into one xmm for a single 16B store. */
	vmovsd	0 * SIZE(AO1), %xmm0
	vmovhpd	0 * SIZE(AO2), %xmm0 , %xmm0

	vmovups	%xmm0,    0 * SIZE(BO1)
#endif

	addq	$2 * SIZE, BO1
	ALIGN_4

.L19:
	/* Next row pair. */
	decq	J
	jg	.L01
	ALIGN_4

.L20:
	/* M odd: pack the one remaining row. */
	testq	$1, M
	jle	.L999
	ALIGN_4

.L31:
	movq	A, AO1
	movq	B, BO

	/* Two elements per step, same 2 * M8 destination stride as the
	   paired loop (the row-1 slot of each tile stays unwritten). */
	movq	N,  I
	sarq	$1, I
	jle	.L33
	ALIGN_4

.L32:
#ifndef DOUBLE
	vmovsd	0 * SIZE(AO1), %xmm0	/* two floats  */
	vmovsd	%xmm0,    0 * SIZE(BO)
#else
	vmovups	0 * SIZE(AO1), %xmm0	/* two doubles */
	vmovups	%xmm0,    0 * SIZE(BO)
#endif

	addq	$2 * SIZE, AO1
	leaq	(BO, M8, 2), BO
	decq	I
	jg	.L32
	ALIGN_4

.L33:
	/* N odd as well: final lone element goes to the tail area. */
	testq	$1, N
	jle	.L999

#ifndef DOUBLE
	vmovss	0 * SIZE(AO1), %xmm0
	vmovss	%xmm0,    0 * SIZE(BO1)
#else
	vmovsd	0 * SIZE(AO1), %xmm0
	vmovsd	%xmm0,    0 * SIZE(BO1)
#endif
	addq	$1 * SIZE, BO1
	ALIGN_4

.L999:
#ifdef WINDOWS_ABI
	/* Restore the callee-saved xmm registers (Win64 only). */
	vmovups	  0(%rsp), %xmm6
	vmovups	 16(%rsp), %xmm7
	vmovups	 32(%rsp), %xmm8
	vmovups	 48(%rsp), %xmm9
	vmovups	 64(%rsp), %xmm10
	vmovups	 80(%rsp), %xmm11
	vmovups	 96(%rsp), %xmm12
	vmovups	112(%rsp), %xmm13
	vmovups	128(%rsp), %xmm14
	vmovups	144(%rsp), %xmm15

	addq	$STACKSIZE, %rsp
#endif

	/* Restore callee-saved GPRs in reverse push order. */
	popq	%rbx
	popq	%rbp
	popq	%r12
	popq	%r13
	popq	%r14
	popq	%r15
#ifdef WINDOWS_ABI
	popq	%rsi
	popq	%rdi
#endif

	ret

	EPILOGUE
375