1/*********************************************************************/
2/*                                                                   */
3/*             Optimized BLAS libraries                              */
4/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
5/*                                                                   */
6/* Copyright (c) The University of Texas, 2009. All rights reserved. */
7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
13/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
14/* Under no circumstances shall University be liable for incidental, */
15/* special, indirect, direct or consequential damages or loss of     */
16/* profits, interruption of business, or related expenses which may  */
17/* arise from use of Software or Documentation, including but not    */
18/* limited to those resulting from defects in Software and/or        */
19/* Documentation, or loss or inaccuracy of data of any kind.         */
20/*********************************************************************/
21
#define ASSEMBLER
#include "common.h"

/* float casum(BLASLONG n, float *x, BLASLONG incx)
 *
 * Sum of absolute values of a complex single-precision vector:
 * result = sum_{i<n} |Re(x[i])| + |Im(x[i])|, returned in %xmm0.
 *
 * SysV AMD64 ABI; argument registers come from common.h macros.   */

#define M	ARG1	/* rdi */
#define X	ARG2	/* rsi */
#define INCX	ARG3	/* rdx */

#define I	%rax	/* loop trip counter */

#include "l1param.h"

	PROLOGUE
	PROFCODE

	SAVEREGISTERS

	/* xmm0 is both the first partial accumulator and the return
	 * value; zero it before the early-exit checks so n <= 0 or
	 * incx <= 0 returns 0.0.                                      */
	pxor	%xmm0, %xmm0
	testq	M, M
	jle	.L999
	testq	INCX, INCX
	jle	.L999

	/* Four independent partial sums (xmm0-xmm3) to hide addps
	 * latency in the main loop.                                   */
	pxor	%xmm1, %xmm1
	pxor	%xmm2, %xmm2
	pxor	%xmm3, %xmm3

	/* Build 0x7FFFFFFF in every dword of xmm15: all-ones then a
	 * logical right shift by 1.  andps with this mask clears the
	 * sign bit, i.e. fabsf on all four lanes at once.             */
	pcmpeqb	%xmm15, %xmm15
	psrld	$1, %xmm15

	/* Scale the stride from complex elements to bytes.            */
	salq	$ZBASE_SHIFT, INCX

	/* Unit stride (one complex = 2*SIZE bytes)?  Otherwise take
	 * the strided path at .L100.                                  */
	cmpq	$2 * SIZE, INCX
	jne	.L100

	/* Bias X by +32*SIZE so the loop body can address operands
	 * with small negative displacements (-32*SIZE .. -4*SIZE).    */
	subq	$-32 * SIZE, X
	/* From here on, M counts FLOATS, not complex elements.        */
	addq	M, M

	/* Too few floats to bother aligning; finish in the tails.     */
	cmpq	$3, M
	jle	.L18

	/* Alignment peeling: consume 1 float if X is not 8-byte
	 * aligned...                                                  */
	testq	$4, X
	je	.L05
	movss	-32 * SIZE(X), %xmm0	/* xmm0 was zero, so plain overwrite is fine */
	andps	%xmm15, %xmm0		/* |x| */
	addq	$SIZE, X
	decq	M
	jle	.L998
	ALIGN_3

.L05:
	/* ...then 2 more floats if X is not 16-byte aligned, so the
	 * main loop can use aligned movaps loads.                     */
	testq	$8, X
	je	.L10

#ifdef movsd
	/* When movsd is macro-substituted (e.g. by movlps, which does
	 * not zero the upper lanes), clear the register first.        */
	xorps	%xmm1, %xmm1
#endif
	movsd	-32 * SIZE(X), %xmm1
	andps	%xmm15, %xmm1
	addq	$2 * SIZE, X
	subq	$2, M
	jle	.L998
	ALIGN_3

.L10:
	/* Main unit-stride loop: 32 floats (16 complex) / iteration.  */
	movq	M,  I
	sarq	$5, I
	jle	.L14

	/* Software pipelining: preload the first 32 floats so each
	 * loop iteration overlaps this iteration's abs/add with the
	 * next iteration's loads.                                     */
	movaps	-32 * SIZE(X), %xmm4
	movaps	-28 * SIZE(X), %xmm5
	movaps	-24 * SIZE(X), %xmm6
	movaps	-20 * SIZE(X), %xmm7

	movaps	-16 * SIZE(X), %xmm8
	movaps	-12 * SIZE(X), %xmm9
	movaps	 -8 * SIZE(X), %xmm10
	movaps	 -4 * SIZE(X), %xmm11
	decq	I
	jle	.L12
	ALIGN_3

.L11:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	/* abs + accumulate the preloaded vector, then immediately
	 * reload that register from the NEXT 32-float chunk (the
	 * displacements 0..28*SIZE are ahead of the biased pointer). */
	andps	%xmm15, %xmm4
	addps	%xmm4,  %xmm0
	movaps	  0 * SIZE(X), %xmm4

	andps	%xmm15, %xmm5
	addps	%xmm5,  %xmm1
	movaps	  4 * SIZE(X), %xmm5

	andps	%xmm15, %xmm6
	addps	%xmm6,  %xmm2
	movaps	  8 * SIZE(X), %xmm6

	andps	%xmm15, %xmm7
	addps	%xmm7,  %xmm3
	movaps	 12 * SIZE(X), %xmm7

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
#endif

	andps	%xmm15, %xmm8
	addps	%xmm8,  %xmm0
	movaps	 16 * SIZE(X), %xmm8

	andps	%xmm15, %xmm9
	addps	%xmm9,  %xmm1
	movaps	 20 * SIZE(X), %xmm9

	andps	%xmm15, %xmm10
	addps	%xmm10, %xmm2
	movaps	 24 * SIZE(X), %xmm10

	andps	%xmm15, %xmm11
	addps	%xmm11, %xmm3
	movaps	 28 * SIZE(X), %xmm11

	subq	$-32 * SIZE, X		/* advance one 32-float chunk */
	decq	I
	jg	.L11
	ALIGN_3

.L12:
	/* Pipeline drain: the last chunk is already loaded in
	 * xmm4-xmm11; accumulate it without issuing further loads.    */
	andps	%xmm15, %xmm4
	addps	%xmm4,  %xmm0
	andps	%xmm15, %xmm5
	addps	%xmm5,  %xmm1

	andps	%xmm15, %xmm6
	addps	%xmm6,  %xmm2
	andps	%xmm15, %xmm7
	addps	%xmm7,  %xmm3

	andps	%xmm15, %xmm8
	addps	%xmm8,  %xmm0
	andps	%xmm15, %xmm9
	addps	%xmm9,  %xmm1

	andps	%xmm15, %xmm10
	addps	%xmm10, %xmm2
	andps	%xmm15, %xmm11
	addps	%xmm11, %xmm3

	addq	$32 * SIZE, X
	ALIGN_3

.L14:
	/* Tail: handle the remaining M mod 32 floats in decreasing
	 * power-of-two groups (16, 8, 4, 2, 1).                       */
	testq	$31,  M
	jle	.L998

.L15:
	testq	$16, M
	je	.L16

	movaps	-32 * SIZE(X), %xmm4
	andps	%xmm15, %xmm4
	addps	%xmm4, %xmm0

	movaps	-28 * SIZE(X), %xmm5
	andps	%xmm15, %xmm5
	addps	%xmm5, %xmm1

	movaps	-24 * SIZE(X), %xmm4
	andps	%xmm15, %xmm4
	addps	%xmm4, %xmm0

	movaps	-20 * SIZE(X), %xmm5
	andps	%xmm15, %xmm5
	addps	%xmm5, %xmm1

	addq	$16 * SIZE, X
	ALIGN_3

.L16:
	testq	$8, M
	je	.L17

	movaps	-32 * SIZE(X), %xmm4
	andps	%xmm15, %xmm4
	addps	%xmm4, %xmm0

	movaps	-28 * SIZE(X), %xmm5
	andps	%xmm15, %xmm5
	addps	%xmm5, %xmm1

	addq	$8 * SIZE, X
	ALIGN_3

.L17:
	testq	$4, M
	je	.L18

	movaps	-32 * SIZE(X), %xmm6
	andps	%xmm15, %xmm6
	addps	%xmm6, %xmm2
	addq	$4 * SIZE, X
	ALIGN_3

.L18:
	testq	$2, M
	je	.L19

#ifdef movsd
	/* See .L05: clear upper lanes when movsd is a non-zeroing
	 * macro substitute.                                           */
	xorps	%xmm7, %xmm7
#endif
	movsd	-32 * SIZE(X), %xmm7
	andps	%xmm15, %xmm7
	addps	%xmm7, %xmm3
	addq	$2 * SIZE, X
	ALIGN_3

.L19:
	testq	$1, M
	je	.L998

	/* movss from memory zeroes the upper lanes, so the full-width
	 * addps only contributes the one loaded float.                */
	movss	-32 * SIZE(X), %xmm6
	andps	%xmm15, %xmm6
	addps	%xmm6, %xmm2
	jmp	.L998
	ALIGN_4

.L100:
	/* Strided path: M still counts COMPLEX elements here.
	 * Process 4 complex per iteration, packing two complex
	 * (movsd low / movhps high) into each xmm register.           */
	movq	M,  I
	sarq	$2, I
	jle	.L105
	ALIGN_4

.L101:
	movsd	(X), %xmm4
	addq	INCX, X
	movhps	(X), %xmm4
	addq	INCX, X

	andps	%xmm15, %xmm4
	addps	%xmm4,  %xmm0

	movsd	(X), %xmm5
	addq	INCX, X
	movhps	(X), %xmm5
	addq	INCX, X

	andps	%xmm15, %xmm5
	addps	%xmm5,  %xmm1

	decq	I
	jg	.L101
	ALIGN_4

.L105:
#ifdef movsd
	/* Clear once before the tail loop (sufficient: the loop only
	 * rewrites the low 64 bits of xmm4 each pass).                */
	xorps	%xmm4, %xmm4
#endif
	/* Remaining M mod 4 complex, one at a time.                   */
	andq	$3,  M
	jle	.L998
	ALIGN_4

.L106:
	movsd	(X), %xmm4
	andps	%xmm15, %xmm4
	addps	%xmm4, %xmm0
	addq	INCX, X
	decq	M
	jg	.L106
	ALIGN_4

.L998:
	/* Combine the four partial accumulators...                    */
	addps	%xmm1, %xmm0
	addps	%xmm3, %xmm2
	addps	%xmm2, %xmm0

	/* ...then reduce the 4 lanes of xmm0 to a scalar in lane 0.   */
#ifndef HAVE_SSE3
	movhlps	%xmm0, %xmm1		/* lanes 2,3 -> 0,1 */
	addps	%xmm1, %xmm0

	movaps	%xmm0, %xmm1
	shufps  $1, %xmm0, %xmm0	/* lane 1 -> lane 0 */
	addss	 %xmm1, %xmm0
#else
	haddps	%xmm0, %xmm0
	haddps	%xmm0, %xmm0
#endif
	ALIGN_4

.L999:
	RESTOREREGISTERS

	ret

	EPILOGUE
316