/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#define M	ARG1	/* rdi */
#define X	ARG2	/* rsi */
#define INCX	ARG3	/* rdx */

#define I	%rax

#include "l1param.h"

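/* Single-precision Euclidean norm kernel (SNRM2-style):            */
/* returns sqrt(sum(x[i]^2)) for a vector of length M with stride   */
/* INCX.  Each element is widened to double precision before it is  */
/* squared; the squares are accumulated in four partial sums        */
/* (%xmm0-%xmm3), the square root is taken in double precision, and */
/* the result is narrowed back to single precision in %xmm0.        */
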
	PROLOGUE
	PROFCODE

	SAVEREGISTERS

	pxor	%xmm0, %xmm0
	testq	M, M
	jle	.L999
	pxor	%xmm1, %xmm1
	testq	INCX, INCX
	jle	.L999

	pxor	%xmm2, %xmm2
	leaq	(, INCX, SIZE), INCX
	pxor	%xmm3, %xmm3
	cmpq	$SIZE, INCX
	jne	.L40

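/* Unit-stride path.  If X is not 8-byte aligned, square one        */
/* element first so the 8-byte movsd loads below stay aligned.      */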
	testq	$SIZE, X
	je	.L05

	movss	0 * SIZE(X), %xmm4
	cvtss2sd %xmm4, %xmm6
	mulsd	 %xmm6, %xmm6
	addsd	 %xmm6, %xmm3
	addq	INCX, X
	decq	M
	jle	.L998
	ALIGN_3

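/* Unit-stride main path: 8 elements per iteration.  .L05 preloads  */
/* the first 8 elements; .L10 converts float pairs to double with   */
/* cvtps2pd, squares and accumulates them while loading the next 8; */
/* .L12 drains the last preloaded batch.                            */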
.L05:
	movq	M,  I
	sarq	$3, I
	jle	.L14

	movsd	0 * SIZE(X), %xmm4
	movsd	2 * SIZE(X), %xmm5
	movsd	4 * SIZE(X), %xmm6
	movsd	6 * SIZE(X), %xmm7
	addq	$8 * SIZE, X
	decq	I
	jle	.L12
	ALIGN_3

.L10:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	cvtps2pd %xmm4, %xmm8
	cvtps2pd %xmm5, %xmm9
	cvtps2pd %xmm6, %xmm10
	cvtps2pd %xmm7, %xmm11

	movsd	0 * SIZE(X), %xmm4
	movsd	2 * SIZE(X), %xmm5
	movsd	4 * SIZE(X), %xmm6
	movsd	6 * SIZE(X), %xmm7

	mulpd	 %xmm8,  %xmm8
	mulpd	 %xmm9,  %xmm9
	mulpd	 %xmm10, %xmm10
	mulpd	 %xmm11, %xmm11

	addpd	 %xmm8,  %xmm0
	addpd	 %xmm9,  %xmm1
	addpd	 %xmm10, %xmm2
	addpd	 %xmm11, %xmm3

	addq	$8 * SIZE, X
	decq	I
	jg	.L10
	ALIGN_3

.L12:
	cvtps2pd %xmm4, %xmm8
	cvtps2pd %xmm5, %xmm9
	cvtps2pd %xmm6, %xmm10
	cvtps2pd %xmm7, %xmm11

	mulpd	 %xmm8,  %xmm8
	mulpd	 %xmm9,  %xmm9
	mulpd	 %xmm10, %xmm10
	mulpd	 %xmm11, %xmm11

	addpd	 %xmm8,  %xmm0
	addpd	 %xmm9,  %xmm1
	addpd	 %xmm10, %xmm2
	addpd	 %xmm11, %xmm3
	ALIGN_3

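/* Unit-stride remainder: handle 4, 2, and 1 leftover elements.     */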
.L14:
	testq	$4,  M
	je	.L15

	movsd	0 * SIZE(X), %xmm4
	movsd	2 * SIZE(X), %xmm5
	cvtps2pd %xmm4, %xmm6
	cvtps2pd %xmm5, %xmm7
	mulpd	 %xmm6, %xmm6
	mulpd	 %xmm7, %xmm7
	addpd	 %xmm6, %xmm0
	addpd	 %xmm7, %xmm1
	addq	$4 * SIZE, X
	ALIGN_3

.L15:
	testq	$2,  M
	je	.L16

	movsd	0 * SIZE(X), %xmm4
	cvtps2pd %xmm4, %xmm6
	mulpd	 %xmm6, %xmm6
	addpd	 %xmm6, %xmm2
	addq	$2 * SIZE, X
	ALIGN_3

.L16:
	testq	$1,  M
	je	.L998

	movss	0 * SIZE(X), %xmm4
	cvtss2sd %xmm4, %xmm6
	mulsd	 %xmm6, %xmm6
	addsd	 %xmm6, %xmm3
	jmp	.L998
	ALIGN_4

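/* Non-unit-stride path: load one element at a time, widen to       */
/* double, square, and accumulate.  .L41 processes 8 elements per   */
/* iteration; .L44/.L45/.L46 handle the 4/2/1 element remainders.   */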
.L40:
	movq	M,  I
	sarq	$3, I
	jle	.L44
	ALIGN_4

.L41:
	movss	(X), %xmm4
	addq	INCX, X
	movss	(X), %xmm5
	addq	INCX, X
	movss	(X), %xmm6
	addq	INCX, X
	movss	(X), %xmm7
	addq	INCX, X
	movss	(X), %xmm8
	addq	INCX, X
	movss	(X), %xmm9
	addq	INCX, X
	movss	(X), %xmm10
	addq	INCX, X
	movss	(X), %xmm11
	addq	INCX, X

	cvtss2sd %xmm4,  %xmm4
	cvtss2sd %xmm5,  %xmm5
	cvtss2sd %xmm6,  %xmm6
	cvtss2sd %xmm7,  %xmm7
	cvtss2sd %xmm8,  %xmm8
	cvtss2sd %xmm9,  %xmm9
	cvtss2sd %xmm10, %xmm10
	cvtss2sd %xmm11, %xmm11

	mulsd	 %xmm4,  %xmm4
	mulsd	 %xmm5,  %xmm5
	mulsd	 %xmm6,  %xmm6
	mulsd	 %xmm7,  %xmm7

	addsd	 %xmm4,  %xmm0
	addsd	 %xmm5,  %xmm1
	addsd	 %xmm6,  %xmm2
	addsd	 %xmm7,  %xmm3

	mulsd	 %xmm8,  %xmm8
	mulsd	 %xmm9,  %xmm9
	mulsd	 %xmm10, %xmm10
	mulsd	 %xmm11, %xmm11

	addsd	 %xmm8,  %xmm0
	addsd	 %xmm9,  %xmm1
	addsd	 %xmm10, %xmm2
	addsd	 %xmm11, %xmm3

	decq	I
	jg	.L41
	ALIGN_3

.L44:
	testq	$4,  M
	je	.L45

	movss	(X), %xmm4
	addq	INCX, X
	movss	(X), %xmm5
	addq	INCX, X
	movss	(X), %xmm6
	addq	INCX, X
	movss	(X), %xmm7
	addq	INCX, X

	cvtss2sd %xmm4, %xmm8
	cvtss2sd %xmm5, %xmm9
	cvtss2sd %xmm6, %xmm10
	cvtss2sd %xmm7, %xmm11

	mulsd	 %xmm8,  %xmm8
	mulsd	 %xmm9,  %xmm9
	mulsd	 %xmm10, %xmm10
	mulsd	 %xmm11, %xmm11

	addsd	 %xmm8,  %xmm0
	addsd	 %xmm9,  %xmm1
	addsd	 %xmm10, %xmm2
	addsd	 %xmm11, %xmm3
	ALIGN_3

.L45:
	testq	$2,  M
	je	.L46

	movss	(X), %xmm4
	addq	INCX, X
	movss	(X), %xmm5
	addq	INCX, X

	cvtss2sd %xmm4, %xmm6
	cvtss2sd %xmm5, %xmm7
	mulsd	 %xmm6, %xmm6
	mulsd	 %xmm7, %xmm7
	addsd	 %xmm6, %xmm1
	addsd	 %xmm7, %xmm2
	ALIGN_3

.L46:
	testq	$1,  M
	je	.L998

	movss	(X), %xmm4
	cvtss2sd %xmm4, %xmm6
	mulsd	 %xmm6, %xmm6
	addsd	 %xmm6, %xmm3
	ALIGN_4

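/* Combine the four partial sums, then reduce the packed double     */
/* accumulator horizontally (haddpd on SSE3, unpckhpd + addsd       */
/* otherwise).                                                      */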
.L998:
	addpd	%xmm1, %xmm0
	addpd	%xmm3, %xmm2
	addpd	%xmm2, %xmm0

#ifndef HAVE_SSE3
	movapd	%xmm0, %xmm1
	unpckhpd	%xmm0, %xmm0
	addsd	%xmm1, %xmm0
#else
	haddpd	%xmm0, %xmm0
#endif
	ALIGN_4

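/* Square root in double precision, then narrow the result back to  */
/* single precision; the value is returned in %xmm0.                */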
.L999:
	sqrtsd	%xmm0,  %xmm0

	cvtsd2ss %xmm0, %xmm0

	RESTOREREGISTERS

	ret

	EPILOGUE