1 /*
2 * Copyright (c) 2005
3 * Eric Anholt. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27 #ifdef HAVE_CONFIG_H
28 #include "config.h"
29 #endif
30 #include <liboil/liboilclasses.h>
31 #include <liboil/liboilfunction.h>
32 #include <emmintrin.h>
33 #include <xmmintrin.h>
34
#ifdef HAVE_I386
/* On 32-bit x86 the incoming stack is not guaranteed to be 16-byte
 * aligned; force_align_arg_pointer (per GCC's function-attribute docs)
 * realigns the stack on entry so aligned SSE spills/stores are safe. */
#define SSE_FUNCTION __attribute__((force_align_arg_pointer))
#else
#define SSE_FUNCTION
#endif
40
41 SSE_FUNCTION static void
add_f32_sse(float * dest,float * src1,float * src2,int n)42 add_f32_sse (float *dest, float *src1, float *src2, int n)
43 {
44 /* Initial operations to align the destination pointer */
45 for (; ((long)dest & 15) && (n > 0); n--) {
46 *dest++ = *src1++ + *src2++;
47 }
48 for (; n >= 4; n -= 4) {
49 __m128 xmm0, xmm1;
50 xmm0 = _mm_loadu_ps(src1);
51 xmm1 = _mm_loadu_ps(src2);
52 xmm0 = _mm_add_ps(xmm0, xmm1);
53 _mm_store_ps(dest, xmm0);
54 dest += 4;
55 src1 += 4;
56 src2 += 4;
57 }
58 for (; n > 0; n--) {
59 *dest++ = *src1++ + *src2++;
60 }
61 }
62 OIL_DEFINE_IMPL_FULL (add_f32_sse, add_f32, OIL_IMPL_FLAG_SSE);
63
64 SSE_FUNCTION static void
add_f64_sse2(double * dest,double * src1,double * src2,int n)65 add_f64_sse2 (double *dest, double *src1, double *src2, int n)
66 {
67 __m128d xmm0, xmm1;
68 while (((long)dest & 15) && (0 < n)) {
69 *dest++ = *src1++ + *src2++;
70 n--;
71 }
72 while (1 < n) {
73 xmm0 = _mm_loadu_pd(src1);
74 xmm1 = _mm_loadu_pd(src2);
75 xmm0 = _mm_add_pd(xmm0, xmm1);
76 _mm_store_pd(dest, xmm0);
77 dest += 2;
78 src1 += 2;
79 src2 += 2;
80 n -= 2;
81 }
82 while (0 < n) {
83 *dest++ = *src1++ + *src2++;
84 n--;
85 }
86 }
87 OIL_DEFINE_IMPL_FULL (add_f64_sse2, add_f64, OIL_IMPL_FLAG_SSE2);
88
89 SSE_FUNCTION static void
add_f64_sse2_unroll(double * dest,double * src1,double * src2,int n)90 add_f64_sse2_unroll (double *dest, double *src1, double *src2, int n)
91 {
92 __m128d xmm0, xmm1;
93 while (((long)dest & 15) && (0 < n)) {
94 *dest++ = *src1++ + *src2++;
95 n--;
96 }
97 while (3 < n) {
98 xmm0 = _mm_loadu_pd(src1);
99 xmm1 = _mm_loadu_pd(src2);
100 xmm0 = _mm_add_pd(xmm0, xmm1);
101 _mm_store_pd(dest, xmm0);
102
103 xmm0 = _mm_loadu_pd(src1+2);
104 xmm1 = _mm_loadu_pd(src2+2);
105 xmm0 = _mm_add_pd(xmm0, xmm1);
106 _mm_store_pd(dest+2, xmm0);
107 dest += 4;
108 src1 += 4;
109 src2 += 4;
110 n -= 4;
111 }
112 while (1 < n) {
113 xmm0 = _mm_loadu_pd(src1);
114 xmm1 = _mm_loadu_pd(src2);
115 xmm0 = _mm_add_pd(xmm0, xmm1);
116 _mm_store_pd(dest, xmm0);
117 dest += 2;
118 src1 += 2;
119 src2 += 2;
120 n -= 2;
121 }
122 while (0 < n) {
123 *dest++ = *src1++ + *src2++;
124 n--;
125 }
126 }
127 OIL_DEFINE_IMPL_FULL (add_f64_sse2_unroll, add_f64, OIL_IMPL_FLAG_SSE2);
128
129 SSE_FUNCTION static void
subtract_f32_sse(float * dest,float * src1,float * src2,int n)130 subtract_f32_sse (float *dest, float *src1, float *src2, int n)
131 {
132 /* Initial operations to align the destination pointer */
133 for (; ((long)dest & 15) && (n > 0); n--) {
134 *dest++ = *src1++ - *src2++;
135 }
136 for (; n >= 4; n -= 4) {
137 __m128 xmm0, xmm1;
138 xmm0 = _mm_loadu_ps(src1);
139 xmm1 = _mm_loadu_ps(src2);
140 xmm0 = _mm_sub_ps(xmm0, xmm1);
141 _mm_store_ps(dest, xmm0);
142 dest += 4;
143 src1 += 4;
144 src2 += 4;
145 }
146 for (; n > 0; n--) {
147 *dest++ = *src1++ - *src2++;
148 }
149 }
150 OIL_DEFINE_IMPL_FULL (subtract_f32_sse, subtract_f32, OIL_IMPL_FLAG_SSE);
151
152 SSE_FUNCTION static void
multiply_f32_sse(float * dest,float * src1,float * src2,int n)153 multiply_f32_sse (float *dest, float *src1, float *src2, int n)
154 {
155 /* Initial operations to align the destination pointer */
156 for (; ((long)dest & 15) && (n > 0); n--) {
157 *dest++ = *src1++ * *src2++;
158 }
159 for (; n >= 4; n -= 4) {
160 __m128 xmm0, xmm1;
161 xmm0 = _mm_loadu_ps(src1);
162 xmm1 = _mm_loadu_ps(src2);
163 xmm0 = _mm_mul_ps(xmm0, xmm1);
164 _mm_store_ps(dest, xmm0);
165 dest += 4;
166 src1 += 4;
167 src2 += 4;
168 }
169 for (; n > 0; n--) {
170 *dest++ = *src1++ * *src2++;
171 }
172 }
173 OIL_DEFINE_IMPL_FULL (multiply_f32_sse, multiply_f32, OIL_IMPL_FLAG_SSE);
174
175 SSE_FUNCTION static void
divide_f32_sse(float * dest,float * src1,float * src2,int n)176 divide_f32_sse (float *dest, float *src1, float *src2, int n)
177 {
178 /* Initial operations to align the destination pointer */
179 for (; ((long)dest & 15) && (n > 0); n--) {
180 *dest++ = *src1++ / *src2++;
181 }
182 for (; n >= 4; n -= 4) {
183 __m128 xmm0, xmm1;
184 xmm0 = _mm_loadu_ps(src1);
185 xmm1 = _mm_loadu_ps(src2);
186 xmm0 = _mm_div_ps(xmm0, xmm1);
187 _mm_store_ps(dest, xmm0);
188 dest += 4;
189 src1 += 4;
190 src2 += 4;
191 }
192 for (; n > 0; n--) {
193 *dest++ = *src1++ / *src2++;
194 }
195 }
196 OIL_DEFINE_IMPL_FULL (divide_f32_sse, divide_f32, OIL_IMPL_FLAG_SSE);
197
198 SSE_FUNCTION static void
minimum_f32_sse(float * dest,float * src1,float * src2,int n)199 minimum_f32_sse (float *dest, float *src1, float *src2, int n)
200 {
201 /* Initial operations to align the destination pointer */
202 for (; ((long)dest & 15) && (n > 0); n--) {
203 *dest++ = *src1 < *src2 ? *src1 : *src2;
204 src1++;
205 src2++;
206 }
207 for (; n >= 4; n -= 4) {
208 __m128 xmm0, xmm1;
209 xmm0 = _mm_loadu_ps(src1);
210 xmm1 = _mm_loadu_ps(src2);
211 xmm0 = _mm_min_ps(xmm0, xmm1);
212 _mm_store_ps(dest, xmm0);
213 dest += 4;
214 src1 += 4;
215 src2 += 4;
216 }
217 for (; n > 0; n--) {
218 *dest++ = *src1 < *src2 ? *src1 : *src2;
219 src1++;
220 src2++;
221 }
222 }
223 OIL_DEFINE_IMPL_FULL (minimum_f32_sse, minimum_f32, OIL_IMPL_FLAG_SSE);
224
225 SSE_FUNCTION static void
maximum_f32_sse(float * dest,float * src1,float * src2,int n)226 maximum_f32_sse (float *dest, float *src1, float *src2, int n)
227 {
228 /* Initial operations to align the destination pointer */
229 for (; ((long)dest & 15) && (n > 0); n--) {
230 *dest++ = *src1 > *src2 ? *src1 : *src2;
231 src1++;
232 src2++;
233 }
234 for (; n >= 4; n -= 4) {
235 __m128 xmm0, xmm1;
236 xmm0 = _mm_loadu_ps(src1);
237 xmm1 = _mm_loadu_ps(src2);
238 xmm0 = _mm_max_ps(xmm0, xmm1);
239 _mm_store_ps(dest, xmm0);
240 dest += 4;
241 src1 += 4;
242 src2 += 4;
243 }
244 for (; n > 0; n--) {
245 *dest++ = *src1 > *src2 ? *src1 : *src2;
246 src1++;
247 src2++;
248 }
249 }
250 OIL_DEFINE_IMPL_FULL (maximum_f32_sse, maximum_f32, OIL_IMPL_FLAG_SSE);
251
252 SSE_FUNCTION static void
inverse_f32_sse(float * dest,float * src1,int n)253 inverse_f32_sse (float *dest, float *src1, int n)
254 {
255 /* Initial operations to align the destination pointer */
256 for (; ((long)dest & 15) && (n > 0); n--) {
257 *dest++ = 1.0 / *src1++;
258 }
259 for (; n >= 4; n -= 4) {
260 __m128 xmm0, xmm1;
261 /* While _mm_rcp_ps sounds promising, the results it gives are rather
262 * different from the 1.0 / src1 reference implementation, so do that.
263 */
264 xmm0 = _mm_set_ps1(1.0);
265 xmm1 = _mm_loadu_ps(src1);
266 xmm0 = _mm_div_ps(xmm0, xmm1);
267 _mm_store_ps(dest, xmm0);
268 dest += 4;
269 src1 += 4;
270 }
271 for (; n > 0; n--) {
272 *dest++ = 1.0 / *src1++;
273 }
274 }
275 OIL_DEFINE_IMPL_FULL (inverse_f32_sse, inverse_f32, OIL_IMPL_FLAG_SSE);
276
277 SSE_FUNCTION static void
negative_f32_sse(float * dest,float * src1,int n)278 negative_f32_sse (float *dest, float *src1, int n)
279 {
280 /* Initial operations to align the destination pointer */
281 for (; ((long)dest & 15) && (n > 0); n--) {
282 *dest++ = -(*src1++);
283 }
284 for (; n >= 4; n -= 4) {
285 __m128 xmm0, xmm1;
286 xmm0 = _mm_setzero_ps();
287 xmm1 = _mm_loadu_ps(src1);
288 xmm0 = _mm_sub_ps(xmm0, xmm1);
289 _mm_store_ps(dest, xmm0);
290 dest += 4;
291 src1 += 4;
292 }
293 for (; n > 0; n--) {
294 *dest++ = -(*src1++);
295 }
296 }
297 OIL_DEFINE_IMPL_FULL (negative_f32_sse, negative_f32, OIL_IMPL_FLAG_SSE);
298
299 SSE_FUNCTION static void
scalaradd_f32_ns_sse(float * dest,float * src1,float * val,int n)300 scalaradd_f32_ns_sse (float *dest, float *src1, float *val, int n)
301 {
302 __m128 xmm1;
303
304 /* Initial operations to align the destination pointer */
305 for (; ((long)dest & 15) && (n > 0); n--) {
306 *dest++ = *src1++ + *val;
307 }
308 xmm1 = _mm_load_ps1(val);
309 for (; n >= 4; n -= 4) {
310 __m128 xmm0;
311 xmm0 = _mm_loadu_ps(src1);
312 xmm0 = _mm_add_ps(xmm0, xmm1);
313 _mm_store_ps(dest, xmm0);
314 dest += 4;
315 src1 += 4;
316 }
317 for (; n > 0; n--) {
318 *dest++ = *src1++ + *val;
319 }
320 }
321 OIL_DEFINE_IMPL_FULL (scalaradd_f32_ns_sse, scalaradd_f32_ns, OIL_IMPL_FLAG_SSE);
322
323 SSE_FUNCTION static void
scalarmultiply_f32_ns_sse(float * dest,float * src1,float * val,int n)324 scalarmultiply_f32_ns_sse (float *dest, float *src1, float *val, int n)
325 {
326 __m128 xmm1;
327
328 /* Initial operations to align the destination pointer */
329 for (; ((long)dest & 15) && (n > 0); n--) {
330 *dest++ = *src1++ * *val;
331 }
332 xmm1 = _mm_load_ps1(val);
333 for (; n >= 4; n -= 4) {
334 __m128 xmm0;
335 xmm0 = _mm_loadu_ps(src1);
336 xmm0 = _mm_mul_ps(xmm0, xmm1);
337 _mm_store_ps(dest, xmm0);
338 dest += 4;
339 src1 += 4;
340 }
341 for (; n > 0; n--) {
342 *dest++ = *src1++ * *val;
343 }
344 }
345 OIL_DEFINE_IMPL_FULL (scalarmultiply_f32_ns_sse, scalarmultiply_f32_ns, OIL_IMPL_FLAG_SSE);
346
347 SSE_FUNCTION static void
scalarmultiply_f64_ns_sse2(double * dest,double * src1,double * val,int n)348 scalarmultiply_f64_ns_sse2 (double *dest, double *src1, double *val, int n)
349 {
350 __m128d xmm1;
351
352 /* Initial operations to align the destination pointer */
353 for (; ((long)dest & 15) && (n > 0); n--) {
354 *dest++ = *src1++ * *val;
355 }
356 xmm1 = _mm_load1_pd(val);
357 for (; n >= 2; n -= 2) {
358 __m128d xmm0;
359 xmm0 = _mm_loadu_pd(src1);
360 xmm0 = _mm_mul_pd(xmm0, xmm1);
361 _mm_store_pd(dest, xmm0);
362 dest += 2;
363 src1 += 2;
364 }
365 for (; n > 0; n--) {
366 *dest++ = *src1++ * *val;
367 }
368 }
369 OIL_DEFINE_IMPL_FULL (scalarmultiply_f64_ns_sse2, scalarmultiply_f64_ns, OIL_IMPL_FLAG_SSE2);
370
371