/*
 * Copyright (c) 2005
 *	Eric Anholt.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <liboil/liboilclasses.h>
#include <liboil/liboilfunction.h>
#include <emmintrin.h>
#include <xmmintrin.h>

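/*
 * On i386 the incoming stack is only guaranteed 4-byte alignment, so GCC's
 * force_align_arg_pointer attribute is used to realign the stack on function
 * entry; otherwise any 16-byte SSE register spill could fault.  On x86-64
 * the ABI already guarantees 16-byte stack alignment, so no attribute is
 * needed.
 */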
#ifdef HAVE_I386
#define SSE_FUNCTION __attribute__((force_align_arg_pointer))
#else
#define SSE_FUNCTION
#endif

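/*
 * All of the loops below share the same shape: scalar iterations run first,
 * until dest reaches a 16-byte boundary; then a vector body uses unaligned
 * loads for the sources (which need not share dest's alignment) and aligned
 * stores for dest; finally a scalar tail handles the remainder.
 */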
SSE_FUNCTION static void
add_f32_sse (float *dest, float *src1, float *src2, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ + *src2++;
  }
  for (; n >= 4; n -= 4) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_loadu_ps(src1);
    xmm1 = _mm_loadu_ps(src2);
    xmm0 = _mm_add_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    dest += 4;
    src1 += 4;
    src2 += 4;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ + *src2++;
  }
}
OIL_DEFINE_IMPL_FULL (add_f32_sse, add_f32, OIL_IMPL_FLAG_SSE);
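
/*
 * Callers never invoke add_f32_sse directly; OIL_DEFINE_IMPL_FULL registers
 * it as one candidate implementation of the add_f32 class, and liboil
 * dispatches through the public entry point.  A minimal sketch of a caller:
 *
 *   #include <liboil/liboil.h>
 *
 *   oil_init ();                          // select implementations
 *   oil_add_f32 (dest, src1, src2, n);    // may land here if SSE is present
 */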

SSE_FUNCTION static void
add_f64_sse2 (double *dest, double *src1, double *src2, int n)
{
  __m128d xmm0, xmm1;
  while (((long)dest & 15) && (0 < n)) {
    *dest++ = *src1++ + *src2++;
    n--;
  }
  while (1 < n) {
    xmm0 = _mm_loadu_pd(src1);
    xmm1 = _mm_loadu_pd(src2);
    xmm0 = _mm_add_pd(xmm0, xmm1);
    _mm_store_pd(dest, xmm0);
    dest += 2;
    src1 += 2;
    src2 += 2;
    n -= 2;
  }
  while (0 < n) {
    *dest++ = *src1++ + *src2++;
    n--;
  }
}
OIL_DEFINE_IMPL_FULL (add_f64_sse2, add_f64, OIL_IMPL_FLAG_SSE2);

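/*
 * Unrolled variant of the above: two independent 2-double additions per
 * iteration, giving the CPU separate dependency chains to overlap.
 */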
SSE_FUNCTION static void
add_f64_sse2_unroll (double *dest, double *src1, double *src2, int n)
{
  __m128d xmm0, xmm1;
  while (((long)dest & 15) && (0 < n)) {
    *dest++ = *src1++ + *src2++;
    n--;
  }
  while (3 < n) {
    xmm0 = _mm_loadu_pd(src1);
    xmm1 = _mm_loadu_pd(src2);
    xmm0 = _mm_add_pd(xmm0, xmm1);
    _mm_store_pd(dest, xmm0);

    xmm0 = _mm_loadu_pd(src1+2);
    xmm1 = _mm_loadu_pd(src2+2);
    xmm0 = _mm_add_pd(xmm0, xmm1);
    _mm_store_pd(dest+2, xmm0);
    dest += 4;
    src1 += 4;
    src2 += 4;
    n -= 4;
  }
  while (1 < n) {
    xmm0 = _mm_loadu_pd(src1);
    xmm1 = _mm_loadu_pd(src2);
    xmm0 = _mm_add_pd(xmm0, xmm1);
    _mm_store_pd(dest, xmm0);
    dest += 2;
    src1 += 2;
    src2 += 2;
    n -= 2;
  }
  while (0 < n) {
    *dest++ = *src1++ + *src2++;
    n--;
  }
}
OIL_DEFINE_IMPL_FULL (add_f64_sse2_unroll, add_f64, OIL_IMPL_FLAG_SSE2);
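
/*
 * Both add_f64 implementations are registered for the same class; liboil
 * benchmarks the registered candidates at initialization time and installs
 * the fastest one that produces correct results.
 */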

SSE_FUNCTION static void
subtract_f32_sse (float *dest, float *src1, float *src2, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ - *src2++;
  }
  for (; n >= 4; n -= 4) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_loadu_ps(src1);
    xmm1 = _mm_loadu_ps(src2);
    xmm0 = _mm_sub_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    dest += 4;
    src1 += 4;
    src2 += 4;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ - *src2++;
  }
}
OIL_DEFINE_IMPL_FULL (subtract_f32_sse, subtract_f32, OIL_IMPL_FLAG_SSE);

SSE_FUNCTION static void
multiply_f32_sse (float *dest, float *src1, float *src2, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ * *src2++;
  }
  for (; n >= 4; n -= 4) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_loadu_ps(src1);
    xmm1 = _mm_loadu_ps(src2);
    xmm0 = _mm_mul_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    dest += 4;
    src1 += 4;
    src2 += 4;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ * *src2++;
  }
}
OIL_DEFINE_IMPL_FULL (multiply_f32_sse, multiply_f32, OIL_IMPL_FLAG_SSE);

SSE_FUNCTION static void
divide_f32_sse (float *dest, float *src1, float *src2, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ / *src2++;
  }
  for (; n >= 4; n -= 4) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_loadu_ps(src1);
    xmm1 = _mm_loadu_ps(src2);
    xmm0 = _mm_div_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    dest += 4;
    src1 += 4;
    src2 += 4;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ / *src2++;
  }
}
OIL_DEFINE_IMPL_FULL (divide_f32_sse, divide_f32, OIL_IMPL_FLAG_SSE);

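/*
 * The scalar fallback (a < b ? a : b) matches the SSE MINPS/MAXPS
 * definition exactly, including NaN behavior: the second operand is
 * returned whenever the comparison is false.
 */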
SSE_FUNCTION static void
minimum_f32_sse (float *dest, float *src1, float *src2, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1 < *src2 ? *src1 : *src2;
    src1++;
    src2++;
  }
  for (; n >= 4; n -= 4) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_loadu_ps(src1);
    xmm1 = _mm_loadu_ps(src2);
    xmm0 = _mm_min_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    dest += 4;
    src1 += 4;
    src2 += 4;
  }
  for (; n > 0; n--) {
    *dest++ = *src1 < *src2 ? *src1 : *src2;
    src1++;
    src2++;
  }
}
OIL_DEFINE_IMPL_FULL (minimum_f32_sse, minimum_f32, OIL_IMPL_FLAG_SSE);

SSE_FUNCTION static void
maximum_f32_sse (float *dest, float *src1, float *src2, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1 > *src2 ? *src1 : *src2;
    src1++;
    src2++;
  }
  for (; n >= 4; n -= 4) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_loadu_ps(src1);
    xmm1 = _mm_loadu_ps(src2);
    xmm0 = _mm_max_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    dest += 4;
    src1 += 4;
    src2 += 4;
  }
  for (; n > 0; n--) {
    *dest++ = *src1 > *src2 ? *src1 : *src2;
    src1++;
    src2++;
  }
}
OIL_DEFINE_IMPL_FULL (maximum_f32_sse, maximum_f32, OIL_IMPL_FLAG_SSE);

SSE_FUNCTION static void
inverse_f32_sse (float *dest, float *src1, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = 1.0 / *src1++;
  }
  for (; n >= 4; n -= 4) {
    __m128 xmm0, xmm1;
    /* While _mm_rcp_ps sounds promising, it computes only a ~12-bit
     * approximation, so its results differ noticeably from the
     * 1.0 / src1 reference implementation; do a full division instead.
     */
    xmm0 = _mm_set_ps1(1.0);
    xmm1 = _mm_loadu_ps(src1);
    xmm0 = _mm_div_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    dest += 4;
    src1 += 4;
  }
  for (; n > 0; n--) {
    *dest++ = 1.0 / *src1++;
  }
}
OIL_DEFINE_IMPL_FULL (inverse_f32_sse, inverse_f32, OIL_IMPL_FLAG_SSE);

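/*
 * The vector body below negates by computing 0.0f - x.  One subtlety: this
 * maps +0.0f to +0.0f rather than -0.0f, unlike the reference -(*src1); an
 * XOR against the sign bit would preserve signed zeros exactly.
 */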
SSE_FUNCTION static void
negative_f32_sse (float *dest, float *src1, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = -(*src1++);
  }
  for (; n >= 4; n -= 4) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_setzero_ps();
    xmm1 = _mm_loadu_ps(src1);
    xmm0 = _mm_sub_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    dest += 4;
    src1 += 4;
  }
  for (; n > 0; n--) {
    *dest++ = -(*src1++);
  }
}
OIL_DEFINE_IMPL_FULL (negative_f32_sse, negative_f32, OIL_IMPL_FLAG_SSE);

SSE_FUNCTION static void
scalaradd_f32_ns_sse (float *dest, float *src1, float *val, int n)
{
  __m128 xmm1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ + *val;
  }
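  /* Broadcast the scalar into all four lanes once, outside the loop */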
  xmm1 = _mm_load_ps1(val);
  for (; n >= 4; n -= 4) {
    __m128 xmm0;
    xmm0 = _mm_loadu_ps(src1);
    xmm0 = _mm_add_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    dest += 4;
    src1 += 4;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ + *val;
  }
}
OIL_DEFINE_IMPL_FULL (scalaradd_f32_ns_sse, scalaradd_f32_ns, OIL_IMPL_FLAG_SSE);

SSE_FUNCTION static void
scalarmultiply_f32_ns_sse (float *dest, float *src1, float *val, int n)
{
  __m128 xmm1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ * *val;
  }
  xmm1 = _mm_load_ps1(val);
  for (; n >= 4; n -= 4) {
    __m128 xmm0;
    xmm0 = _mm_loadu_ps(src1);
    xmm0 = _mm_mul_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    dest += 4;
    src1 += 4;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ * *val;
  }
}
OIL_DEFINE_IMPL_FULL (scalarmultiply_f32_ns_sse, scalarmultiply_f32_ns, OIL_IMPL_FLAG_SSE);

SSE_FUNCTION static void
scalarmultiply_f64_ns_sse2 (double *dest, double *src1, double *val, int n)
{
  __m128d xmm1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ * *val;
  }
  xmm1 = _mm_load1_pd(val);
  for (; n >= 2; n -= 2) {
    __m128d xmm0;
    xmm0 = _mm_loadu_pd(src1);
    xmm0 = _mm_mul_pd(xmm0, xmm1);
    _mm_store_pd(dest, xmm0);
    dest += 2;
    src1 += 2;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ * *val;
  }
}
OIL_DEFINE_IMPL_FULL (scalarmultiply_f64_ns_sse2, scalarmultiply_f64_ns, OIL_IMPL_FLAG_SSE2);