/* GStreamer
 * Copyright (C) <2016> Wim Taymans <wim.taymans@gmail.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 */

#ifdef HAVE_CONFIG_H
#  include "config.h"
#endif

#include "audio-resampler-x86-sse2.h"

#if defined (HAVE_EMMINTRIN_H) && defined(__SSE2__)
#include <emmintrin.h>

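/* Full inner product for one channel of 16-bit samples:
 * *o = saturate ((sum (a[i] * b[i]) + rounding) >> PRECISION_S16).
 * _mm_madd_epi16 multiplies eight 16-bit pairs and sums adjacent products
 * into four 32-bit lanes, so each loop iteration folds 16 taps into the
 * accumulator (len is expected to be a multiple of 16). The sample history
 * `a` may be unaligned (loadu) while the coefficients `b` must be 16-byte
 * aligned (load). The two epi32 shuffles reduce the four partial sums to
 * one, which is rounded, shifted out of the fixed-point domain by
 * PRECISION_S16 bits (defined elsewhere in the resampler) and packed back
 * to 16 bits with saturation. `icoeff` and `bstride` are unused here; they
 * only keep the signature uniform with the interpolating variants below. */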
static inline void
inner_product_gint16_full_1_sse2 (gint16 * o, const gint16 * a,
    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
{
  gint i;
  __m128i sum, t;

  sum = _mm_setzero_si128 ();

  for (i = 0; i < len; i += 16) {
    t = _mm_loadu_si128 ((__m128i *) (a + i));
    sum =
        _mm_add_epi32 (sum, _mm_madd_epi16 (t,
            _mm_load_si128 ((__m128i *) (b + i + 0))));

    t = _mm_loadu_si128 ((__m128i *) (a + i + 8));
    sum =
        _mm_add_epi32 (sum, _mm_madd_epi16 (t,
            _mm_load_si128 ((__m128i *) (b + i + 8))));
  }
  sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2, 3)));
  sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (1, 1, 1, 1)));

  sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  sum = _mm_srai_epi32 (sum, PRECISION_S16);
  sum = _mm_packs_epi32 (sum, sum);
  *o = _mm_extract_epi16 (sum, 0);
}

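/* Linear variant: `b` holds two adjacent filter phases, `bstride` bytes
 * apart, accumulated independently into sum[0] and sum[1]. The 16-bit
 * weights in `icoeff` are widened into the 32-bit lanes of `f` by
 * interleaving with zeros; after the per-phase sums are shifted back into
 * 16-bit range, _mm_madd_epi16 with the broadcast weight scales sum[0] by
 * icoeff[0] and sum[1] by icoeff[1] (this relies on the shifted partial
 * sums fitting in 16 bits). The rest is the same reduce/round/pack
 * sequence as the full version. */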
static inline void
inner_product_gint16_linear_1_sse2 (gint16 * o, const gint16 * a,
    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
{
  gint i = 0;
  __m128i sum[2], t;
  __m128i f = _mm_set_epi64x (0, *((gint64 *) icoeff));
  const gint16 *c[2] = { (gint16 *) ((gint8 *) b + 0 * bstride),
    (gint16 *) ((gint8 *) b + 1 * bstride)
  };

  sum[0] = sum[1] = _mm_setzero_si128 ();
  f = _mm_unpacklo_epi16 (f, sum[0]);

  for (; i < len; i += 16) {
    t = _mm_loadu_si128 ((__m128i *) (a + i + 0));
    sum[0] =
        _mm_add_epi32 (sum[0], _mm_madd_epi16 (t,
            _mm_load_si128 ((__m128i *) (c[0] + i + 0))));
    sum[1] =
        _mm_add_epi32 (sum[1], _mm_madd_epi16 (t,
            _mm_load_si128 ((__m128i *) (c[1] + i + 0))));

    t = _mm_loadu_si128 ((__m128i *) (a + i + 8));
    sum[0] =
        _mm_add_epi32 (sum[0], _mm_madd_epi16 (t,
            _mm_load_si128 ((__m128i *) (c[0] + i + 8))));
    sum[1] =
        _mm_add_epi32 (sum[1], _mm_madd_epi16 (t,
            _mm_load_si128 ((__m128i *) (c[1] + i + 8))));
  }
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[1] = _mm_srai_epi32 (sum[1], PRECISION_S16);

  sum[0] =
      _mm_madd_epi16 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0)));
  sum[1] =
      _mm_madd_epi16 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1)));
  sum[0] = _mm_add_epi32 (sum[0], sum[1]);

  sum[0] =
      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2,
              3)));
  sum[0] =
      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1,
              1)));

  sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
  *o = _mm_extract_epi16 (sum[0], 0);
}

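/* Cubic variant: four filter phases are accumulated into sum[0]..sum[3],
 * eight taps per iteration. The epi32/epi64 unpacks perform a 4x4
 * transpose so that one register ends up holding the total of each phase
 * in its own 32-bit lane. After scaling down, a single _mm_madd_epi16
 * against `f` (the four icoeff weights widened against zeros) applies all
 * four weights at once before the usual horizontal reduction, rounding
 * and saturating pack. */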
static inline void
inner_product_gint16_cubic_1_sse2 (gint16 * o, const gint16 * a,
    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
{
  gint i = 0;
  __m128i sum[4], t[4];
  __m128i f = _mm_set_epi64x (0, *((long long *) icoeff));
  const gint16 *c[4] = { (gint16 *) ((gint8 *) b + 0 * bstride),
    (gint16 *) ((gint8 *) b + 1 * bstride),
    (gint16 *) ((gint8 *) b + 2 * bstride),
    (gint16 *) ((gint8 *) b + 3 * bstride)
  };

  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 ();
  f = _mm_unpacklo_epi16 (f, sum[0]);

  for (; i < len; i += 8) {
    t[0] = _mm_loadu_si128 ((__m128i *) (a + i));
    sum[0] =
        _mm_add_epi32 (sum[0], _mm_madd_epi16 (t[0],
            _mm_load_si128 ((__m128i *) (c[0] + i))));
    sum[1] =
        _mm_add_epi32 (sum[1], _mm_madd_epi16 (t[0],
            _mm_load_si128 ((__m128i *) (c[1] + i))));
    sum[2] =
        _mm_add_epi32 (sum[2], _mm_madd_epi16 (t[0],
            _mm_load_si128 ((__m128i *) (c[2] + i))));
    sum[3] =
        _mm_add_epi32 (sum[3], _mm_madd_epi16 (t[0],
            _mm_load_si128 ((__m128i *) (c[3] + i))));
  }
  t[0] = _mm_unpacklo_epi32 (sum[0], sum[1]);
  t[1] = _mm_unpacklo_epi32 (sum[2], sum[3]);
  t[2] = _mm_unpackhi_epi32 (sum[0], sum[1]);
  t[3] = _mm_unpackhi_epi32 (sum[2], sum[3]);

  sum[0] =
      _mm_add_epi32 (_mm_unpacklo_epi64 (t[0], t[1]), _mm_unpackhi_epi64 (t[0],
          t[1]));
  sum[2] =
      _mm_add_epi32 (_mm_unpacklo_epi64 (t[2], t[3]), _mm_unpackhi_epi64 (t[2],
          t[3]));
  sum[0] = _mm_add_epi32 (sum[0], sum[2]);

  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[0] = _mm_madd_epi16 (sum[0], f);

  sum[0] =
      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2,
              3)));
  sum[0] =
      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1,
              1)));

  sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
  *o = _mm_extract_epi16 (sum[0], 0);
}

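/* Full inner product for one channel of doubles: four two-wide
 * multiply-accumulates per iteration (len is expected to be a multiple of
 * 8), then the two lanes of the accumulator are summed with unpackhi and
 * the scalar result stored with _mm_store_sd. As above, `a` may be
 * unaligned while `b` must be 16-byte aligned. */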
static inline void
inner_product_gdouble_full_1_sse2 (gdouble * o, const gdouble * a,
    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
{
  gint i = 0;
  __m128d sum = _mm_setzero_pd ();

  for (; i < len; i += 8) {
    sum =
        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 0),
            _mm_load_pd (b + i + 0)));
    sum =
        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 2),
            _mm_load_pd (b + i + 2)));
    sum =
        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 4),
            _mm_load_pd (b + i + 4)));
    sum =
        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 6),
            _mm_load_pd (b + i + 6)));
  }
  sum = _mm_add_sd (sum, _mm_unpackhi_pd (sum, sum));
  _mm_store_sd (o, sum);
}

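/* Linear variant for doubles: the two phase sums are blended as
 * sum[0] * icoeff[0] + sum[1] * (1 - icoeff[0]), rewritten as
 * (sum[0] - sum[1]) * icoeff[0] + sum[1] so that only the first
 * coefficient is needed (_mm_load1_pd broadcasts it to both lanes). */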
static inline void
inner_product_gdouble_linear_1_sse2 (gdouble * o, const gdouble * a,
    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
{
  gint i = 0;
  __m128d sum[2], t;
  const gdouble *c[2] = { (gdouble *) ((gint8 *) b + 0 * bstride),
    (gdouble *) ((gint8 *) b + 1 * bstride)
  };

  sum[0] = sum[1] = _mm_setzero_pd ();

  for (; i < len; i += 4) {
    t = _mm_loadu_pd (a + i + 0);
    sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i + 0)));
    sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i + 0)));
    t = _mm_loadu_pd (a + i + 2);
    sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i + 2)));
    sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i + 2)));
  }
  sum[0] = _mm_mul_pd (_mm_sub_pd (sum[0], sum[1]), _mm_load1_pd (icoeff));
  sum[0] = _mm_add_pd (sum[0], sum[1]);
  sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0]));
  _mm_store_sd (o, sum[0]);
}

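/* Cubic variant for doubles: four phase sums, each scaled by one of the
 * four coefficients in `icoeff` (broadcast with _mm_shuffle_pd), then
 * summed pairwise and reduced to a scalar. */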
static inline void
inner_product_gdouble_cubic_1_sse2 (gdouble * o, const gdouble * a,
    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
{
  gint i;
  __m128d f[2], sum[4], t;
  const gdouble *c[4] = { (gdouble *) ((gint8 *) b + 0 * bstride),
    (gdouble *) ((gint8 *) b + 1 * bstride),
    (gdouble *) ((gint8 *) b + 2 * bstride),
    (gdouble *) ((gint8 *) b + 3 * bstride)
  };

  f[0] = _mm_loadu_pd (icoeff + 0);
  f[1] = _mm_loadu_pd (icoeff + 2);
  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_pd ();

  for (i = 0; i < len; i += 2) {
    t = _mm_loadu_pd (a + i + 0);
    sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i)));
    sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i)));
    sum[2] = _mm_add_pd (sum[2], _mm_mul_pd (t, _mm_load_pd (c[2] + i)));
    sum[3] = _mm_add_pd (sum[3], _mm_mul_pd (t, _mm_load_pd (c[3] + i)));
  }
  sum[0] =
      _mm_mul_pd (sum[0], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (0, 0)));
  sum[1] =
      _mm_mul_pd (sum[1], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (1, 1)));
  sum[2] =
      _mm_mul_pd (sum[2], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (0, 0)));
  sum[3] =
      _mm_mul_pd (sum[3], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (1, 1)));
  sum[0] = _mm_add_pd (sum[0], sum[1]);
  sum[2] = _mm_add_pd (sum[2], sum[3]);
  sum[0] = _mm_add_pd (sum[0], sum[2]);
  sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0]));
  _mm_store_sd (o, sum[0]);
}

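/* MAKE_RESAMPLE_FUNC is defined in the shared resampler code, not in this
 * file; it presumably expands each inner-product helper above into the
 * full resample entry point for the given sample type, filter variant and
 * channel count, which the resampler core then dispatches to. */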
MAKE_RESAMPLE_FUNC (gint16, full, 1, sse2);
MAKE_RESAMPLE_FUNC (gint16, linear, 1, sse2);
MAKE_RESAMPLE_FUNC (gint16, cubic, 1, sse2);

MAKE_RESAMPLE_FUNC (gdouble, full, 1, sse2);
MAKE_RESAMPLE_FUNC (gdouble, linear, 1, sse2);
MAKE_RESAMPLE_FUNC (gdouble, cubic, 1, sse2);

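/* Blend two precomputed 16-bit filter rows (astride bytes apart in `ap`)
 * into an intermediate filter table:
 * o[i] = (c[0][i] * ic[0] + c[1][i] * ic[1] + rounding) >> PRECISION_S16.
 * The weight pair is replicated across all lanes of `f`, so interleaving
 * the two source rows with unpacklo/unpackhi lets each _mm_madd_epi16
 * produce four weighted 32-bit results at once. Source and destination
 * are expected to be 16-byte aligned and `len` a multiple of 8. */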
void
interpolate_gint16_linear_sse2 (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gint i = 0;
  gint16 *o = op, *a = ap, *ic = icp;
  __m128i ta, tb, t1, t2;
  __m128i f = _mm_set_epi64x (0, *((gint64 *) ic));
  const gint16 *c[2] = { (gint16 *) ((gint8 *) a + 0 * astride),
    (gint16 *) ((gint8 *) a + 1 * astride)
  };

  f = _mm_unpacklo_epi32 (f, f);
  f = _mm_unpacklo_epi64 (f, f);

  for (; i < len; i += 8) {
    ta = _mm_load_si128 ((__m128i *) (c[0] + i));
    tb = _mm_load_si128 ((__m128i *) (c[1] + i));

    t1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f);
    t2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f);

    t1 = _mm_add_epi32 (t1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
    t2 = _mm_add_epi32 (t2, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));

    t1 = _mm_srai_epi32 (t1, PRECISION_S16);
    t2 = _mm_srai_epi32 (t2, PRECISION_S16);

    t1 = _mm_packs_epi32 (t1, t2);
    _mm_store_si128 ((__m128i *) (o + i), t1);
  }
}

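/* Cubic filter interpolation: same layout as the linear version, but
 * blending four rows with the four weights in `ic`, accumulated in two
 * madd passes per output block before rounding, shifting and packing. */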
void
interpolate_gint16_cubic_sse2 (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gint i = 0;
  gint16 *o = op, *a = ap, *ic = icp;
  __m128i ta, tb, tl1, tl2, th1, th2;
  __m128i f[2];
  const gint16 *c[4] = { (gint16 *) ((gint8 *) a + 0 * astride),
    (gint16 *) ((gint8 *) a + 1 * astride),
    (gint16 *) ((gint8 *) a + 2 * astride),
    (gint16 *) ((gint8 *) a + 3 * astride)
  };

  f[0] = _mm_set_epi16 (ic[1], ic[0], ic[1], ic[0], ic[1], ic[0], ic[1], ic[0]);
  f[1] = _mm_set_epi16 (ic[3], ic[2], ic[3], ic[2], ic[3], ic[2], ic[3], ic[2]);

  for (; i < len; i += 8) {
    ta = _mm_load_si128 ((__m128i *) (c[0] + i));
    tb = _mm_load_si128 ((__m128i *) (c[1] + i));

    tl1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f[0]);
    th1 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f[0]);

    ta = _mm_load_si128 ((__m128i *) (c[2] + i));
    tb = _mm_load_si128 ((__m128i *) (c[3] + i));

    tl2 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f[1]);
    th2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f[1]);

    tl1 = _mm_add_epi32 (tl1, tl2);
    th1 = _mm_add_epi32 (th1, th2);

    tl1 = _mm_add_epi32 (tl1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
    th1 = _mm_add_epi32 (th1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));

    tl1 = _mm_srai_epi32 (tl1, PRECISION_S16);
    th1 = _mm_srai_epi32 (th1, PRECISION_S16);

    tl1 = _mm_packs_epi32 (tl1, th1);
    _mm_store_si128 ((__m128i *) (o + i), tl1);
  }
}

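/* Linear filter interpolation for doubles:
 * o[i] = c[0][i] * ic[0] + c[1][i] * ic[1], four outputs per iteration,
 * with the two weights broadcast by _mm_load1_pd. */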
void
interpolate_gdouble_linear_sse2 (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gint i;
  gdouble *o = op, *a = ap, *ic = icp;
  __m128d f[2], t1, t2;
  const gdouble *c[2] = { (gdouble *) ((gint8 *) a + 0 * astride),
    (gdouble *) ((gint8 *) a + 1 * astride)
  };

  f[0] = _mm_load1_pd (ic + 0);
  f[1] = _mm_load1_pd (ic + 1);

  for (i = 0; i < len; i += 4) {
    t1 = _mm_mul_pd (_mm_load_pd (c[0] + i + 0), f[0]);
    t2 = _mm_mul_pd (_mm_load_pd (c[1] + i + 0), f[1]);
    _mm_store_pd (o + i + 0, _mm_add_pd (t1, t2));

    t1 = _mm_mul_pd (_mm_load_pd (c[0] + i + 2), f[0]);
    t2 = _mm_mul_pd (_mm_load_pd (c[1] + i + 2), f[1]);
    _mm_store_pd (o + i + 2, _mm_add_pd (t1, t2));
  }
}

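/* Cubic filter interpolation for doubles: each output is the sum of the
 * four source rows scaled by the four weights in `ic`, two outputs per
 * iteration. */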
void
interpolate_gdouble_cubic_sse2 (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gint i;
  gdouble *o = op, *a = ap, *ic = icp;
  __m128d f[4], t[4];
  const gdouble *c[4] = { (gdouble *) ((gint8 *) a + 0 * astride),
    (gdouble *) ((gint8 *) a + 1 * astride),
    (gdouble *) ((gint8 *) a + 2 * astride),
    (gdouble *) ((gint8 *) a + 3 * astride)
  };

  f[0] = _mm_load1_pd (ic + 0);
  f[1] = _mm_load1_pd (ic + 1);
  f[2] = _mm_load1_pd (ic + 2);
  f[3] = _mm_load1_pd (ic + 3);

  for (i = 0; i < len; i += 2) {
    t[0] = _mm_mul_pd (_mm_load_pd (c[0] + i + 0), f[0]);
    t[1] = _mm_mul_pd (_mm_load_pd (c[1] + i + 0), f[1]);
    t[2] = _mm_mul_pd (_mm_load_pd (c[2] + i + 0), f[2]);
    t[3] = _mm_mul_pd (_mm_load_pd (c[3] + i + 0), f[3]);
    t[0] = _mm_add_pd (t[0], t[1]);
    t[2] = _mm_add_pd (t[2], t[3]);
    _mm_store_pd (o + i + 0, _mm_add_pd (t[0], t[2]));
  }
}

#endif