/* GStreamer
 * Copyright (C) <2016> Wim Taymans <wim.taymans@gmail.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 */

#ifdef HAVE_CONFIG_H
#  include "config.h"
#endif

#include "audio-resampler-x86-sse41.h"

#if 0
#define __SSE4_1__
#pragma GCC target("sse4.1")
#endif

#if defined (__x86_64__) && \
    defined (HAVE_SMMINTRIN_H) && defined (HAVE_EMMINTRIN_H) && \
    defined (__SSE4_1__)

#include <emmintrin.h>
#include <smmintrin.h>

38 static inline void
inner_product_gint32_full_1_sse41(gint32 * o,const gint32 * a,const gint32 * b,gint len,const gint32 * icoeff,gint bstride)39 inner_product_gint32_full_1_sse41 (gint32 * o, const gint32 * a,
40     const gint32 * b, gint len, const gint32 * icoeff, gint bstride)
41 {
42   gint i = 0;
43   __m128i sum, ta, tb;
44   gint64 res;
45 
46   sum = _mm_setzero_si128 ();
47 
48   for (; i < len; i += 8) {
49     ta = _mm_loadu_si128 ((__m128i *) (a + i));
50     tb = _mm_load_si128 ((__m128i *) (b + i));
51 
52     sum =
53         _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
54             _mm_unpacklo_epi32 (tb, tb)));
55     sum =
56         _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
57             _mm_unpackhi_epi32 (tb, tb)));
58 
59     ta = _mm_loadu_si128 ((__m128i *) (a + i + 4));
60     tb = _mm_load_si128 ((__m128i *) (b + i + 4));
61 
62     sum =
63         _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
64             _mm_unpacklo_epi32 (tb, tb)));
65     sum =
66         _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
67             _mm_unpackhi_epi32 (tb, tb)));
68   }
69   sum = _mm_add_epi64 (sum, _mm_unpackhi_epi64 (sum, sum));
70   res = _mm_cvtsi128_si64 (sum);
71 
72   res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32;
73   *o = CLAMP (res, G_MININT32, G_MAXINT32);
74 }
/* SSE4.1 inner product for one channel of signed 32-bit samples with linear
 * interpolation between two adjacent coefficient rows.
 *
 * o      : output sample (rounded and clamped to gint32 range)
 * a      : input samples, may be unaligned (_mm_loadu_si128)
 * b      : base of the coefficient rows; row k lives at b + k * bstride
 *          bytes; rows are loaded aligned — presumably 16-byte aligned by
 *          the table allocator, TODO confirm
 * len    : number of taps; consumed 4 per iteration, so presumably a
 *          multiple of 4 — verify against the resampler setup
 * icoeff : interpolation weights; icoeff[0] and icoeff[1] are used here
 * bstride: byte stride between consecutive coefficient rows
 */
static inline void
inner_product_gint32_linear_1_sse41 (gint32 * o, const gint32 * a,
    const gint32 * b, gint len, const gint32 * icoeff, gint bstride)
{
  gint i = 0;
  gint64 res;
  __m128i sum[2], ta, tb;
  __m128i f = _mm_loadu_si128 ((__m128i *) icoeff);
  /* The two coefficient rows being interpolated between. */
  const gint32 *c[2] = { (gint32 *) ((gint8 *) b + 0 * bstride),
    (gint32 *) ((gint8 *) b + 1 * bstride)
  };

  sum[0] = sum[1] = _mm_setzero_si128 ();

  for (; i < len; i += 4) {
    ta = _mm_loadu_si128 ((__m128i *) (a + i));

    /* unpacklo/unpackhi(x, x) duplicates each 32-bit lane so that
     * _mm_mul_epi32 (which multiplies the signed 32-bit values in lanes
     * 0 and 2 into two 64-bit products) covers all four samples across
     * the lo/hi pair; accumulation is in 64 bits to avoid overflow. */
    tb = _mm_load_si128 ((__m128i *) (c[0] + i));
    sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
            _mm_unpacklo_epi32 (tb, tb)));
    sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
            _mm_unpackhi_epi32 (tb, tb)));

    tb = _mm_load_si128 ((__m128i *) (c[1] + i));
    sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
            _mm_unpacklo_epi32 (tb, tb)));
    sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
            _mm_unpackhi_epi32 (tb, tb)));
  }
  /* Drop PRECISION_S32 fraction bits from each per-row partial sum before
   * applying the interpolation weight, keeping the weighted products in
   * 64-bit range. */
  sum[0] = _mm_srli_epi64 (sum[0], PRECISION_S32);
  sum[1] = _mm_srli_epi64 (sum[1], PRECISION_S32);
  /* Broadcast icoeff[0] / icoeff[1] to all lanes and weight each row;
   * _mm_mul_epi32 again reads lanes 0 and 2 only. */
  sum[0] =
      _mm_mul_epi32 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0)));
  sum[1] =
      _mm_mul_epi32 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1)));
  sum[0] = _mm_add_epi64 (sum[0], sum[1]);
  /* Horizontal add of the two 64-bit halves, then extract the scalar. */
  sum[0] = _mm_add_epi64 (sum[0], _mm_unpackhi_epi64 (sum[0], sum[0]));
  res = _mm_cvtsi128_si64 (sum[0]);

  /* Round to nearest, rescale, and saturate to the gint32 range. */
  res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32;
  *o = CLAMP (res, G_MININT32, G_MAXINT32);
}
/* SSE4.1 inner product for one channel of signed 32-bit samples with cubic
 * interpolation across four adjacent coefficient rows.
 *
 * o      : output sample (rounded and clamped to gint32 range)
 * a      : input samples, may be unaligned (_mm_loadu_si128)
 * b      : base of the coefficient rows; row k lives at b + k * bstride
 *          bytes; rows are loaded aligned — presumably 16-byte aligned by
 *          the table allocator, TODO confirm
 * len    : number of taps; consumed 4 per iteration, so presumably a
 *          multiple of 4 — verify against the resampler setup
 * icoeff : the four cubic interpolation weights icoeff[0..3]
 * bstride: byte stride between consecutive coefficient rows
 */
static inline void
inner_product_gint32_cubic_1_sse41 (gint32 * o, const gint32 * a,
    const gint32 * b, gint len, const gint32 * icoeff, gint bstride)
{
  gint i = 0;
  gint64 res;
  __m128i sum[4], ta, tb;
  __m128i f = _mm_loadu_si128 ((__m128i *) icoeff);
  /* The four coefficient rows being blended. */
  const gint32 *c[4] = { (gint32 *) ((gint8 *) b + 0 * bstride),
    (gint32 *) ((gint8 *) b + 1 * bstride),
    (gint32 *) ((gint8 *) b + 2 * bstride),
    (gint32 *) ((gint8 *) b + 3 * bstride)
  };

  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 ();

  for (; i < len; i += 4) {
    ta = _mm_loadu_si128 ((__m128i *) (a + i));

    /* unpacklo/unpackhi(x, x) duplicates each 32-bit lane so that
     * _mm_mul_epi32 (signed 32x32 -> 64 on lanes 0 and 2) covers all four
     * samples across the lo/hi pair; each row accumulates into its own
     * 64-bit partial sum. */
    tb = _mm_load_si128 ((__m128i *) (c[0] + i));
    sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
            _mm_unpacklo_epi32 (tb, tb)));
    sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
            _mm_unpackhi_epi32 (tb, tb)));

    tb = _mm_load_si128 ((__m128i *) (c[1] + i));
    sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
            _mm_unpacklo_epi32 (tb, tb)));
    sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
            _mm_unpackhi_epi32 (tb, tb)));

    tb = _mm_load_si128 ((__m128i *) (c[2] + i));
    sum[2] = _mm_add_epi64 (sum[2], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
            _mm_unpacklo_epi32 (tb, tb)));
    sum[2] = _mm_add_epi64 (sum[2], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
            _mm_unpackhi_epi32 (tb, tb)));

    tb = _mm_load_si128 ((__m128i *) (c[3] + i));
    sum[3] = _mm_add_epi64 (sum[3], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
            _mm_unpacklo_epi32 (tb, tb)));
    sum[3] = _mm_add_epi64 (sum[3], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
            _mm_unpackhi_epi32 (tb, tb)));
  }
  /* Drop PRECISION_S32 fraction bits from each per-row partial sum before
   * weighting, keeping the weighted products in 64-bit range. */
  sum[0] = _mm_srli_epi64 (sum[0], PRECISION_S32);
  sum[1] = _mm_srli_epi64 (sum[1], PRECISION_S32);
  sum[2] = _mm_srli_epi64 (sum[2], PRECISION_S32);
  sum[3] = _mm_srli_epi64 (sum[3], PRECISION_S32);
  /* Broadcast icoeff[k] to all lanes and weight row k. */
  sum[0] =
      _mm_mul_epi32 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0)));
  sum[1] =
      _mm_mul_epi32 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1)));
  sum[2] =
      _mm_mul_epi32 (sum[2], _mm_shuffle_epi32 (f, _MM_SHUFFLE (2, 2, 2, 2)));
  sum[3] =
      _mm_mul_epi32 (sum[3], _mm_shuffle_epi32 (f, _MM_SHUFFLE (3, 3, 3, 3)));
  sum[0] = _mm_add_epi64 (sum[0], sum[1]);
  sum[2] = _mm_add_epi64 (sum[2], sum[3]);
  sum[0] = _mm_add_epi64 (sum[0], sum[2]);
  /* Horizontal add of the two 64-bit halves, then extract the scalar. */
  sum[0] = _mm_add_epi64 (sum[0], _mm_unpackhi_epi64 (sum[0], sum[0]));
  res = _mm_cvtsi128_si64 (sum[0]);

  /* Round to nearest, rescale, and saturate to the gint32 range. */
  res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32;
  *o = CLAMP (res, G_MININT32, G_MAXINT32);
}
/* Instantiate the public resample entry points for the three kernels above;
 * MAKE_RESAMPLE_FUNC is presumably defined in the included project header —
 * it is expected to wrap each inner-product function in a per-channel
 * resampling loop. */
MAKE_RESAMPLE_FUNC (gint32, full, 1, sse41);
MAKE_RESAMPLE_FUNC (gint32, linear, 1, sse41);
MAKE_RESAMPLE_FUNC (gint32, cubic, 1, sse41);

#endif