1 /* GStreamer
2 * Copyright (C) <2016> Wim Taymans <wim.taymans@gmail.com>
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Library General Public
6 * License as published by the Free Software Foundation; either
7 * version 2 of the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Library General Public License for more details.
13 *
14 * You should have received a copy of the GNU Library General Public
15 * License along with this library; if not, write to the
16 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
17 * Boston, MA 02110-1301, USA.
18 */
19
20 #ifdef HAVE_CONFIG_H
21 # include "config.h"
22 #endif
23
24 #include "audio-resampler-x86-sse41.h"
25
26 #if 0
27 #define __SSE4_1__
28 #pragma GCC target("sse4.1")
29 #endif
30
31 #if defined (__x86_64__) && \
32 defined (HAVE_SMMINTRIN_H) && defined (HAVE_EMMINTRIN_H) && \
33 defined (__SSE4_1__)
34
35 #include <emmintrin.h>
36 #include <smmintrin.h>
37
38 static inline void
inner_product_gint32_full_1_sse41(gint32 * o,const gint32 * a,const gint32 * b,gint len,const gint32 * icoeff,gint bstride)39 inner_product_gint32_full_1_sse41 (gint32 * o, const gint32 * a,
40 const gint32 * b, gint len, const gint32 * icoeff, gint bstride)
41 {
42 gint i = 0;
43 __m128i sum, ta, tb;
44 gint64 res;
45
46 sum = _mm_setzero_si128 ();
47
48 for (; i < len; i += 8) {
49 ta = _mm_loadu_si128 ((__m128i *) (a + i));
50 tb = _mm_load_si128 ((__m128i *) (b + i));
51
52 sum =
53 _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
54 _mm_unpacklo_epi32 (tb, tb)));
55 sum =
56 _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
57 _mm_unpackhi_epi32 (tb, tb)));
58
59 ta = _mm_loadu_si128 ((__m128i *) (a + i + 4));
60 tb = _mm_load_si128 ((__m128i *) (b + i + 4));
61
62 sum =
63 _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
64 _mm_unpacklo_epi32 (tb, tb)));
65 sum =
66 _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
67 _mm_unpackhi_epi32 (tb, tb)));
68 }
69 sum = _mm_add_epi64 (sum, _mm_unpackhi_epi64 (sum, sum));
70 res = _mm_cvtsi128_si64 (sum);
71
72 res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32;
73 *o = CLAMP (res, G_MININT32, G_MAXINT32);
74 }
75
76 static inline void
inner_product_gint32_linear_1_sse41(gint32 * o,const gint32 * a,const gint32 * b,gint len,const gint32 * icoeff,gint bstride)77 inner_product_gint32_linear_1_sse41 (gint32 * o, const gint32 * a,
78 const gint32 * b, gint len, const gint32 * icoeff, gint bstride)
79 {
80 gint i = 0;
81 gint64 res;
82 __m128i sum[2], ta, tb;
83 __m128i f = _mm_loadu_si128 ((__m128i *) icoeff);
84 const gint32 *c[2] = { (gint32 *) ((gint8 *) b + 0 * bstride),
85 (gint32 *) ((gint8 *) b + 1 * bstride)
86 };
87
88 sum[0] = sum[1] = _mm_setzero_si128 ();
89
90 for (; i < len; i += 4) {
91 ta = _mm_loadu_si128 ((__m128i *) (a + i));
92
93 tb = _mm_load_si128 ((__m128i *) (c[0] + i));
94 sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
95 _mm_unpacklo_epi32 (tb, tb)));
96 sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
97 _mm_unpackhi_epi32 (tb, tb)));
98
99 tb = _mm_load_si128 ((__m128i *) (c[1] + i));
100 sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
101 _mm_unpacklo_epi32 (tb, tb)));
102 sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
103 _mm_unpackhi_epi32 (tb, tb)));
104 }
105 sum[0] = _mm_srli_epi64 (sum[0], PRECISION_S32);
106 sum[1] = _mm_srli_epi64 (sum[1], PRECISION_S32);
107 sum[0] =
108 _mm_mul_epi32 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0)));
109 sum[1] =
110 _mm_mul_epi32 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1)));
111 sum[0] = _mm_add_epi64 (sum[0], sum[1]);
112 sum[0] = _mm_add_epi64 (sum[0], _mm_unpackhi_epi64 (sum[0], sum[0]));
113 res = _mm_cvtsi128_si64 (sum[0]);
114
115 res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32;
116 *o = CLAMP (res, G_MININT32, G_MAXINT32);
117 }
118
119 static inline void
inner_product_gint32_cubic_1_sse41(gint32 * o,const gint32 * a,const gint32 * b,gint len,const gint32 * icoeff,gint bstride)120 inner_product_gint32_cubic_1_sse41 (gint32 * o, const gint32 * a,
121 const gint32 * b, gint len, const gint32 * icoeff, gint bstride)
122 {
123 gint i = 0;
124 gint64 res;
125 __m128i sum[4], ta, tb;
126 __m128i f = _mm_loadu_si128 ((__m128i *) icoeff);
127 const gint32 *c[4] = { (gint32 *) ((gint8 *) b + 0 * bstride),
128 (gint32 *) ((gint8 *) b + 1 * bstride),
129 (gint32 *) ((gint8 *) b + 2 * bstride),
130 (gint32 *) ((gint8 *) b + 3 * bstride)
131 };
132
133 sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 ();
134
135 for (; i < len; i += 4) {
136 ta = _mm_loadu_si128 ((__m128i *) (a + i));
137
138 tb = _mm_load_si128 ((__m128i *) (c[0] + i));
139 sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
140 _mm_unpacklo_epi32 (tb, tb)));
141 sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
142 _mm_unpackhi_epi32 (tb, tb)));
143
144 tb = _mm_load_si128 ((__m128i *) (c[1] + i));
145 sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
146 _mm_unpacklo_epi32 (tb, tb)));
147 sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
148 _mm_unpackhi_epi32 (tb, tb)));
149
150 tb = _mm_load_si128 ((__m128i *) (c[2] + i));
151 sum[2] = _mm_add_epi64 (sum[2], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
152 _mm_unpacklo_epi32 (tb, tb)));
153 sum[2] = _mm_add_epi64 (sum[2], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
154 _mm_unpackhi_epi32 (tb, tb)));
155
156 tb = _mm_load_si128 ((__m128i *) (c[3] + i));
157 sum[3] = _mm_add_epi64 (sum[3], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
158 _mm_unpacklo_epi32 (tb, tb)));
159 sum[3] = _mm_add_epi64 (sum[3], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
160 _mm_unpackhi_epi32 (tb, tb)));
161 }
162 sum[0] = _mm_srli_epi64 (sum[0], PRECISION_S32);
163 sum[1] = _mm_srli_epi64 (sum[1], PRECISION_S32);
164 sum[2] = _mm_srli_epi64 (sum[2], PRECISION_S32);
165 sum[3] = _mm_srli_epi64 (sum[3], PRECISION_S32);
166 sum[0] =
167 _mm_mul_epi32 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0)));
168 sum[1] =
169 _mm_mul_epi32 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1)));
170 sum[2] =
171 _mm_mul_epi32 (sum[2], _mm_shuffle_epi32 (f, _MM_SHUFFLE (2, 2, 2, 2)));
172 sum[3] =
173 _mm_mul_epi32 (sum[3], _mm_shuffle_epi32 (f, _MM_SHUFFLE (3, 3, 3, 3)));
174 sum[0] = _mm_add_epi64 (sum[0], sum[1]);
175 sum[2] = _mm_add_epi64 (sum[2], sum[3]);
176 sum[0] = _mm_add_epi64 (sum[0], sum[2]);
177 sum[0] = _mm_add_epi64 (sum[0], _mm_unpackhi_epi64 (sum[0], sum[0]));
178 res = _mm_cvtsi128_si64 (sum[0]);
179
180 res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32;
181 *o = CLAMP (res, G_MININT32, G_MAXINT32);
182 }
183
/* Expand the boilerplate wrappers (defined elsewhere in the resampler
 * headers) that turn the inner-product kernels above into the full, linear
 * and cubic resample entry points for S32/1-channel/SSE4.1. */
MAKE_RESAMPLE_FUNC (gint32, full, 1, sse41);
MAKE_RESAMPLE_FUNC (gint32, linear, 1, sse41);
MAKE_RESAMPLE_FUNC (gint32, cubic, 1, sse41);
187
188 #endif
189