/* Spa
 *
 * Copyright © 2019 Wim Taymans
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "resample-native-impl.h"

#include <tmmintrin.h>

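/* Compute one output sample as the dot product of n_taps input samples
 * with the filter taps. The unrolled loops below assume n_taps is a
 * multiple of 8 and that the taps array is 16-byte aligned. */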
static void inner_product_ssse3(float *d, const float * SPA_RESTRICT s,
		const float * SPA_RESTRICT taps, uint32_t n_taps)
{
	__m128 sum = _mm_setzero_ps();
	__m128 t0, t1;
	uint32_t i;

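	/* The taps are always 16-byte aligned but the source pointer can be
	 * offset by 0, 1, 2 or 3 floats from a 16-byte boundary. Pick a path
	 * that only issues aligned loads. */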
	switch (SPA_PTR_ALIGNMENT(s, 16)) {
	case 0:
		for (i = 0; i < n_taps; i += 8) {
			sum = _mm_add_ps(sum,
				_mm_mul_ps(
					_mm_load_ps(s + i + 0),
					_mm_load_ps(taps + i + 0)));
			sum = _mm_add_ps(sum,
				_mm_mul_ps(
					_mm_load_ps(s + i + 4),
					_mm_load_ps(taps + i + 4)));
		}
		break;
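	/* Source is 1 float (4 bytes) past a 16-byte boundary: load the
	 * surrounding aligned vectors and stitch the unaligned window
	 * together with _mm_alignr_epi8. */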
	case 4:
		t0 = _mm_load_ps(s - 1);
		for (i = 0; i < n_taps; i += 8) {
			t1 = _mm_load_ps(s + i + 3);
			t0 = (__m128)_mm_alignr_epi8((__m128i)t1, (__m128i)t0, 4);
			sum = _mm_add_ps(sum,
				_mm_mul_ps(t0, _mm_load_ps(taps + i + 0)));
			t0 = t1;
			t1 = _mm_load_ps(s + i + 7);
			t0 = (__m128)_mm_alignr_epi8((__m128i)t1, (__m128i)t0, 4);
			sum = _mm_add_ps(sum,
				_mm_mul_ps(t0, _mm_load_ps(taps + i + 4)));
			t0 = t1;
		}
		break;
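	/* Same technique, source is 2 floats (8 bytes) past alignment. */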
	case 8:
		t0 = _mm_load_ps(s - 2);
		for (i = 0; i < n_taps; i += 8) {
			t1 = _mm_load_ps(s + i + 2);
			t0 = (__m128)_mm_alignr_epi8((__m128i)t1, (__m128i)t0, 8);
			sum = _mm_add_ps(sum,
				_mm_mul_ps(t0, _mm_load_ps(taps + i + 0)));
			t0 = t1;
			t1 = _mm_load_ps(s + i + 6);
			t0 = (__m128)_mm_alignr_epi8((__m128i)t1, (__m128i)t0, 8);
			sum = _mm_add_ps(sum,
				_mm_mul_ps(t0, _mm_load_ps(taps + i + 4)));
			t0 = t1;
		}
		break;
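	/* Same technique, source is 3 floats (12 bytes) past alignment. */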
	case 12:
		t0 = _mm_load_ps(s - 3);
		for (i = 0; i < n_taps; i += 8) {
			t1 = _mm_load_ps(s + i + 1);
			t0 = (__m128)_mm_alignr_epi8((__m128i)t1, (__m128i)t0, 12);
			sum = _mm_add_ps(sum,
				_mm_mul_ps(t0, _mm_load_ps(taps + i + 0)));
			t0 = t1;
			t1 = _mm_load_ps(s + i + 5);
			t0 = (__m128)_mm_alignr_epi8((__m128i)t1, (__m128i)t0, 12);
			sum = _mm_add_ps(sum,
				_mm_mul_ps(t0, _mm_load_ps(taps + i + 4)));
			t0 = t1;
		}
		break;
	}
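	/* Horizontal sum: reduce the four partial sums in 'sum' to a single
	 * float and store it in *d. */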
	sum = _mm_add_ps(sum, _mm_movehdup_ps(sum));
	sum = _mm_add_ss(sum, _mm_movehl_ps(sum, sum));
	_mm_store_ss(d, sum);
}

static void inner_product_ip_ssse3(float *d, const float * SPA_RESTRICT s,
		const float * SPA_RESTRICT t0, const float * SPA_RESTRICT t1, float x,
		uint32_t n_taps)
{
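	/* Interpolating variant: compute the dot product of the input with
	 * two adjacent filter phases t0 and t1 and blend the results with
	 * factor x. This variant is plain scalar code. */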
	float sum[2] = { 0.0f, 0.0f };
	uint32_t i;

	for (i = 0; i < n_taps; i++) {
		sum[0] += s[i] * t0[i];
		sum[1] += s[i] * t1[i];
	}
	*d = (sum[1] - sum[0]) * x + sum[0];
}

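/* Instantiate the full and interpolating resampler entry points from
 * resample-native-impl.h using the inner-product kernels above. */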
MAKE_RESAMPLER_FULL(ssse3);
MAKE_RESAMPLER_INTER(ssse3);