1 #include "lipol.h"
2 
// File-scope broadcast constants: every lane of `two` holds 2.0f and every
// lane of `four` holds 4.0f. `two` is used by the block routines below to
// double the per-quad step; `four` is not referenced in the code visible here.
const __m128 two = _mm_set1_ps(2.f);
const __m128 four = _mm_set1_ps(4.f);
5 
6 lipol_ps::lipol_ps()
7 {
8     target = _mm_setzero_ps();
9     currentval = _mm_setzero_ps();
10     coef = _mm_set1_ps(0.25f);
11     coef_m1 = _mm_sub_ss(m128_one, coef);
12     m128_lipolstarter = _mm_set_ps(1.f, 0.75f, 0.5f, 0.25f);
13     set_blocksize(64);
14 }
15 
16 void lipol_ps::set_blocksize(int bs)
17 {
18     lipol_BLOCK_SIZE = _mm_cvt_si2ss(m128_zero, bs);
19     m128_bs4_inv = _mm_div_ss(m128_four, lipol_BLOCK_SIZE);
setenv(const char * name,const char * value,int overwrite)20 }
21 
22 void lipol_ps::multiply_block(float *src, unsigned int nquads)
23 {
24     __m128 y1, y2, dy;
25     initblock(y1, dy);
26     y2 = _mm_add_ps(y1, dy);
27     dy = _mm_mul_ps(dy, two);
28 
29     unsigned int n = nquads << 2;
30     for (unsigned int i = 0; (i < n); i += 8) // nquads must be multiple of 4
31     {
32         __m128 a = _mm_mul_ps(_mm_load_ps(src + i), y1);
33         _mm_store_ps(src + i, a);
34         y1 = _mm_add_ps(y1, dy);
35         __m128 b = _mm_mul_ps(_mm_load_ps(src + i + 4), y2);
36         _mm_store_ps(src + i + 4, b);
37         y2 = _mm_add_ps(y2, dy);
38     }
39 }
40 
41 void lipol_ps::multiply_block_sat1(float *src, unsigned int nquads)
42 {
43     __m128 y1, y2, dy;
44     initblock(y1, dy);
45     y2 = _mm_add_ps(y1, dy);
46     dy = _mm_mul_ps(dy, two);
47 
48     const __m128 satv = _mm_set1_ps(1.0f);
49 
50     for (unsigned int i = 0; (i < nquads << 2); i += 8) // nquads must be multiple of 4
51     {
52         _mm_store_ps(src + i, _mm_mul_ps(_mm_load_ps(src + i), y1));
53         y1 = _mm_min_ps(satv, _mm_add_ps(y1, dy));
54         _mm_store_ps(src + i + 4, _mm_mul_ps(_mm_load_ps(src + i + 4), y2));
55         y2 = _mm_min_ps(satv, _mm_add_ps(y2, dy));
56     }
57 }
58 
59 void lipol_ps::store_block(float *dst, unsigned int nquads)
60 {
61     __m128 y1, y2, dy;
62     initblock(y1, dy);
63     y2 = _mm_add_ps(y1, dy);
64     dy = _mm_mul_ps(dy, two);
65 
66     for (unsigned int i = 0; i < nquads << 2; i += 8) // nquads must be multiple of 4
67     {
68         _mm_store_ps(dst + i, y1);
69         y1 = _mm_add_ps(y1, dy);
70         _mm_store_ps(dst + i + 4, y2);
71         y2 = _mm_add_ps(y2, dy);
72     }
73 }
74 
75 void lipol_ps::add_block(float *src, unsigned int nquads)
76 {
77     __m128 y1, y2, dy;
78     initblock(y1, dy);
79     y2 = _mm_add_ps(y1, dy);
80     dy = _mm_mul_ps(dy, two);
81 
82     for (unsigned int i = 0; i < nquads; i += 2) // nquads must be multiple of 4
83     {
84         ((__m128 *)src)[i] = _mm_add_ps(((__m128 *)src)[i], y1);
85         y1 = _mm_add_ps(y1, dy);
86         ((__m128 *)src)[i + 1] = _mm_add_ps(((__m128 *)src)[i + 1], y2);
87         y2 = _mm_add_ps(y2, dy);
88     }
89 }
90 
91 void lipol_ps::subtract_block(float *src, unsigned int nquads)
92 {
93     __m128 y1, y2, dy;
94     initblock(y1, dy);
95     y2 = _mm_add_ps(y1, dy);
96     dy = _mm_mul_ps(dy, two);
97 
98     for (unsigned int i = 0; i < nquads; i += 2) // nquads must be multiple of 4
99     {
100         ((__m128 *)src)[i] = _mm_sub_ps(((__m128 *)src)[i], y1);
101         y1 = _mm_add_ps(y1, dy);
102         ((__m128 *)src)[i + 1] = _mm_sub_ps(((__m128 *)src)[i + 1], y2);
103         y2 = _mm_add_ps(y2, dy);
104     }
105 }
106 
107 void lipol_ps::multiply_2_blocks(float *__restrict src1, float *__restrict src2,
108                                  unsigned int nquads)
109 {
110     __m128 y1, y2, dy;
111     initblock(y1, dy);
112     y2 = _mm_add_ps(y1, dy);
113     dy = _mm_mul_ps(dy, two);
114 
115     for (unsigned int i = 0; i < nquads; i += 2) // nquads must be multiple of 4
116     {
117         ((__m128 *)src1)[i] = _mm_mul_ps(((__m128 *)src1)[i], y1);
118         ((__m128 *)src2)[i] = _mm_mul_ps(((__m128 *)src2)[i], y1);
119         y1 = _mm_add_ps(y1, dy);
120         ((__m128 *)src1)[i + 1] = _mm_mul_ps(((__m128 *)src1)[i + 1], y2);
121         ((__m128 *)src2)[i + 1] = _mm_mul_ps(((__m128 *)src2)[i + 1], y2);
122         y2 = _mm_add_ps(y2, dy);
123     }
124 }
125 
126 void lipol_ps::MAC_block_to(float *__restrict src, float *__restrict dst, unsigned int nquads)
127 {
128     __m128 y1, y2, dy;
129     initblock(y1, dy);
130     y2 = _mm_add_ps(y1, dy);
131     dy = _mm_mul_ps(dy, two);
132 
133     for (unsigned int i = 0; i < nquads; i += 2) // nquads must be multiple of 4
134     {
135         ((__m128 *)dst)[i] = _mm_add_ps(((__m128 *)dst)[i], _mm_mul_ps(((__m128 *)src)[i], y1));
136         y1 = _mm_add_ps(y1, dy);
137         ((__m128 *)dst)[i + 1] =
138             _mm_add_ps(((__m128 *)dst)[i + 1], _mm_mul_ps(((__m128 *)src)[i + 1], y2));
139         y2 = _mm_add_ps(y2, dy);
140     }
141 }
142 
143 void lipol_ps::MAC_2_blocks_to(float *__restrict src1, float *__restrict src2,
144                                float *__restrict dst1, float *__restrict dst2, unsigned int nquads)
145 {
146     __m128 y1, y2, dy;
147     initblock(y1, dy);
148     y2 = _mm_add_ps(y1, dy);
149     dy = _mm_mul_ps(dy, two);
150 
151     for (unsigned int i = 0; i < nquads; i += 2) // nquads must be multiple of 4
152     {
153         ((__m128 *)dst1)[i] = _mm_add_ps(((__m128 *)dst1)[i], _mm_mul_ps(((__m128 *)src1)[i], y1));
154         ((__m128 *)dst2)[i] = _mm_add_ps(((__m128 *)dst2)[i], _mm_mul_ps(((__m128 *)src2)[i], y1));
155         y1 = _mm_add_ps(y1, dy);
156         ((__m128 *)dst1)[i + 1] =
157             _mm_add_ps(((__m128 *)dst1)[i + 1], _mm_mul_ps(((__m128 *)src1)[i + 1], y2));
158         ((__m128 *)dst2)[i + 1] =
159             _mm_add_ps(((__m128 *)dst2)[i + 1], _mm_mul_ps(((__m128 *)src2)[i + 1], y2));
160         y2 = _mm_add_ps(y2, dy);
161     }
162 }
163 
164 void lipol_ps::multiply_block_to(float *__restrict src, float *__restrict dst, unsigned int nquads)
165 {
166     __m128 y1, y2, dy;
167     initblock(y1, dy);
168     y2 = _mm_add_ps(y1, dy);
169     dy = _mm_mul_ps(dy, two);
170 
171     for (unsigned int i = 0; i < nquads; i += 2) // nquads must be multiple of 4
172     {
173         __m128 a = _mm_mul_ps(((__m128 *)src)[i], y1);
174         ((__m128 *)dst)[i] = a;
175         y1 = _mm_add_ps(y1, dy);
176 
177         __m128 b = _mm_mul_ps(((__m128 *)src)[i + 1], y2);
178         ((__m128 *)dst)[i + 1] = b;
179         y2 = _mm_add_ps(y2, dy);
180     }
181 }
182 
183 void lipol_ps::multiply_2_blocks_to(float *__restrict src1, float *__restrict src2,
184                                     float *__restrict dst1, float *__restrict dst2,
185                                     unsigned int nquads)
186 {
187     __m128 y1, y2, dy;
188     initblock(y1, dy);
189     y2 = _mm_add_ps(y1, dy);
190     dy = _mm_mul_ps(dy, two);
191 
192     for (unsigned int i = 0; i < nquads; i += 2) // nquads must be multiple of 4
193     {
194         ((__m128 *)dst1)[i] = _mm_mul_ps(((__m128 *)src1)[i], y1);
195         ((__m128 *)dst2)[i] = _mm_mul_ps(((__m128 *)src2)[i], y1);
196         y1 = _mm_add_ps(y1, dy);
197         ((__m128 *)dst1)[i + 1] = _mm_mul_ps(((__m128 *)src1)[i + 1], y2);
198         ((__m128 *)dst2)[i + 1] = _mm_mul_ps(((__m128 *)src2)[i + 1], y2);
199         y2 = _mm_add_ps(y2, dy);
200     }
201 }
202 
203 void lipol_ps::trixpan_blocks(float *__restrict L, float *__restrict R, float *__restrict dL,
204                               float *__restrict dR, unsigned int nquads)
205 {
206     __m128 y, dy;
207     initblock(y, dy);
208 
209     for (unsigned int i = 0; i < nquads; i++)
210     {
211         __m128 a = _mm_max_ps(m128_zero, y);
212         __m128 b = _mm_min_ps(m128_zero, y);
213         __m128 tL = _mm_sub_ps(_mm_mul_ps(_mm_sub_ps(m128_one, a), ((__m128 *)L)[i]),
214                                _mm_mul_ps(b, ((__m128 *)R)[i])); // L = (1-a)*L - b*R
215         __m128 tR =
216             _mm_add_ps(_mm_mul_ps(a, ((__m128 *)L)[i]),
217                        _mm_mul_ps(_mm_add_ps(m128_one, b), ((__m128 *)R)[i])); // R = a*L + (1+b)*R
218         ((__m128 *)dL)[i] = tL;
219         ((__m128 *)dR)[i] = tR;
220         y = _mm_add_ps(y, dy);
221     }
222 }
223 
224 void lipol_ps::fade_block_to(float *__restrict src1, float *__restrict src2, float *__restrict dst,
225                              unsigned int nquads)
226 {
227     __m128 y1, y2, dy;
228     initblock(y1, dy);
229     y2 = _mm_add_ps(y1, dy);
230     dy = _mm_mul_ps(dy, two);
231 
232     for (unsigned int i = 0; i < nquads; i += 2) // nquads must be multiple of 4
233     {
234         __m128 a = _mm_mul_ps(((__m128 *)src1)[i], _mm_sub_ps(m128_one, y1));
235         __m128 b = _mm_mul_ps(((__m128 *)src2)[i], y1);
236         ((__m128 *)dst)[i] = _mm_add_ps(a, b);
237         y1 = _mm_add_ps(y1, dy);
238 
239         a = _mm_mul_ps(((__m128 *)src1)[i + 1], _mm_sub_ps(m128_one, y2));
240         b = _mm_mul_ps(((__m128 *)src2)[i + 1], y2);
241         ((__m128 *)dst)[i + 1] = _mm_add_ps(a, b);
242         y2 = _mm_add_ps(y2, dy);
243     }
244 }
245 
246 void lipol_ps::fade_2_blocks_to(float *__restrict src11, float *__restrict src12,
247                                 float *__restrict src21, float *__restrict src22,
248                                 float *__restrict dst1, float *__restrict dst2, unsigned int nquads)
249 {
250     __m128 y1, y2, dy;
251     initblock(y1, dy);
252     y2 = _mm_add_ps(y1, dy);
253     dy = _mm_mul_ps(dy, two);
254 
255     for (unsigned int i = 0; i < nquads; i += 2) // nquads must be multiple of 4
256     {
257         __m128 a = _mm_mul_ps(((__m128 *)src11)[i], _mm_sub_ps(m128_one, y1));
258         __m128 b = _mm_mul_ps(((__m128 *)src12)[i], y1);
259         ((__m128 *)dst1)[i] = _mm_add_ps(a, b);
260         a = _mm_mul_ps(((__m128 *)src21)[i], _mm_sub_ps(m128_one, y1));
261         b = _mm_mul_ps(((__m128 *)src22)[i], y1);
262         ((__m128 *)dst2)[i] = _mm_add_ps(a, b);
263         y1 = _mm_add_ps(y1, dy);
264 
265         a = _mm_mul_ps(((__m128 *)src11)[i + 1], _mm_sub_ps(m128_one, y2));
266         b = _mm_mul_ps(((__m128 *)src12)[i + 1], y2);
267         ((__m128 *)dst1)[i + 1] = _mm_add_ps(a, b);
268         a = _mm_mul_ps(((__m128 *)src21)[i + 1], _mm_sub_ps(m128_one, y2));
269         b = _mm_mul_ps(((__m128 *)src22)[i + 1], y2);
270         ((__m128 *)dst2)[i + 1] = _mm_add_ps(a, b);
271         y2 = _mm_add_ps(y2, dy);
272     }
273 }
274