1 #include "lipol.h"
2
3 const __m128 two = _mm_set1_ps(2.f);
4 const __m128 four = _mm_set1_ps(4.f);
5
6 lipol_ps::lipol_ps()
7 {
8 target = _mm_setzero_ps();
9 currentval = _mm_setzero_ps();
10 coef = _mm_set1_ps(0.25f);
11 coef_m1 = _mm_sub_ss(m128_one, coef);
12 m128_lipolstarter = _mm_set_ps(1.f, 0.75f, 0.5f, 0.25f);
13 set_blocksize(64);
14 }
15
16 void lipol_ps::set_blocksize(int bs)
17 {
18 lipol_BLOCK_SIZE = _mm_cvt_si2ss(m128_zero, bs);
19 m128_bs4_inv = _mm_div_ss(m128_four, lipol_BLOCK_SIZE);
setenv(const char * name,const char * value,int overwrite)20 }
21
22 void lipol_ps::multiply_block(float *src, unsigned int nquads)
23 {
24 __m128 y1, y2, dy;
25 initblock(y1, dy);
26 y2 = _mm_add_ps(y1, dy);
27 dy = _mm_mul_ps(dy, two);
28
29 unsigned int n = nquads << 2;
30 for (unsigned int i = 0; (i < n); i += 8) // nquads must be multiple of 4
31 {
32 __m128 a = _mm_mul_ps(_mm_load_ps(src + i), y1);
33 _mm_store_ps(src + i, a);
34 y1 = _mm_add_ps(y1, dy);
35 __m128 b = _mm_mul_ps(_mm_load_ps(src + i + 4), y2);
36 _mm_store_ps(src + i + 4, b);
37 y2 = _mm_add_ps(y2, dy);
38 }
39 }
40
41 void lipol_ps::multiply_block_sat1(float *src, unsigned int nquads)
42 {
43 __m128 y1, y2, dy;
44 initblock(y1, dy);
45 y2 = _mm_add_ps(y1, dy);
46 dy = _mm_mul_ps(dy, two);
47
48 const __m128 satv = _mm_set1_ps(1.0f);
49
50 for (unsigned int i = 0; (i < nquads << 2); i += 8) // nquads must be multiple of 4
51 {
52 _mm_store_ps(src + i, _mm_mul_ps(_mm_load_ps(src + i), y1));
53 y1 = _mm_min_ps(satv, _mm_add_ps(y1, dy));
54 _mm_store_ps(src + i + 4, _mm_mul_ps(_mm_load_ps(src + i + 4), y2));
55 y2 = _mm_min_ps(satv, _mm_add_ps(y2, dy));
56 }
57 }
58
59 void lipol_ps::store_block(float *dst, unsigned int nquads)
60 {
61 __m128 y1, y2, dy;
62 initblock(y1, dy);
63 y2 = _mm_add_ps(y1, dy);
64 dy = _mm_mul_ps(dy, two);
65
66 for (unsigned int i = 0; i < nquads << 2; i += 8) // nquads must be multiple of 4
67 {
68 _mm_store_ps(dst + i, y1);
69 y1 = _mm_add_ps(y1, dy);
70 _mm_store_ps(dst + i + 4, y2);
71 y2 = _mm_add_ps(y2, dy);
72 }
73 }
74
75 void lipol_ps::add_block(float *src, unsigned int nquads)
76 {
77 __m128 y1, y2, dy;
78 initblock(y1, dy);
79 y2 = _mm_add_ps(y1, dy);
80 dy = _mm_mul_ps(dy, two);
81
82 for (unsigned int i = 0; i < nquads; i += 2) // nquads must be multiple of 4
83 {
84 ((__m128 *)src)[i] = _mm_add_ps(((__m128 *)src)[i], y1);
85 y1 = _mm_add_ps(y1, dy);
86 ((__m128 *)src)[i + 1] = _mm_add_ps(((__m128 *)src)[i + 1], y2);
87 y2 = _mm_add_ps(y2, dy);
88 }
89 }
90
91 void lipol_ps::subtract_block(float *src, unsigned int nquads)
92 {
93 __m128 y1, y2, dy;
94 initblock(y1, dy);
95 y2 = _mm_add_ps(y1, dy);
96 dy = _mm_mul_ps(dy, two);
97
98 for (unsigned int i = 0; i < nquads; i += 2) // nquads must be multiple of 4
99 {
100 ((__m128 *)src)[i] = _mm_sub_ps(((__m128 *)src)[i], y1);
101 y1 = _mm_add_ps(y1, dy);
102 ((__m128 *)src)[i + 1] = _mm_sub_ps(((__m128 *)src)[i + 1], y2);
103 y2 = _mm_add_ps(y2, dy);
104 }
105 }
106
107 void lipol_ps::multiply_2_blocks(float *__restrict src1, float *__restrict src2,
108 unsigned int nquads)
109 {
110 __m128 y1, y2, dy;
111 initblock(y1, dy);
112 y2 = _mm_add_ps(y1, dy);
113 dy = _mm_mul_ps(dy, two);
114
115 for (unsigned int i = 0; i < nquads; i += 2) // nquads must be multiple of 4
116 {
117 ((__m128 *)src1)[i] = _mm_mul_ps(((__m128 *)src1)[i], y1);
118 ((__m128 *)src2)[i] = _mm_mul_ps(((__m128 *)src2)[i], y1);
119 y1 = _mm_add_ps(y1, dy);
120 ((__m128 *)src1)[i + 1] = _mm_mul_ps(((__m128 *)src1)[i + 1], y2);
121 ((__m128 *)src2)[i + 1] = _mm_mul_ps(((__m128 *)src2)[i + 1], y2);
122 y2 = _mm_add_ps(y2, dy);
123 }
124 }
125
126 void lipol_ps::MAC_block_to(float *__restrict src, float *__restrict dst, unsigned int nquads)
127 {
128 __m128 y1, y2, dy;
129 initblock(y1, dy);
130 y2 = _mm_add_ps(y1, dy);
131 dy = _mm_mul_ps(dy, two);
132
133 for (unsigned int i = 0; i < nquads; i += 2) // nquads must be multiple of 4
134 {
135 ((__m128 *)dst)[i] = _mm_add_ps(((__m128 *)dst)[i], _mm_mul_ps(((__m128 *)src)[i], y1));
136 y1 = _mm_add_ps(y1, dy);
137 ((__m128 *)dst)[i + 1] =
138 _mm_add_ps(((__m128 *)dst)[i + 1], _mm_mul_ps(((__m128 *)src)[i + 1], y2));
139 y2 = _mm_add_ps(y2, dy);
140 }
141 }
142
143 void lipol_ps::MAC_2_blocks_to(float *__restrict src1, float *__restrict src2,
144 float *__restrict dst1, float *__restrict dst2, unsigned int nquads)
145 {
146 __m128 y1, y2, dy;
147 initblock(y1, dy);
148 y2 = _mm_add_ps(y1, dy);
149 dy = _mm_mul_ps(dy, two);
150
151 for (unsigned int i = 0; i < nquads; i += 2) // nquads must be multiple of 4
152 {
153 ((__m128 *)dst1)[i] = _mm_add_ps(((__m128 *)dst1)[i], _mm_mul_ps(((__m128 *)src1)[i], y1));
154 ((__m128 *)dst2)[i] = _mm_add_ps(((__m128 *)dst2)[i], _mm_mul_ps(((__m128 *)src2)[i], y1));
155 y1 = _mm_add_ps(y1, dy);
156 ((__m128 *)dst1)[i + 1] =
157 _mm_add_ps(((__m128 *)dst1)[i + 1], _mm_mul_ps(((__m128 *)src1)[i + 1], y2));
158 ((__m128 *)dst2)[i + 1] =
159 _mm_add_ps(((__m128 *)dst2)[i + 1], _mm_mul_ps(((__m128 *)src2)[i + 1], y2));
160 y2 = _mm_add_ps(y2, dy);
161 }
162 }
163
164 void lipol_ps::multiply_block_to(float *__restrict src, float *__restrict dst, unsigned int nquads)
165 {
166 __m128 y1, y2, dy;
167 initblock(y1, dy);
168 y2 = _mm_add_ps(y1, dy);
169 dy = _mm_mul_ps(dy, two);
170
171 for (unsigned int i = 0; i < nquads; i += 2) // nquads must be multiple of 4
172 {
173 __m128 a = _mm_mul_ps(((__m128 *)src)[i], y1);
174 ((__m128 *)dst)[i] = a;
175 y1 = _mm_add_ps(y1, dy);
176
177 __m128 b = _mm_mul_ps(((__m128 *)src)[i + 1], y2);
178 ((__m128 *)dst)[i + 1] = b;
179 y2 = _mm_add_ps(y2, dy);
180 }
181 }
182
183 void lipol_ps::multiply_2_blocks_to(float *__restrict src1, float *__restrict src2,
184 float *__restrict dst1, float *__restrict dst2,
185 unsigned int nquads)
186 {
187 __m128 y1, y2, dy;
188 initblock(y1, dy);
189 y2 = _mm_add_ps(y1, dy);
190 dy = _mm_mul_ps(dy, two);
191
192 for (unsigned int i = 0; i < nquads; i += 2) // nquads must be multiple of 4
193 {
194 ((__m128 *)dst1)[i] = _mm_mul_ps(((__m128 *)src1)[i], y1);
195 ((__m128 *)dst2)[i] = _mm_mul_ps(((__m128 *)src2)[i], y1);
196 y1 = _mm_add_ps(y1, dy);
197 ((__m128 *)dst1)[i + 1] = _mm_mul_ps(((__m128 *)src1)[i + 1], y2);
198 ((__m128 *)dst2)[i + 1] = _mm_mul_ps(((__m128 *)src2)[i + 1], y2);
199 y2 = _mm_add_ps(y2, dy);
200 }
201 }
202
203 void lipol_ps::trixpan_blocks(float *__restrict L, float *__restrict R, float *__restrict dL,
204 float *__restrict dR, unsigned int nquads)
205 {
206 __m128 y, dy;
207 initblock(y, dy);
208
209 for (unsigned int i = 0; i < nquads; i++)
210 {
211 __m128 a = _mm_max_ps(m128_zero, y);
212 __m128 b = _mm_min_ps(m128_zero, y);
213 __m128 tL = _mm_sub_ps(_mm_mul_ps(_mm_sub_ps(m128_one, a), ((__m128 *)L)[i]),
214 _mm_mul_ps(b, ((__m128 *)R)[i])); // L = (1-a)*L - b*R
215 __m128 tR =
216 _mm_add_ps(_mm_mul_ps(a, ((__m128 *)L)[i]),
217 _mm_mul_ps(_mm_add_ps(m128_one, b), ((__m128 *)R)[i])); // R = a*L + (1+b)*R
218 ((__m128 *)dL)[i] = tL;
219 ((__m128 *)dR)[i] = tR;
220 y = _mm_add_ps(y, dy);
221 }
222 }
223
224 void lipol_ps::fade_block_to(float *__restrict src1, float *__restrict src2, float *__restrict dst,
225 unsigned int nquads)
226 {
227 __m128 y1, y2, dy;
228 initblock(y1, dy);
229 y2 = _mm_add_ps(y1, dy);
230 dy = _mm_mul_ps(dy, two);
231
232 for (unsigned int i = 0; i < nquads; i += 2) // nquads must be multiple of 4
233 {
234 __m128 a = _mm_mul_ps(((__m128 *)src1)[i], _mm_sub_ps(m128_one, y1));
235 __m128 b = _mm_mul_ps(((__m128 *)src2)[i], y1);
236 ((__m128 *)dst)[i] = _mm_add_ps(a, b);
237 y1 = _mm_add_ps(y1, dy);
238
239 a = _mm_mul_ps(((__m128 *)src1)[i + 1], _mm_sub_ps(m128_one, y2));
240 b = _mm_mul_ps(((__m128 *)src2)[i + 1], y2);
241 ((__m128 *)dst)[i + 1] = _mm_add_ps(a, b);
242 y2 = _mm_add_ps(y2, dy);
243 }
244 }
245
246 void lipol_ps::fade_2_blocks_to(float *__restrict src11, float *__restrict src12,
247 float *__restrict src21, float *__restrict src22,
248 float *__restrict dst1, float *__restrict dst2, unsigned int nquads)
249 {
250 __m128 y1, y2, dy;
251 initblock(y1, dy);
252 y2 = _mm_add_ps(y1, dy);
253 dy = _mm_mul_ps(dy, two);
254
255 for (unsigned int i = 0; i < nquads; i += 2) // nquads must be multiple of 4
256 {
257 __m128 a = _mm_mul_ps(((__m128 *)src11)[i], _mm_sub_ps(m128_one, y1));
258 __m128 b = _mm_mul_ps(((__m128 *)src12)[i], y1);
259 ((__m128 *)dst1)[i] = _mm_add_ps(a, b);
260 a = _mm_mul_ps(((__m128 *)src21)[i], _mm_sub_ps(m128_one, y1));
261 b = _mm_mul_ps(((__m128 *)src22)[i], y1);
262 ((__m128 *)dst2)[i] = _mm_add_ps(a, b);
263 y1 = _mm_add_ps(y1, dy);
264
265 a = _mm_mul_ps(((__m128 *)src11)[i + 1], _mm_sub_ps(m128_one, y2));
266 b = _mm_mul_ps(((__m128 *)src12)[i + 1], y2);
267 ((__m128 *)dst1)[i + 1] = _mm_add_ps(a, b);
268 a = _mm_mul_ps(((__m128 *)src21)[i + 1], _mm_sub_ps(m128_one, y2));
269 b = _mm_mul_ps(((__m128 *)src22)[i + 1], y2);
270 ((__m128 *)dst2)[i + 1] = _mm_add_ps(a, b);
271 y2 = _mm_add_ps(y2, dy);
272 }
273 }
274