1 /*****************************************************************
2 * gavl - a general purpose audio/video processing library
3 *
4 * Copyright (c) 2001 - 2011 Members of the Gmerlin project
5 * gmerlin-general@lists.sourceforge.net
6 * http://gmerlin.sourceforge.net
7 *
8 * This program is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program. If not, see <http://www.gnu.org/licenses/>.
20 * *****************************************************************/
21
22 /* SSE Optimized scaling (x) */
23
24 #include <config.h>
25 #include <attributes.h>
26
27 #include <stdio.h>
28 #include <gavl/gavl.h>
29 #include <video.h>
30 #include <scale.h>
31
32 #include "../sse/sse.h"
33
34 static void
scale_float_x_4_x_bilinear_sse(gavl_video_scale_context_t * ctx,int scanline,uint8_t * dest_start)35 scale_float_x_4_x_bilinear_sse(gavl_video_scale_context_t * ctx, int scanline, uint8_t * dest_start)
36 {
37 int i, imax;
38 uint8_t * src, * dst, *src_start;
39
40 imax = ctx->dst_size;
41
42 src_start = ctx->src + scanline * ctx->src_stride;
43 dst = dest_start;
44
45 for(i = 0; i < imax; i++)
46 {
47 src = src_start + 16*ctx->table_h.pixels[i].index;
48
49 /* Load factor */
50 movss_m2r(ctx->table_h.pixels[i].factor_f[0], xmm0);
51 shufps_r2ri(xmm0, xmm0, 0x00);
52
53 /* Load src1 */
54 movaps_m2r(*src, xmm5);
55
56 /* Load src2 */
57 movaps_m2r(*(src+16), xmm4);
58
59 /* xmm4 = src1 - src2 */
60 subps_r2r(xmm4, xmm5);
61
62 /* xmm4 = (src1 - src2)*f */
63 mulps_r2r(xmm0, xmm5);
64
65 /* xmm4 = (src1 - src2)*f + src2 */
66
67 addps_r2r(xmm4, xmm5);
68
69 /* Store */
70 movaps_r2m(xmm5, *dst);
71 dst += 16;
72 }
73 }
74
75 static void
scale_float_x_4_x_quadratic_sse(gavl_video_scale_context_t * ctx,int scanline,uint8_t * dest_start)76 scale_float_x_4_x_quadratic_sse(gavl_video_scale_context_t * ctx, int scanline, uint8_t * dest_start)
77 {
78 int i, imax;
79 uint8_t * src, * dst, *src_start;
80
81 imax = ctx->dst_size;
82
83 src_start = ctx->src + scanline * ctx->src_stride;
84 dst = dest_start;
85
86 for(i = 0; i < imax; i++)
87 {
88 xorps_r2r(xmm5, xmm5);
89 src = src_start + 16*ctx->table_h.pixels[i].index;
90
91 /* Load factor */
92 movss_m2r(ctx->table_h.pixels[i].factor_f[0], xmm0);
93 shufps_r2ri(xmm0, xmm0, 0x00);
94
95 /* Load src1 */
96 movaps_m2r(*src, xmm4);
97
98 /* Multiply */
99 mulps_r2r(xmm0, xmm4);
100
101 /* Add */
102 addps_r2r(xmm4, xmm5);
103
104 /* Load factor */
105 movss_m2r(ctx->table_h.pixels[i].factor_f[1], xmm0);
106 shufps_r2ri(xmm0, xmm0, 0x00);
107
108 /* Load src1 */
109 movaps_m2r(*(src+16), xmm4);
110
111 /* Multiply */
112 mulps_r2r(xmm0, xmm4);
113
114 /* Add */
115 addps_r2r(xmm4, xmm5);
116
117 /* Load factor */
118 movss_m2r(ctx->table_h.pixels[i].factor_f[2], xmm0);
119 shufps_r2ri(xmm0, xmm0, 0x00);
120
121 /* Load src1 */
122 movaps_m2r(*(src+32), xmm4);
123
124 /* Multiply */
125 mulps_r2r(xmm0, xmm4);
126
127 /* Add */
128 addps_r2r(xmm4, xmm5);
129
130 /* Store */
131 movaps_r2m(xmm5, *dst);
132 dst += 16;
133
134 }
135
136 }
137
138 static void
scale_float_x_4_x_bicubic_sse(gavl_video_scale_context_t * ctx,int scanline,uint8_t * dest_start)139 scale_float_x_4_x_bicubic_sse(gavl_video_scale_context_t * ctx, int scanline, uint8_t * dest_start)
140 {
141 int i, imax;
142 uint8_t * src, * dst, *src_start;
143
144 imax = ctx->dst_size;
145
146 src_start = ctx->src + scanline * ctx->src_stride;
147 dst = dest_start;
148
149 movups_m2r(ctx->min_values_f[0], xmm6);
150 movups_m2r(ctx->max_values_f[0], xmm7);
151
152 for(i = 0; i < imax; i++)
153 {
154 xorps_r2r(xmm5, xmm5);
155 src = src_start + 16*ctx->table_h.pixels[i].index;
156
157 /* Load factor */
158 movss_m2r(ctx->table_h.pixels[i].factor_f[0], xmm0);
159 shufps_r2ri(xmm0, xmm0, 0x00);
160
161 /* Load src1 */
162 movaps_m2r(*src, xmm4);
163
164 /* Multiply */
165 mulps_r2r(xmm0, xmm4);
166
167 /* Add */
168 addps_r2r(xmm4, xmm5);
169
170 /* Load factor */
171 movss_m2r(ctx->table_h.pixels[i].factor_f[1], xmm0);
172 shufps_r2ri(xmm0, xmm0, 0x00);
173
174 /* Load src1 */
175 movaps_m2r(*(src+16), xmm4);
176
177 /* Multiply */
178 mulps_r2r(xmm0, xmm4);
179
180 /* Add */
181 addps_r2r(xmm4, xmm5);
182
183 /* Load factor */
184 movss_m2r(ctx->table_h.pixels[i].factor_f[2], xmm0);
185 shufps_r2ri(xmm0, xmm0, 0x00);
186
187 /* Load src1 */
188 movaps_m2r(*(src+32), xmm4);
189
190 /* Multiply */
191 mulps_r2r(xmm0, xmm4);
192
193 /* Add */
194 addps_r2r(xmm4, xmm5);
195
196 /* Load factor */
197 movss_m2r(ctx->table_h.pixels[i].factor_f[3], xmm0);
198 shufps_r2ri(xmm0, xmm0, 0x00);
199
200 /* Load src1 */
201 movaps_m2r(*(src+48), xmm4);
202
203 /* Multiply */
204 mulps_r2r(xmm0, xmm4);
205
206 /* Add */
207 addps_r2r(xmm4, xmm5);
208
209 /* Clip */
210 minps_r2r(xmm7, xmm5);
211 maxps_r2r(xmm6, xmm5);
212
213 /* Store */
214 movaps_r2m(xmm5, *dst);
215 dst += 16;
216
217 }
218
219 }
220
221 static void
scale_float_x_4_x_bicubic_noclip_sse(gavl_video_scale_context_t * ctx,int scanline,uint8_t * dest_start)222 scale_float_x_4_x_bicubic_noclip_sse(gavl_video_scale_context_t * ctx, int scanline, uint8_t * dest_start)
223 {
224 int i, imax;
225 uint8_t * src, * dst, *src_start;
226
227 imax = ctx->dst_size;
228
229 src_start = ctx->src + scanline * ctx->src_stride;
230 dst = dest_start;
231
232
233 for(i = 0; i < imax; i++)
234 {
235 xorps_r2r(xmm5, xmm5);
236 src = src_start + 16*ctx->table_h.pixels[i].index;
237
238 /* Load factor */
239 movss_m2r(ctx->table_h.pixels[i].factor_f[0], xmm0);
240 shufps_r2ri(xmm0, xmm0, 0x00);
241
242 /* Load src1 */
243 movaps_m2r(*src, xmm4);
244
245 /* Multiply */
246 mulps_r2r(xmm0, xmm4);
247
248 /* Add */
249 addps_r2r(xmm4, xmm5);
250
251 /* Load factor */
252 movss_m2r(ctx->table_h.pixels[i].factor_f[1], xmm0);
253 shufps_r2ri(xmm0, xmm0, 0x00);
254
255 /* Load src1 */
256 movaps_m2r(*(src+16), xmm4);
257
258 /* Multiply */
259 mulps_r2r(xmm0, xmm4);
260
261 /* Add */
262 addps_r2r(xmm4, xmm5);
263
264 /* Load factor */
265 movss_m2r(ctx->table_h.pixels[i].factor_f[2], xmm0);
266 shufps_r2ri(xmm0, xmm0, 0x00);
267
268 /* Load src1 */
269 movaps_m2r(*(src+32), xmm4);
270
271 /* Multiply */
272 mulps_r2r(xmm0, xmm4);
273
274 /* Add */
275 addps_r2r(xmm4, xmm5);
276
277 /* Load factor */
278 movss_m2r(ctx->table_h.pixels[i].factor_f[3], xmm0);
279 shufps_r2ri(xmm0, xmm0, 0x00);
280
281 /* Load src1 */
282 movaps_m2r(*(src+48), xmm4);
283
284 /* Multiply */
285 mulps_r2r(xmm0, xmm4);
286
287 /* Add */
288 addps_r2r(xmm4, xmm5);
289
290
291 /* Store */
292 movaps_r2m(xmm5, *dst);
293 dst += 16;
294
295 }
296
297 }
298
299 static void
scale_float_x_4_x_generic_sse(gavl_video_scale_context_t * ctx,int scanline,uint8_t * dest_start)300 scale_float_x_4_x_generic_sse(gavl_video_scale_context_t * ctx, int scanline, uint8_t * dest_start)
301 {
302 int i, imax, j;
303 uint8_t * src, * dst, *src_start;
304
305 imax = ctx->dst_size;
306
307 src_start = ctx->src + scanline * ctx->src_stride;
308 dst = dest_start;
309
310 movups_m2r(ctx->min_values_f[0], xmm6);
311 movups_m2r(ctx->max_values_f[0], xmm7);
312
313 for(i = 0; i < imax; i++)
314 {
315 xorps_r2r(xmm5, xmm5);
316 src = src_start + 16*ctx->table_h.pixels[i].index;
317
318 for(j = 0; j < ctx->table_h.factors_per_pixel; j++)
319 {
320 /* Load factor */
321 movss_m2r(ctx->table_h.pixels[i].factor_f[j], xmm0);
322 shufps_r2ri(xmm0, xmm0, 0x00);
323
324 /* Load src1 */
325 movaps_m2r(*src, xmm4);
326
327 /* Multiply */
328 mulps_r2r(xmm0, xmm4);
329
330 /* Add */
331 addps_r2r(xmm4, xmm5);
332
333 src += 16;
334 }
335
336 /* Clip */
337 minps_r2r(xmm7, xmm5);
338 maxps_r2r(xmm6, xmm5);
339
340 /* Store */
341 movaps_r2m(xmm5, *dst);
342 dst += 16;
343
344 }
345
346 }
347
#if 0
/*
 * Compiled-out placeholders: both functions are empty and nothing in this
 * part of the file refers to them.
 * NOTE(review): judging by the names these were meant to become bicubic
 * x-scalers (clipping / non-clipping) for a one-float-per-pixel format -
 * confirm before reviving or deleting them.
 */
static void
scale_float_x_1_x_bicubic_sse(gavl_video_scale_context_t * ctx, int scanline, uint8_t * dest_start)
{
}

static void
scale_float_x_1_x_bicubic_noclip_sse(gavl_video_scale_context_t * ctx, int scanline, uint8_t * dest_start)
{
}
#endif
361
/*
 * Register the SSE bilinear horizontal scaler: points
 * tab->funcs_x.scale_float_x_4 at scale_float_x_4_x_bilinear_sse.
 * No other entry of tab is written.
 */
void
gavl_init_scale_funcs_bilinear_x_sse(gavl_scale_funcs_t * tab)
{
  tab->funcs_x.scale_float_x_4 = &scale_float_x_4_x_bilinear_sse;
}
366
/*
 * Register the SSE quadratic (3-tap) horizontal scaler: points
 * tab->funcs_x.scale_float_x_4 at scale_float_x_4_x_quadratic_sse.
 * No other entry of tab is written.
 */
void
gavl_init_scale_funcs_quadratic_x_sse(gavl_scale_funcs_t * tab)
{
  tab->funcs_x.scale_float_x_4 = &scale_float_x_4_x_quadratic_sse;
}
371
/*
 * Register the SSE bicubic (4-tap, clipping) horizontal scaler: points
 * tab->funcs_x.scale_float_x_4 at scale_float_x_4_x_bicubic_sse.
 * No other entry of tab is written.
 */
void
gavl_init_scale_funcs_bicubic_x_sse(gavl_scale_funcs_t * tab)
{
  tab->funcs_x.scale_float_x_4 = &scale_float_x_4_x_bicubic_sse;
}
377
/*
 * Register the SSE bicubic (4-tap, non-clipping) horizontal scaler: points
 * tab->funcs_x.scale_float_x_4 at scale_float_x_4_x_bicubic_noclip_sse.
 * No other entry of tab is written.
 */
void
gavl_init_scale_funcs_bicubic_x_noclip_sse(gavl_scale_funcs_t * tab)
{
  tab->funcs_x.scale_float_x_4 = &scale_float_x_4_x_bicubic_noclip_sse;
}
383
/*
 * Register the SSE generic (any tap count, clipping) horizontal scaler:
 * points tab->funcs_x.scale_float_x_4 at scale_float_x_4_x_generic_sse.
 * No other entry of tab is written.
 */
void
gavl_init_scale_funcs_generic_x_sse(gavl_scale_funcs_t * tab)
{
  tab->funcs_x.scale_float_x_4 = &scale_float_x_4_x_generic_sse;
}
389