1 /*****************************************************************
2  * gavl - a general purpose audio/video processing library
3  *
4  * Copyright (c) 2001 - 2011 Members of the Gmerlin project
5  * gmerlin-general@lists.sourceforge.net
6  * http://gmerlin.sourceforge.net
7  *
8  * This program is free software: you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation, either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
20  * *****************************************************************/
21 
22 /* SSE Optimized scaling (linear, y) */
23 
24 
25 
26 static void (FUNC_NAME)(gavl_video_scale_context_t * ctx, int scanline, uint8_t * dest_start)
27   {
28   int i, imax;
29   uint8_t * src;
30   uint8_t * src_start;
31   uint8_t * dst;
32 
33   /* Load factor */
34   movss_m2r(ctx->table_v.pixels[scanline].factor_f[0], xmm0);
35   shufps_r2ri(xmm0, xmm0, 0x00);
36 
37   dst = dest_start;
38 
39   src_start =
40     ctx->src + ctx->table_v.pixels[scanline].index * ctx->src_stride;
41 
42   /* While source is not aligned... */
43   imax = (((long)(src_start)) % 16)/4;
44 
45   for(i = 0; i < imax; i++)
46     {
47     src = src_start;
48 
49     movss_m2r(*src, xmm1);
50     movss_m2r(*(src+ctx->src_stride), xmm2);
51     subss_r2r(xmm2, xmm1);
52     mulss_r2r(xmm0, xmm1);
53     addss_r2r(xmm2, xmm1);
54     movss_r2m(xmm1, *dst);
55     dst+=4;
56     src_start+=4;
57     }
58 
59   /* SSE routines scale 8 numbers (= 32 bytes) at once */
60   imax = (ctx->dst_size * 4 * WIDTH_MUL - (dst - dest_start)) / /* Bytes left */
61           (32); /* Bytes processed at once */
62 
63   for(i = 0; i < imax; i++)
64     {
65     src = src_start;
66 
67     movaps_m2r(*src, xmm1);
68     movaps_m2r(*(src+ctx->src_stride), xmm2);
69 
70     movaps_m2r(*(src+16), xmm3);
71     movaps_m2r(*(src+ctx->src_stride+16), xmm4);
72 
73     subps_r2r(xmm2, xmm1);
74     mulps_r2r(xmm0, xmm1);
75     addps_r2r(xmm2, xmm1);
76     movups_r2m(xmm1, *dst);
77 
78     subps_r2r(xmm4, xmm3);
79     mulps_r2r(xmm0, xmm3);
80     addps_r2r(xmm4, xmm3);
81     movups_r2m(xmm3, *(dst+16));
82 
83     dst += 32;
84     src_start += 32;
85     }
86 
87   imax = (ctx->dst_size * 4 * WIDTH_MUL - (dst - dest_start)) / 4;
88 
89   //  imax = (ctx->dst_size * WIDTH_MUL);
90 
91   if(!imax)
92     return;
93 
94   for(i = 0; i < imax; i++)
95     {
96     src = src_start;
97 
98     movss_m2r(*src, xmm1);
99     movss_m2r(*(src+ctx->src_stride), xmm2);
100     subss_r2r(xmm2, xmm1);
101     mulss_r2r(xmm0, xmm1);
102     addss_r2r(xmm2, xmm1);
103     movss_r2m(xmm1, *dst);
104 
105     dst+=4;
106     src_start+=4;
107     }
108   }
109 
/*
 * Remove the configuration macros again, so that a later definition of any
 * of them starts from a clean slate. #undef of a name that is not currently
 * defined as a macro is a no-op, so no #ifdef guards are required.
 */

/* Macros referenced by the function above */
#undef FUNC_NAME
#undef WIDTH_MUL

/* Names that are not referenced in this file
   (presumably part of a parameter set shared with other files - confirm) */
#undef NUM_TAPS
#undef ACCUM
#undef ACCUM_C
#undef OUTPUT
#undef OUTPUT_C
#undef INIT
#undef INIT_C
#undef INIT_GLOBAL
127 
128