1 /*****************************************************************
2 * gavl - a general purpose audio/video processing library
3 *
4 * Copyright (c) 2001 - 2011 Members of the Gmerlin project
5 * gmerlin-general@lists.sourceforge.net
6 * http://gmerlin.sourceforge.net
7 *
8 * This program is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation, either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program. If not, see <http://www.gnu.org/licenses/>.
20 * *****************************************************************/
21
22 #include <config.h>
23
24 #include <stdio.h>
25 #include <gavl/gavl.h>
26 #include <gavl/gavldsp.h>
27 #include <dsp.h>
28 #include <attributes.h>
29 #include "mmx.h"
30
/*
 *  MOVQ_R2M(reg, mem): store a 64 bit MMX register to memory.
 *
 *  Builds with MMXEXT defined route the store through movntq_r2m(),
 *  all other builds through movq_r2m(). Both macros come from mmx.h.
 *  NOTE(review): movntq is presumably the cache-bypassing (non-temporal)
 *  store variant -- confirm against mmx.h.
 */
#if defined(MMXEXT)
#  define MOVQ_R2M(reg,mem) movntq_r2m(reg, mem)
#else
#  define MOVQ_R2M(reg,mem) movq_r2m(reg, mem)
#endif
36
#if 0
/*
 *  Debugging aid (disabled by default): print the contents of an MMX
 *  register to stderr as a 64 bit hex number.
 *
 *  The macro is wrapped in do { } while(0) so that it behaves like a
 *  single statement (e.g. inside an unbraced if), and the value is cast
 *  to match the %llx conversion.
 */
static mmx_t mm_tmp;
#define DUMP_MM(name, reg) \
  do \
    { \
    MOVQ_R2M(reg, mm_tmp); \
    fprintf(stderr, "%s: %016llx\n", name, (unsigned long long)mm_tmp.q); \
    } while(0)
#endif
42
43
44
interpolate_8_mmx(const uint8_t * src_1,const uint8_t * src_2,uint8_t * dst,int num,float fac)45 static void interpolate_8_mmx(const uint8_t * src_1, const uint8_t * src_2,
46 uint8_t * dst, int num, float fac)
47 {
48 int i, imax;
49 int32_t tmp;
50 int32_t fac_i;
51 int32_t anti_fac;
52
53 fac_i = (float)(fac * 0x4000 + 0.5);
54 anti_fac = 0x4000 - fac_i;
55
56 // fprintf(stderr, "interpolate_8_mmx %d %d\n", fac, anti_fac);
57
58 imax = num / 8;
59 // imax = 0;
60
61 /* Load factors */
62
63 /*
64 * mm0: Input1
65 * mm1: Input2
66 * mm2: Factor1
67 * mm3: Factor1
68 * mm4: Output1
69 * mm5: Output2
70 * mm6: Scratch
71 * mm7: 0
72 */
73
74 pxor_r2r(mm7, mm7);
75
76 /* Load factor1 */
77 movd_m2r(fac_i, mm2);
78 movq_r2r(mm2, mm6);
79 psllq_i2r(16, mm6);
80 por_r2r(mm6, mm2);
81 movq_r2r(mm2, mm6);
82 psllq_i2r(32, mm6);
83 por_r2r(mm6, mm2);
84 /* Load factor2 */
85 movd_m2r(anti_fac, mm3);
86 movq_r2r(mm3, mm6);
87 psllq_i2r(16, mm6);
88 por_r2r(mm6, mm3);
89 movq_r2r(mm3, mm6);
90 psllq_i2r(32, mm6);
91 por_r2r(mm6, mm3);
92
93 for(i = 0; i < imax; i++)
94 {
95 /* Load input 1 */
96 movq_m2r(*src_1,mm0);
97 movq_r2r(mm0,mm1);
98 punpcklbw_r2r(mm7, mm0);
99 punpckhbw_r2r(mm7, mm1);
100 psllw_i2r(7, mm0);
101 psllw_i2r(7, mm1);
102
103 /* Accumulate mm0 */
104 pmulhw_r2r(mm2, mm0);
105 movq_r2r(mm0, mm4);
106 /* Accumulate mm1 */
107 pmulhw_r2r(mm2, mm1);
108 movq_r2r(mm1, mm5);
109
110 /* Load input 2 */
111 movq_m2r(*(src_2),mm0);
112 movq_r2r(mm0,mm1);
113 punpcklbw_r2r(mm7, mm0);
114 punpckhbw_r2r(mm7, mm1);
115 psllw_i2r(7, mm0);
116 psllw_i2r(7, mm1);
117
118 /* Accumulate mm0 */
119 pmulhw_r2r(mm3, mm0);
120 paddsw_r2r(mm0, mm4);
121 /* Accumulate mm1 */
122 pmulhw_r2r(mm3, mm1);
123 paddsw_r2r(mm1, mm5);
124
125 psraw_i2r(5, mm4);
126 psraw_i2r(5, mm5);
127
128 packuswb_r2r(mm5, mm4);
129
130 MOVQ_R2M(mm4, *dst);
131
132 dst += 8;
133 src_1 += 8;
134 src_2 += 8;
135 }
136
137 emms();
138
139 imax = num % 8;
140 // imax = num;
141
142 if(!imax)
143 return;
144
145 for(i = 0; i < imax; i++)
146 {
147 tmp = (*src_1 * fac_i + *src_2 * anti_fac) >> 15;
148 *dst = (uint8_t)((tmp & ~0xFF)?((-tmp) >> 31) : tmp);
149 /* Accum */
150 dst++;
151 src_1++;
152 src_2++;
153 }
154
155
156 }
157
/*
 *  Install the MMX routines of this file into a DSP function table.
 *
 *  funcs:   function table to update
 *  quality: requested quality level
 *
 *  The MMX interpolation routine is only installed for quality levels
 *  below 3. For 3 and above the table entry is left untouched.
 *  NOTE(review): presumably because the 14 bit fixed point arithmetic
 *  is less accurate than the default routine -- confirm.
 */
void gavl_dsp_init_mmx(gavl_dsp_funcs_t * funcs,
                       int quality)
  {
  if(quality >= 3)
    return;

  funcs->interpolate_8 = interpolate_8_mmx;
  }
164