/*****************************************************************************
 * merge.c : Merge (line blending) routines for the VLC deinterlacer
 *****************************************************************************
 * Copyright (C) 2011 VLC authors and VideoLAN
 * $Id: 94cdd775ac70a548f23b3efbca65d6259d77ca39 $
 *
 * Author: Sam Hocevar <sam@zoy.org> (generic C routine)
 *         Sigmund Augdal Helberg <sigmunau@videolan.org> (MMXEXT, 3DNow, SSE2)
 *         Eric Petit <eric.petit@lapsus.org> (Altivec)
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 *****************************************************************************/

#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

#include <stdlib.h>
#include <stdint.h>

#include <vlc_common.h>
#include <vlc_cpu.h>
#include "merge.h"

#ifdef CAN_COMPILE_MMXEXT
# include "mmx.h"
#endif

#ifdef HAVE_ALTIVEC_H
# undef bool
# include <altivec.h>
# define bool _Bool
#endif

/*****************************************************************************
 * Merge (line blending) routines
 *****************************************************************************/

/**
 * Blend two lines of 8-bit samples into the destination.
 *
 * Each output byte is the truncated average ((a + b) >> 1) of the
 * corresponding input bytes.  Generic C fallback, used when no SIMD
 * variant is available for the host CPU.
 *
 * \param _p_dest destination line (i_bytes bytes)
 * \param _p_s1   first source line
 * \param _p_s2   second source line
 * \param i_bytes number of bytes to blend
 */
void Merge8BitGeneric( void *_p_dest, const void *_p_s1,
                       const void *_p_s2, size_t i_bytes )
{
    uint8_t *p_out = _p_dest;
    const uint8_t *p_in1 = _p_s1;
    const uint8_t *p_in2 = _p_s2;

    for( size_t i = 0; i < i_bytes; i++ )
        p_out[i] = ( p_in1[i] + p_in2[i] ) >> 1;
}

/**
 * Blend two lines of 16-bit samples into the destination.
 *
 * Each output word is the truncated average ((a + b) >> 1) of the
 * corresponding input words.  i_bytes is still given in bytes; an odd
 * trailing byte, if any, is left untouched.
 *
 * \param _p_dest destination line
 * \param _p_s1   first source line
 * \param _p_s2   second source line
 * \param i_bytes number of bytes (not words) to blend
 */
void Merge16BitGeneric( void *_p_dest, const void *_p_s1,
                        const void *_p_s2, size_t i_bytes )
{
    uint16_t *p_out = _p_dest;
    const uint16_t *p_in1 = _p_s1;
    const uint16_t *p_in2 = _p_s2;
    const size_t i_words = i_bytes / 2;

    for( size_t i = 0; i < i_words; i++ )
        p_out[i] = ( p_in1[i] + p_in2[i] ) >> 1;
}

#if defined(CAN_COMPILE_MMXEXT)
VLC_MMX
/**
 * Blend two lines of 8-bit samples using the MMXEXT "pavgb" instruction,
 * 8 bytes per iteration, with a scalar C loop for the remainder.
 *
 * NOTE(review): pavgb computes (a + b + 1) >> 1 (rounds half up, per the
 * Intel instruction set reference) while the scalar tail computes
 * (a + b) >> 1 (rounds down), so the two paths can differ by 1 on odd
 * sums — harmless for line blending but not bit-identical.
 *
 * The caller is expected to call EndMMX() afterwards to leave MMX mode
 * (the MMX registers alias the x87 FPU stack).
 *
 * \param _p_dest destination line
 * \param _p_s1   first source line
 * \param _p_s2   second source line
 * \param i_bytes number of bytes to blend
 */
void MergeMMXEXT( void *_p_dest, const void *_p_s1, const void *_p_s2,
                  size_t i_bytes )
{
    uint8_t *p_dest = _p_dest;
    const uint8_t *p_s1 = _p_s1;
    const uint8_t *p_s2 = _p_s2;

    /* mm1 = s2; mm1 = pavgb(mm1, s1); *p_dest = mm1 — 8 bytes at a time */
    for( ; i_bytes >= 8; i_bytes -= 8 )
    {
        __asm__ __volatile__( "movq %2,%%mm1;"
                              "pavgb %1, %%mm1;"
                              "movq %%mm1, %0" :"=m" (*p_dest):
                                                "m" (*p_s1),
                                                "m" (*p_s2) : "mm1" );
        p_dest += 8;
        p_s1 += 8;
        p_s2 += 8;
    }

    /* Scalar tail for the last (i_bytes % 8) bytes */
    for( ; i_bytes > 0; i_bytes-- )
        *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
}
#endif

#if defined(CAN_COMPILE_3DNOW)
VLC_MMX
/**
 * Blend two lines of 8-bit samples using the AMD 3DNow! "pavgusb"
 * instruction, 8 bytes per iteration, with a scalar C loop for the
 * remainder.
 *
 * NOTE(review): pavgusb, like pavgb, computes (a + b + 1) >> 1 (rounds
 * half up) while the scalar tail rounds down — a harmless off-by-one on
 * odd sums.
 *
 * The caller is expected to call End3DNow() afterwards ("femms") to
 * leave MMX mode.
 *
 * \param _p_dest destination line
 * \param _p_s1   first source line
 * \param _p_s2   second source line
 * \param i_bytes number of bytes to blend
 */
void Merge3DNow( void *_p_dest, const void *_p_s1, const void *_p_s2,
                 size_t i_bytes )
{
    uint8_t *p_dest = _p_dest;
    const uint8_t *p_s1 = _p_s1;
    const uint8_t *p_s2 = _p_s2;

    /* mm1 = s2; mm1 = pavgusb(mm1, s1); *p_dest = mm1 — 8 bytes at a time */
    for( ; i_bytes >= 8; i_bytes -= 8 )
    {
        __asm__ __volatile__( "movq %2,%%mm1;"
                              "pavgusb %1, %%mm1;"
                              "movq %%mm1, %0" :"=m" (*p_dest):
                                                "m" (*p_s1),
                                                "m" (*p_s2) : "mm1" );
        p_dest += 8;
        p_s1 += 8;
        p_s2 += 8;
    }

    /* Scalar tail for the last (i_bytes % 8) bytes */
    for( ; i_bytes > 0; i_bytes-- )
        *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
}
#endif

#if defined(CAN_COMPILE_SSE)
VLC_SSE
/**
 * Blend two lines of 8-bit samples using the SSE2 "pavgb" instruction on
 * XMM registers, 16 bytes per iteration.
 *
 * A scalar peel loop first advances until p_s1 is 16-byte aligned
 * (NOTE(review): the loads below use movdqu, which tolerates unaligned
 * addresses anyway — presumably the peel is kept as a performance hint;
 * the guard is CAN_COMPILE_SSE although integer XMM pavgb is an SSE2
 * instruction — confirm against the build system's CPU flags).
 *
 * NOTE(review): pavgb computes (a + b + 1) >> 1 (rounds half up, per the
 * Intel instruction set reference) while the scalar head/tail loops
 * round down — a harmless off-by-one on odd sums.
 *
 * \param _p_dest destination line
 * \param _p_s1   first source line
 * \param _p_s2   second source line
 * \param i_bytes number of bytes to blend
 */
void Merge8BitSSE2( void *_p_dest, const void *_p_s1, const void *_p_s2,
                    size_t i_bytes )
{
    uint8_t *p_dest = _p_dest;
    const uint8_t *p_s1 = _p_s1;
    const uint8_t *p_s2 = _p_s2;

    /* Scalar head: advance until p_s1 is 16-byte aligned (or data runs out) */
    for( ; i_bytes > 0 && ((uintptr_t)p_s1 & 15); i_bytes-- )
        *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;

    /* xmm1 = s2; xmm1 = pavgb(xmm1, s1); *p_dest = xmm1 — 16 bytes at a time */
    for( ; i_bytes >= 16; i_bytes -= 16 )
    {
        __asm__ __volatile__( "movdqu %2,%%xmm1;"
                              "pavgb %1, %%xmm1;"
                              "movdqu %%xmm1, %0" :"=m" (*p_dest):
                                                   "m" (*p_s1),
                                                   "m" (*p_s2) : "xmm1" );
        p_dest += 16;
        p_s1 += 16;
        p_s2 += 16;
    }

    /* Scalar tail for the last (i_bytes % 16) bytes */
    for( ; i_bytes > 0; i_bytes-- )
        *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
}

VLC_SSE
/**
 * Blend two lines of 16-bit samples using the SSE2 "pavgw" instruction on
 * XMM registers, 8 words (16 bytes) per iteration.
 *
 * i_bytes is given in bytes; an odd trailing byte, if any, is left
 * untouched (i_bytes / 2 truncates).  A scalar peel loop first advances
 * until p_s1 is 16-byte aligned, although the loads use movdqu which
 * tolerates unaligned addresses.
 *
 * NOTE(review): pavgw computes (a + b + 1) >> 1 (rounds half up, per the
 * Intel instruction set reference) while the scalar head/tail loops
 * round down — a harmless off-by-one on odd sums.
 *
 * \param _p_dest destination line
 * \param _p_s1   first source line
 * \param _p_s2   second source line
 * \param i_bytes number of bytes (not words) to blend
 */
void Merge16BitSSE2( void *_p_dest, const void *_p_s1, const void *_p_s2,
                     size_t i_bytes )
{
    uint16_t *p_dest = _p_dest;
    const uint16_t *p_s1 = _p_s1;
    const uint16_t *p_s2 = _p_s2;

    /* Scalar head: advance until p_s1 is 16-byte aligned (or data runs out) */
    size_t i_words = i_bytes / 2;
    for( ; i_words > 0 && ((uintptr_t)p_s1 & 15); i_words-- )
        *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;

    /* xmm1 = s2; xmm1 = pavgw(xmm1, s1); *p_dest = xmm1 — 8 words at a time */
    for( ; i_words >= 8; i_words -= 8 )
    {
        __asm__ __volatile__( "movdqu %2,%%xmm1;"
                              "pavgw %1, %%xmm1;"
                              "movdqu %%xmm1, %0" :"=m" (*p_dest):
                                                   "m" (*p_s1),
                                                   "m" (*p_s2) : "xmm1" );
        p_dest += 8;
        p_s1 += 8;
        p_s2 += 8;
    }

    /* Scalar tail for the last (i_words % 8) words */
    for( ; i_words > 0; i_words-- )
        *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
}

#endif

#ifdef CAN_COMPILE_C_ALTIVEC
/**
 * Blend two lines of 8-bit samples using AltiVec vec_avg, 16 bytes per
 * iteration, with scalar C loops for the unaligned head and the tail.
 *
 * Fixes over the previous revision:
 *  - alignment tests cast pointers through uintptr_t instead of int
 *    (an int cast truncates 64-bit pointers and is non-portable);
 *  - the head loop is bounded by i_bytes, so a line shorter than the
 *    alignment distance can no longer overrun the buffers;
 *  - the end pointer is one-past-the-end, computed after the head loop,
 *    avoiding the out-of-bounds pointer that "p_dest + i_bytes - 15"
 *    formed when i_bytes < 15 (undefined behavior).
 *
 * NOTE(review): vec_avg rounds half up ((a + b + 1) >> 1) while the
 * scalar loops round down — a harmless off-by-one on odd sums.
 * NOTE(review): the unaligned-source path's vec_ld( 16, p_s1 ) look-ahead
 * may read up to 16 bytes past the last processed source byte (classic
 * AltiVec realignment idiom) — confirm callers pad their planes.
 *
 * \param _p_dest destination line
 * \param _p_s1   first source line
 * \param _p_s2   second source line
 * \param i_bytes number of bytes to blend
 */
void MergeAltivec( void *_p_dest, const void *_p_s1,
                   const void *_p_s2, size_t i_bytes )
{
    uint8_t *p_dest = _p_dest;
    const uint8_t *p_s1 = _p_s1;
    const uint8_t *p_s2 = _p_s2;
    uint8_t *p_end;

    /* Use C until the first 16-bytes aligned destination pixel */
    while( i_bytes > 0 && ( (uintptr_t)p_dest & 0xF ) )
    {
        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
        i_bytes--;
    }

    /* One past the last byte to write */
    p_end = p_dest + i_bytes;

    if( ( (uintptr_t)p_s1 & 0xF ) | ( (uintptr_t)p_s2 & 0xF ) )
    {
        /* Unaligned source: load two quadwords and realign with vec_perm
         * before averaging */
        vector unsigned char s1v, s2v, destv;
        vector unsigned char s1oldv, s2oldv, s1newv, s2newv;
        vector unsigned char perm1v, perm2v;

        perm1v = vec_lvsl( 0, p_s1 );
        perm2v = vec_lvsl( 0, p_s2 );
        s1oldv = vec_ld( 0, p_s1 );
        s2oldv = vec_ld( 0, p_s2 );

        while( (size_t)( p_end - p_dest ) >= 16 )
        {
            s1newv = vec_ld( 16, p_s1 );
            s2newv = vec_ld( 16, p_s2 );
            s1v = vec_perm( s1oldv, s1newv, perm1v );
            s2v = vec_perm( s2oldv, s2newv, perm2v );
            s1oldv = s1newv;
            s2oldv = s2newv;
            destv = vec_avg( s1v, s2v );
            vec_st( destv, 0, p_dest );

            p_s1 += 16;
            p_s2 += 16;
            p_dest += 16;
        }
    }
    else
    {
        /* Aligned source: straight load / average / store */
        vector unsigned char s1v, s2v, destv;

        while( (size_t)( p_end - p_dest ) >= 16 )
        {
            s1v = vec_ld( 0, p_s1 );
            s2v = vec_ld( 0, p_s2 );
            destv = vec_avg( s1v, s2v );
            vec_st( destv, 0, p_dest );

            p_s1 += 16;
            p_s2 += 16;
            p_dest += 16;
        }
    }

    /* Scalar tail for the remaining (< 16) bytes */
    while( p_dest < p_end )
        *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
}
#endif

/*****************************************************************************
 * EndMerge routines
 *****************************************************************************/

#if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
/**
 * Leave MMX mode: "emms" resets the x87 FPU tag word so that floating
 * point instructions work again after the MMX merge routines (the MMX
 * registers alias the x87 register stack).  Call after MergeMMXEXT /
 * the SSE2 merges.
 */
void EndMMX( void )
{
    __asm__ __volatile__( "emms" :: );
}
#endif

#if defined(CAN_COMPILE_3DNOW)
/**
 * Leave MMX mode after Merge3DNow: "femms" is AMD's fast variant of
 * "emms", restoring x87 FPU usability without the full tag-word cost.
 */
void End3DNow( void )
{
    __asm__ __volatile__( "femms" :: );
}
#endif
