1 /*****************************************************************************
2  * merge.c : Merge (line blending) routines for the VLC deinterlacer
3  *****************************************************************************
4  * Copyright (C) 2011 VLC authors and VideoLAN
5  * $Id: 94cdd775ac70a548f23b3efbca65d6259d77ca39 $
6  *
7  * Author: Sam Hocevar <sam@zoy.org>                      (generic C routine)
8  *         Sigmund Augdal Helberg <sigmunau@videolan.org> (MMXEXT, 3DNow, SSE2)
9  *         Eric Petit <eric.petit@lapsus.org>             (Altivec)
10  *
11  * This program is free software; you can redistribute it and/or modify it
12  * under the terms of the GNU Lesser General Public License as published by
13  * the Free Software Foundation; either version 2.1 of the License, or
14  * (at your option) any later version.
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  * GNU Lesser General Public License for more details.
20  *
21  * You should have received a copy of the GNU Lesser General Public License
22  * along with this program; if not, write to the Free Software Foundation,
23  * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
24  *****************************************************************************/
25 
26 #ifdef HAVE_CONFIG_H
27 #   include "config.h"
28 #endif
29 
30 #include <stdlib.h>
31 #include <stdint.h>
32 
33 #include <vlc_common.h>
34 #include <vlc_cpu.h>
35 #include "merge.h"
36 
37 #ifdef CAN_COMPILE_MMXEXT
38 #   include "mmx.h"
39 #endif
40 
41 #ifdef HAVE_ALTIVEC_H
42 #   undef bool
43 #   include <altivec.h>
44 #   define bool _Bool
45 #endif
46 
47 /*****************************************************************************
48  * Merge (line blending) routines
49  *****************************************************************************/
50 
/**
 * Blend two lines of 8-bit pixels into the destination.
 * Each output byte is the truncating average (a + b) / 2 of the
 * corresponding source bytes.
 */
void Merge8BitGeneric( void *_p_dest, const void *_p_s1,
                       const void *_p_s2, size_t i_bytes )
{
    uint8_t *p_out = _p_dest;
    const uint8_t *p_in1 = _p_s1;
    const uint8_t *p_in2 = _p_s2;

    for( size_t i = 0; i < i_bytes; i++ )
        p_out[i] = ( p_in1[i] + p_in2[i] ) >> 1;
}
61 
/**
 * Blend two lines of 16-bit pixels into the destination.
 * i_bytes is a byte count; an odd trailing byte is ignored.
 * Each output word is the truncating average of the source words.
 */
void Merge16BitGeneric( void *_p_dest, const void *_p_s1,
                        const void *_p_s2, size_t i_bytes )
{
    uint16_t *p_out = _p_dest;
    const uint16_t *p_in1 = _p_s1;
    const uint16_t *p_in2 = _p_s2;
    const size_t i_words = i_bytes / 2;

    for( size_t i = 0; i < i_words; i++ )
        p_out[i] = ( p_in1[i] + p_in2[i] ) >> 1;
}
72 
73 #if defined(CAN_COMPILE_MMXEXT)
/**
 * Blend two lines of 8-bit pixels using the MMXEXT "pavgb" byte average.
 * Caller must call EndMMX() after a batch of merges to clear MMX state.
 * NOTE(review): pavgb rounds up ((a+b+1)>>1) while the scalar tail below
 * truncates — the two paths may differ by 1 LSB; verify this is intended.
 */
VLC_MMX
void MergeMMXEXT( void *_p_dest, const void *_p_s1, const void *_p_s2,
                  size_t i_bytes )
{
    uint8_t *p_dest = _p_dest;
    const uint8_t *p_s1 = _p_s1;
    const uint8_t *p_s2 = _p_s2;

    /* Vector path: 8 bytes per iteration through mm1. */
    for( ; i_bytes >= 8; i_bytes -= 8 )
    {
        __asm__  __volatile__( "movq %2,%%mm1;"
                               "pavgb %1, %%mm1;"
                               "movq %%mm1, %0" :"=m" (*p_dest):
                                                 "m" (*p_s1),
                                                 "m" (*p_s2) : "mm1" );
        p_dest += 8;
        p_s1 += 8;
        p_s2 += 8;
    }

    /* Scalar tail: remaining 0-7 bytes in plain C. */
    for( ; i_bytes > 0; i_bytes-- )
        *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
}
97 #endif
98 
99 #if defined(CAN_COMPILE_3DNOW)
/**
 * Blend two lines of 8-bit pixels using the 3DNow! "pavgusb" byte average.
 * Caller must call End3DNow() after a batch of merges ("femms").
 * NOTE(review): pavgusb rounds up ((a+b+1)>>1) while the scalar tail below
 * truncates — the two paths may differ by 1 LSB; verify this is intended.
 */
VLC_MMX
void Merge3DNow( void *_p_dest, const void *_p_s1, const void *_p_s2,
                 size_t i_bytes )
{
    uint8_t *p_dest = _p_dest;
    const uint8_t *p_s1 = _p_s1;
    const uint8_t *p_s2 = _p_s2;

    /* Vector path: 8 bytes per iteration through mm1. */
    for( ; i_bytes >= 8; i_bytes -= 8 )
    {
        __asm__  __volatile__( "movq %2,%%mm1;"
                               "pavgusb %1, %%mm1;"
                               "movq %%mm1, %0" :"=m" (*p_dest):
                                                 "m" (*p_s1),
                                                 "m" (*p_s2) : "mm1" );
        p_dest += 8;
        p_s1 += 8;
        p_s2 += 8;
    }

    /* Scalar tail: remaining 0-7 bytes in plain C. */
    for( ; i_bytes > 0; i_bytes-- )
        *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
}
123 #endif
124 
125 #if defined(CAN_COMPILE_SSE)
/**
 * Blend two lines of 8-bit pixels using the SSE2 "pavgb" byte average.
 * NOTE(review): pavgb rounds up ((a+b+1)>>1) while the scalar head/tail
 * truncate — outputs can differ by 1 LSB between paths; verify intended.
 */
VLC_SSE
void Merge8BitSSE2( void *_p_dest, const void *_p_s1, const void *_p_s2,
                    size_t i_bytes )
{
    uint8_t *p_dest = _p_dest;
    const uint8_t *p_s1 = _p_s1;
    const uint8_t *p_s2 = _p_s2;

    /* Scalar head until p_s1 is 16-byte aligned. This is a performance
     * heuristic only: the loads/stores below use unaligned "movdqu". */
    for( ; i_bytes > 0 && ((uintptr_t)p_s1 & 15); i_bytes-- )
        *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;

    /* Vector path: 16 bytes per iteration through xmm1. */
    for( ; i_bytes >= 16; i_bytes -= 16 )
    {
        __asm__  __volatile__( "movdqu %2,%%xmm1;"
                               "pavgb %1, %%xmm1;"
                               "movdqu %%xmm1, %0" :"=m" (*p_dest):
                                                 "m" (*p_s1),
                                                 "m" (*p_s2) : "xmm1" );
        p_dest += 16;
        p_s1 += 16;
        p_s2 += 16;
    }

    /* Scalar tail: remaining 0-15 bytes in plain C. */
    for( ; i_bytes > 0; i_bytes-- )
        *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
}
152 
/**
 * Blend two lines of 16-bit pixels using the SSE2 "pavgw" word average.
 * i_bytes is a byte count; an odd trailing byte is ignored.
 * NOTE(review): pavgw rounds up ((a+b+1)>>1) while the scalar head/tail
 * truncate — outputs can differ by 1 LSB between paths; verify intended.
 */
VLC_SSE
void Merge16BitSSE2( void *_p_dest, const void *_p_s1, const void *_p_s2,
                     size_t i_bytes )
{
    uint16_t *p_dest = _p_dest;
    const uint16_t *p_s1 = _p_s1;
    const uint16_t *p_s2 = _p_s2;

    size_t i_words = i_bytes / 2;
    /* Scalar head until p_s1 is 16-byte aligned (perf heuristic only:
     * "movdqu" below is unaligned). Stepping 2 bytes at a time, alignment
     * is only reached if p_s1 is even — presumably guaranteed for 16-bit
     * pixel data; the loop is bounded by i_words either way. */
    for( ; i_words > 0 && ((uintptr_t)p_s1 & 15); i_words-- )
        *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;

    /* Vector path: 8 words (16 bytes) per iteration through xmm1. */
    for( ; i_words >= 8; i_words -= 8 )
    {
        __asm__  __volatile__( "movdqu %2,%%xmm1;"
                               "pavgw %1, %%xmm1;"
                               "movdqu %%xmm1, %0" :"=m" (*p_dest):
                                                 "m" (*p_s1),
                                                 "m" (*p_s2) : "xmm1" );
        p_dest += 8;
        p_s1 += 8;
        p_s2 += 8;
    }

    /* Scalar tail: remaining 0-7 words in plain C. */
    for( ; i_words > 0; i_words-- )
        *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
}
180 
181 #endif
182 
183 #ifdef CAN_COMPILE_C_ALTIVEC
MergeAltivec(void * _p_dest,const void * _p_s1,const void * _p_s2,size_t i_bytes)184 void MergeAltivec( void *_p_dest, const void *_p_s1,
185                    const void *_p_s2, size_t i_bytes )
186 {
187     uint8_t *p_dest = _p_dest;
188     const uint8_t *p_s1 = _p_s1;
189     const uint8_t *p_s2 = _p_s2;
190     uint8_t *p_end  = p_dest + i_bytes - 15;
191 
192     /* Use C until the first 16-bytes aligned destination pixel */
193     while( (uintptr_t)p_dest & 0xF )
194     {
195         *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
196     }
197 
198     if( ( (int)p_s1 & 0xF ) | ( (int)p_s2 & 0xF ) )
199     {
200         /* Unaligned source */
201         vector unsigned char s1v, s2v, destv;
202         vector unsigned char s1oldv, s2oldv, s1newv, s2newv;
203         vector unsigned char perm1v, perm2v;
204 
205         perm1v = vec_lvsl( 0, p_s1 );
206         perm2v = vec_lvsl( 0, p_s2 );
207         s1oldv = vec_ld( 0, p_s1 );
208         s2oldv = vec_ld( 0, p_s2 );
209 
210         while( p_dest < p_end )
211         {
212             s1newv = vec_ld( 16, p_s1 );
213             s2newv = vec_ld( 16, p_s2 );
214             s1v    = vec_perm( s1oldv, s1newv, perm1v );
215             s2v    = vec_perm( s2oldv, s2newv, perm2v );
216             s1oldv = s1newv;
217             s2oldv = s2newv;
218             destv  = vec_avg( s1v, s2v );
219             vec_st( destv, 0, p_dest );
220 
221             p_s1   += 16;
222             p_s2   += 16;
223             p_dest += 16;
224         }
225     }
226     else
227     {
228         /* Aligned source */
229         vector unsigned char s1v, s2v, destv;
230 
231         while( p_dest < p_end )
232         {
233             s1v   = vec_ld( 0, p_s1 );
234             s2v   = vec_ld( 0, p_s2 );
235             destv = vec_avg( s1v, s2v );
236             vec_st( destv, 0, p_dest );
237 
238             p_s1   += 16;
239             p_s2   += 16;
240             p_dest += 16;
241         }
242     }
243 
244     p_end += 15;
245 
246     while( p_dest < p_end )
247         *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
248 }
249 #endif
250 
251 /*****************************************************************************
252  * EndMerge routines
253  *****************************************************************************/
254 
255 #if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
/**
 * Finish a batch of MMX/MMXEXT merges: "emms" clears the MMX state so the
 * FPU registers can be used again. Must be called after MergeMMXEXT use.
 */
void EndMMX( void )
{
    __asm__ __volatile__( "emms" :: );
}
260 #endif
261 
262 #if defined(CAN_COMPILE_3DNOW)
/**
 * Finish a batch of 3DNow! merges: "femms" (fast EMMS) clears the MMX
 * state. Must be called after Merge3DNow use.
 */
void End3DNow( void )
{
    __asm__ __volatile__( "femms" :: );
}
267 #endif
268