/*
 * generic alpha renderers for all YUV modes and RGB depths
 * Optimized by Nick and Michael.
 *
 * This file is part of MPlayer.
 *
 * MPlayer is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * MPlayer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with MPlayer; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

#undef PREFETCH
#undef EMMS
#undef PREFETCHW
#undef PAVGB

#if HAVE_AMD3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB     "pavgusb"
#elif HAVE_MMX2
#define PREFETCH  "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB     "pavgb"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif

#if HAVE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

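/*
 * All renderers below implement the per-pixel blend that the scalar
 * fallbacks spell out explicitly:
 *
 *     if (srca[x])
 *         dst[x] = ((dst[x] * srca[x]) >> 8) + src[x];
 *
 * srca is the (inverted) OSD alpha, srca == 0 means "leave the pixel
 * untouched", and src is pre-scaled by the OSD code so it is simply added
 * after the multiply. The packed-YUV variants additionally re-center the
 * chroma bytes around 128 before and after the multiply.
 */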
#if HAVE_SSE2
ATTR_TARGET_SSE2
static inline __m128i muladd_src_unpacked(__m128i dstlo, __m128i dsthi, __m128i src, __m128i srcalo, __m128i srcahi)
{
    // (mmhigh,mmlow) = (dst * (srca * 256)) / 65536 (= (dst * srca) >> 8)
    __m128i mmlow = _mm_mulhi_epu16(dstlo, srcalo);
    __m128i mmhigh = _mm_mulhi_epu16(dsthi, srcahi);

    __m128i res = _mm_packus_epi16(mmlow, mmhigh);

    return _mm_add_epi8(res, src);
}

ATTR_TARGET_SSE2
static inline __m128i muladd_src(__m128i dst, __m128i src, __m128i srca)
{
    __m128i zero = _mm_setzero_si128();
    __m128i dstlo = _mm_unpacklo_epi8(dst, zero);
    __m128i dsthi = _mm_unpackhi_epi8(dst, zero);
    __m128i srcalo = _mm_unpacklo_epi8(zero, srca);
    __m128i srcahi = _mm_unpackhi_epi8(zero, srca);
    return muladd_src_unpacked(dstlo, dsthi, src, srcalo, srcahi);
}

ATTR_TARGET_SSE2
static inline __m128i alphamask(__m128i orig, __m128i blended, __m128i srca)
{
    __m128i zero = _mm_setzero_si128();
    // if (!srca) res |= dst --- assumes srca == 0 implies src == 0,
    // thus no need to mask res
    __m128i mask = _mm_cmpeq_epi8(srca, zero);
    orig = _mm_and_si128(orig, mask);
    return _mm_or_si128(blended, orig);
}

// Special version that compares alpha in 16 bit chunks instead of bytewise
ATTR_TARGET_SSE2
static inline __m128i alphamask16(__m128i orig, __m128i blended, __m128i srca)
{
    __m128i zero = _mm_setzero_si128();
    // if (!srca) res |= dst --- assumes srca == 0 implies src == 0,
    // thus no need to mask res
    __m128i mask = _mm_cmpeq_epi16(srca, zero);
    orig = _mm_and_si128(orig, mask);
    return _mm_or_si128(blended, orig);
}
#endif

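/*
 * Layout note (inferred from the intrinsics below): the SSE2 paths load
 * src/srca with _mm_load_si128, i.e. they rely on the OSD src and srca rows
 * being 16-byte aligned, while dst is accessed with unaligned loads and
 * stores. Each renderer processes a whole vector of pixels per iteration
 * and falls back to a scalar loop for the leftover pixels.
 */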
#if HAVE_SSE2
ATTR_TARGET_SSE2
#endif
static inline void RENAME(vo_draw_alpha_yv12)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
    int y;
#if defined(FAST_OSD) && !HAVE_MMX && !HAVE_SSE2
    w=w>>1;
#endif
#if HAVE_MMX
    __asm__ volatile(
        "pcmpeqb %%mm5, %%mm5\n\t" // F..F
        "movq %%mm5, %%mm4\n\t"
        "movq %%mm5, %%mm7\n\t"
        "psllw $8, %%mm5\n\t" //FF00FF00FF00
        "psrlw $8, %%mm4\n\t" //00FF00FF00FF
        ::);
#endif
    for(y=0;y<h;y++){
        register int x;
#if HAVE_MMX
    __asm__ volatile(
	PREFETCHW" %0\n\t"
	PREFETCH" %1\n\t"
	PREFETCH" %2\n\t"
	::"m"(*dstbase),"m"(*srca),"m"(*src):"memory");
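    /*
     * MMX path: 8 luma pixels per iteration. The leading movl/orl/jz test
     * skips the whole block when all 8 srca bytes are zero (fully
     * transparent), which is the common case outside subtitle regions.
     */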
    for(x=0;x<w;x+=8){
	__asm__ volatile(
		"movl %1, %%eax\n\t"
		"orl 4%1, %%eax\n\t"
		" jz 1f\n\t"
		PREFETCHW" 32%0\n\t"
		PREFETCH" 32%1\n\t"
		PREFETCH" 32%2\n\t"
		"movq	%0, %%mm0\n\t" // dstbase
		"movq	%%mm0, %%mm1\n\t"
		"pand %%mm4, %%mm0\n\t"		//0Y0Y0Y0Y
		"psrlw $8, %%mm1\n\t"		//0Y0Y0Y0Y
		"movq	%1, %%mm2\n\t"		//srca HGFEDCBA
		"paddb	%%mm7, %%mm2\n\t"
		"movq %%mm2, %%mm3\n\t"
		"pand %%mm4, %%mm2\n\t"		//0G0E0C0A
		"psrlw $8, %%mm3\n\t"		//0H0F0D0B
		"pmullw	%%mm2, %%mm0\n\t"
		"pmullw	%%mm3, %%mm1\n\t"
		"psrlw	$8, %%mm0\n\t"
		"pand %%mm5, %%mm1\n\t"
		"por %%mm1, %%mm0\n\t"
		"paddb	%2, %%mm0\n\t"
		"movq	%%mm0, %0\n\t"
		"1:\n\t"
		:: "m" (dstbase[x]), "m" (srca[x]), "m" (src[x])
		: "%eax");
	}
#elif HAVE_SSE2
        __m128i zero = _mm_setzero_si128();
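        /*
         * SSE2 path: 16 pixels per iteration. _mm_movemask_epi8 of the
         * byte compare against zero yields one bit per srca byte, so
         * alpha == 0xffff means all 16 alphas are zero and the chunk can
         * be skipped entirely.
         */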
        for(x=0;x+15<w;x+=16){
            __m128i mmsrc, mmdst, res;
            __m128i mmsrca = _mm_load_si128((const __m128i *)(srca + x));

            int alpha = _mm_movemask_epi8(_mm_cmpeq_epi8(mmsrca, zero));
            if (alpha == 0xffff) continue;

            mmdst = _mm_loadu_si128((const __m128i *)(dstbase + x));
            mmsrc = _mm_load_si128((const __m128i *)(src + x));

            res = muladd_src(mmdst, mmsrc, mmsrca);

            // _mm_maskmoveu_si128 would be an alternative but slower
            res = alphamask(mmdst, res, mmsrca);
            _mm_storeu_si128((__m128i *)(dstbase + x), res);
        }
        for(;x<w;x++){
            if(srca[x]) dstbase[x]=((dstbase[x]*srca[x])>>8)+src[x];
        }
#else /* HAVE_SSE2 */
        for(x=0;x<w;x++){
#ifdef FAST_OSD
            if(srca[2*x+0]) dstbase[2*x+0]=src[2*x+0];
            if(srca[2*x+1]) dstbase[2*x+1]=src[2*x+1];
#else
            if(srca[x]) dstbase[x]=((dstbase[x]*srca[x])>>8)+src[x];
#endif
        }
#endif
        src+=srcstride;
        srca+=srcstride;
        dstbase+=dststride;
    }
#if HAVE_MMX
    __asm__ volatile(EMMS:::"memory");
#endif
    return;
}

#if HAVE_SSE2
ATTR_TARGET_SSE2
#endif
static inline void RENAME(vo_draw_alpha_yuy2)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
    int y;
#if defined(FAST_OSD) && !HAVE_MMX && !HAVE_SSE2
    w=w>>1;
#endif
#if HAVE_MMX
    __asm__ volatile(
        "pxor %%mm7, %%mm7\n\t"
        "pcmpeqb %%mm5, %%mm5\n\t" // F..F
        "movq %%mm5, %%mm6\n\t"
        "movq %%mm5, %%mm4\n\t"
        "psllw $8, %%mm5\n\t" //FF00FF00FF00
        "psrlw $8, %%mm4\n\t" //00FF00FF00FF
        ::);
#endif
    for(y=0;y<h;y++){
        register int x;
#if HAVE_MMX
    __asm__ volatile(
	PREFETCHW" %0\n\t"
	PREFETCH" %1\n\t"
	PREFETCH" %2\n\t"
	::"m"(*dstbase),"m"(*srca),"m"(*src));
    for(x=0;x<w;x+=4){
	__asm__ volatile(
		"movl %1, %%eax\n\t"
		"orl %%eax, %%eax\n\t"
		" jz 1f\n\t"
		PREFETCHW" 32%0\n\t"
		PREFETCH" 32%1\n\t"
		PREFETCH" 32%2\n\t"
		"movq	%0, %%mm0\n\t" // dstbase
		"movq	%%mm0, %%mm1\n\t"
		"pand %%mm4, %%mm0\n\t"		//0Y0Y0Y0Y
		"movd	%%eax, %%mm2\n\t"	//srca 0000DCBA
		"paddb	%%mm6, %%mm2\n\t"
		"punpcklbw %%mm7, %%mm2\n\t"	//srca 0D0C0B0A
		"pmullw	%%mm2, %%mm0\n\t"
		"psrlw	$8, %%mm0\n\t"
		"pand %%mm5, %%mm1\n\t"		//U0V0U0V0
		"movd %2, %%mm2\n\t"		//src 0000DCBA
		"punpcklbw %%mm7, %%mm2\n\t"	//src 0D0C0B0A
		"por %%mm1, %%mm0\n\t"
		"paddb	%%mm2, %%mm0\n\t"
		"movq	%%mm0, %0\n\t"
		"1:\n\t"
		:: "m" (dstbase[x*2]), "m" (srca[x]), "m" (src[x])
		: "%eax");
	}
#elif HAVE_SSE2
        __m128i zero = _mm_setzero_si128();
        __m128i ymask = _mm_set1_epi16(0xff);
        __m128i uvofs = _mm_set1_epi16(0x8000);
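        /*
         * YUY2: each luma pixel occupies two dst bytes (Y in the low byte
         * of every 16-bit word, U/V in the high byte), so 16 srca/src
         * bytes map to two 16-byte dst loads. Chroma is flipped to signed
         * with the 0x8000 XOR so the (dst - 128) * srca product can use a
         * signed multiply, then flipped back before the store.
         */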
        for(x=0;x+15<w;x+=16){
            __m128i mmsrc, mmsrcalo, mmsrcahi, mmdst, mmdst2, mmlow, mmhigh, mmy, mmuv, res;
            __m128i mmsrca = _mm_load_si128((const __m128i *)(srca + x));
            int alpha = _mm_movemask_epi8(_mm_cmpeq_epi8(mmsrca, zero));
            if (alpha == 0xffff) continue;

            mmdst = _mm_loadu_si128((const __m128i *)(dstbase + 2*x));
            mmdst2 = _mm_loadu_si128((const __m128i *)(dstbase + 2*x + 16));

            // convert UV to signed
            mmdst = _mm_xor_si128(mmdst, uvofs);
            mmdst2 = _mm_xor_si128(mmdst2, uvofs);

            mmsrc = _mm_load_si128((const __m128i *)(src + x));
            mmsrcalo = _mm_unpacklo_epi8(zero, mmsrca);
            mmsrcahi = _mm_unpackhi_epi8(zero, mmsrca);

            mmy = muladd_src_unpacked(_mm_and_si128(mmdst, ymask), _mm_and_si128(mmdst2, ymask), mmsrc, mmsrcalo, mmsrcahi);

            // mmuv = ((dst(uv) ^ 128) * (srca * 256)) / 65536 ^ 128 (= (((dst - 128) * srca) >> 8) + 128)
            mmlow = _mm_srai_epi16(mmdst, 8);
            mmlow = _mm_mulhi_epi16(mmlow, mmsrcalo);
            mmhigh = _mm_srai_epi16(mmdst2, 8);
            mmhigh = _mm_mulhi_epi16(mmhigh, mmsrcahi);

            mmuv = _mm_packs_epi16(mmlow, mmhigh);

            res = _mm_unpacklo_epi8(mmy, mmuv);
            res = alphamask16(mmdst, res, mmsrcalo);
            // convert UV to unsigned
            res = _mm_xor_si128(res, uvofs);
            _mm_storeu_si128((__m128i *)(dstbase + 2 * x), res);

            res = _mm_unpackhi_epi8(mmy, mmuv);
            res = alphamask16(mmdst2, res, mmsrcahi);
            // convert UV to unsigned
            res = _mm_xor_si128(res, uvofs);
            _mm_storeu_si128((__m128i *)(dstbase + 2 * x + 16), res);
        }
        for(;x<w;x++){
            if(srca[x]) {
               dstbase[2*x]=((dstbase[2*x]*srca[x])>>8)+src[x];
               dstbase[2*x+1]=((((signed)dstbase[2*x+1]-128)*srca[x])>>8)+128;
            }
        }
#else /* HAVE_SSE2 */
        for(x=0;x<w;x++){
#ifdef FAST_OSD
            if(srca[2*x+0]) dstbase[4*x+0]=src[2*x+0];
            if(srca[2*x+1]) dstbase[4*x+2]=src[2*x+1];
#else
            if(srca[x]) {
               dstbase[2*x]=((dstbase[2*x]*srca[x])>>8)+src[x];
               dstbase[2*x+1]=((((signed)dstbase[2*x+1]-128)*srca[x])>>8)+128;
            }
#endif
        }
#endif
        src+=srcstride;
        srca+=srcstride;
        dstbase+=dststride;
    }
#if HAVE_MMX
    __asm__ volatile(EMMS:::"memory");
#endif
    return;
}

#if HAVE_SSE2
ATTR_TARGET_SSE2
#endif
static inline void RENAME(vo_draw_alpha_uyvy)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
  int y;
#if defined(FAST_OSD) && !HAVE_SSE2
  w=w>>1;
#endif
  for(y=0;y<h;y++){
    register int x;
#if HAVE_SSE2
        __m128i zero = _mm_setzero_si128();
        __m128i uvofs = _mm_set1_epi16(0x80);
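        /*
         * UYVY: same scheme as the YUY2 case above, but with U/V in the
         * low byte and Y in the high byte of every 16-bit word, so the
         * chroma bias here is 0x80 and Y is extracted with a plain right
         * shift instead of a mask.
         */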
        for(x=0;x+15<w;x+=16){
            __m128i mmsrc, mmsrcalo, mmsrcahi, mmdst, mmdst2, mmlow, mmhigh, mmy, mmuv, res;
            __m128i mmsrca = _mm_load_si128((const __m128i *)(srca + x));
            int alpha = _mm_movemask_epi8(_mm_cmpeq_epi8(mmsrca, zero));
            if (alpha == 0xffff) continue;

            mmdst = _mm_loadu_si128((const __m128i *)(dstbase + 2*x));
            mmdst2 = _mm_loadu_si128((const __m128i *)(dstbase + 2*x + 16));

            // convert UV to signed
            mmdst = _mm_xor_si128(mmdst, uvofs);
            mmdst2 = _mm_xor_si128(mmdst2, uvofs);

            mmsrc = _mm_load_si128((const __m128i *)(src + x));
            mmsrcalo = _mm_unpacklo_epi8(zero, mmsrca);
            mmsrcahi = _mm_unpackhi_epi8(zero, mmsrca);

            mmy = muladd_src_unpacked(_mm_srli_epi16(mmdst, 8), _mm_srli_epi16(mmdst2, 8), mmsrc, mmsrcalo, mmsrcahi);

            // mmuv = ((dst(uv) ^ 128) * (srca * 256)) / 65536 ^ 128 (= (((dst - 128) * srca) >> 8) + 128)
            // sign-extend and multiply
            mmlow = _mm_slli_epi16(mmdst, 8);
            mmlow = _mm_srai_epi16(mmlow, 8);
            mmlow = _mm_mulhi_epi16(mmlow, mmsrcalo);
            mmhigh = _mm_slli_epi16(mmdst2, 8);
            mmhigh = _mm_srai_epi16(mmhigh, 8);
            mmhigh = _mm_mulhi_epi16(mmhigh, mmsrcahi);

            mmuv = _mm_packs_epi16(mmlow, mmhigh);

            res = _mm_unpacklo_epi8(mmuv, mmy);
            res = alphamask16(mmdst, res, mmsrcalo);
            // convert UV to unsigned
            res = _mm_xor_si128(res, uvofs);
            _mm_storeu_si128((__m128i *)(dstbase + 2 * x), res);

            res = _mm_unpackhi_epi8(mmuv, mmy);
            res = alphamask16(mmdst2, res, mmsrcahi);
            // convert UV to unsigned
            res = _mm_xor_si128(res, uvofs);
            _mm_storeu_si128((__m128i *)(dstbase + 2 * x + 16), res);
        }
        for(;x<w;x++){
            if(srca[x]) {
               dstbase[2*x+1]=((dstbase[2*x+1]*srca[x])>>8)+src[x];
               dstbase[2*x]=((((signed)dstbase[2*x]-128)*srca[x])>>8)+128;
            }
        }
#else /* HAVE_SSE2 */
    for(x=0;x<w;x++){
#ifdef FAST_OSD
      if(srca[2*x+0]) dstbase[4*x+2]=src[2*x+0];
      if(srca[2*x+1]) dstbase[4*x+0]=src[2*x+1];
#else
      if(srca[x]) {
        dstbase[2*x+1]=((dstbase[2*x+1]*srca[x])>>8)+src[x];
        dstbase[2*x]=((((signed)dstbase[2*x]-128)*srca[x])>>8)+128;
      }
#endif
    }
#endif
    src+=srcstride;
    srca+=srcstride;
    dstbase+=dststride;
  }
}

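/*
 * REPL3X(out, sd1, sa1, sd2, sa2, in) replicates each of the low input
 * bytes of "in" three times so that one srca/src byte lines up with all
 * three bytes of a 24bpp pixel: the shuffles plus the caller-provided
 * one_in_three_mask spread the bytes to every third position, and the two
 * extra byte shifts (direction sd1/sd2, amounts sa1/sa2) fill in the
 * neighbouring positions. The shift arguments differ per 16-byte chunk
 * because 16 dst bytes cover five whole pixels plus part of a sixth.
 */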
#define REPL3X(out, sd1, sa1, sd2, sa2, in) \
do { \
   __m128i shuf012 = _mm_shufflelo_epi16(in, 0x40); \
   __m128i shuf345 = _mm_shufflelo_epi16(in, 0xA5); \
   __m128i repl3x_mmtmp = _mm_unpacklo_epi64(shuf012, shuf345); \
   repl3x_mmtmp = _mm_and_si128(repl3x_mmtmp, one_in_three_mask); \
   out = _mm_or_si128(_mm_or_si128(repl3x_mmtmp, _mm_s##sd1##li_si128(repl3x_mmtmp, sa1)), _mm_s##sd2##li_si128(repl3x_mmtmp, sa2)); \
} while (0)

#if HAVE_SSE2
ATTR_TARGET_SSE2
#endif
static inline void RENAME(vo_draw_alpha_rgb24)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
    int y;
#if HAVE_MMX
    __asm__ volatile(
        "pxor %%mm7, %%mm7\n\t"
        "pcmpeqb %%mm6, %%mm6\n\t" // F..F
        ::);
#endif
    for(y=0;y<h;y++){
        register unsigned char *dst = dstbase;
        register int x;
#if ARCH_X86 && (!ARCH_X86_64 || HAVE_MMX || HAVE_SSE2)
#if HAVE_MMX
    __asm__ volatile(
	PREFETCHW" %0\n\t"
	PREFETCH" %1\n\t"
	PREFETCH" %2\n\t"
	::"m"(*dst),"m"(*srca),"m"(*src):"memory");
    for(x=0;x<w;x+=2){
     if(srca[x] || srca[x+1])
	__asm__ volatile(
		PREFETCHW" 32%0\n\t"
		PREFETCH" 32%1\n\t"
		PREFETCH" 32%2\n\t"
		"movq	%0, %%mm0\n\t" // dstbase
		"movq	%%mm0, %%mm1\n\t"
		"movq	%%mm0, %%mm5\n\t"
		"punpcklbw %%mm7, %%mm0\n\t"
		"punpckhbw %%mm7, %%mm1\n\t"
		"movd	%1, %%mm2\n\t" // srca ABCD0000
		"paddb	%%mm6, %%mm2\n\t"
		"punpcklbw %%mm2, %%mm2\n\t" // srca AABBCCDD
		"punpcklbw %%mm2, %%mm2\n\t" // srca AAAABBBB
		"psrlq  $8, %%mm2\n\t" // srca AAABBBB0
		"movq	%%mm2, %%mm3\n\t"
		"punpcklbw %%mm7, %%mm2\n\t" // srca 0A0A0A0B
		"punpckhbw %%mm7, %%mm3\n\t" // srca 0B0B0B00
		"pmullw	%%mm2, %%mm0\n\t"
		"pmullw	%%mm3, %%mm1\n\t"
		"psrlw	$8, %%mm0\n\t"
		"psrlw	$8, %%mm1\n\t"
		"packuswb %%mm1, %%mm0\n\t"
		"movd %2, %%mm2	\n\t" // src ABCD0000
		"punpcklbw %%mm2, %%mm2\n\t" // src AABBCCDD
		"punpcklbw %%mm2, %%mm2\n\t" // src AAAABBBB
		"psrlq  $8, %%mm2\n\t" // src AAABBBB0
		"paddb	%%mm2, %%mm0\n\t"
		"pand	%4, %%mm5\n\t"
		"pand	%3, %%mm0\n\t"
		"por	%%mm0, %%mm5\n\t"
		"movq	%%mm5, %0\n\t"
		:: "m" (dst[0]), "m" (srca[x]), "m" (src[x]), "m"(mask24hl), "m"(mask24lh));
		dst += 6;
	}
#elif HAVE_SSE2
        __m128i one_in_three_mask = _mm_set_epi32(0xff0000ffu, 0x0000ff00u, 0x00ff0000u, 0xff0000ffu);
        __m128i zero = _mm_setzero_si128();
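        /*
         * 16 pixels per iteration: 16 srca/src bytes correspond to 48 dst
         * bytes, handled as three 16-byte stores. The submask tests
         * (0x3f, 0x7e0, 0xfc00) look only at the source bytes that feed
         * each chunk (0-5, 5-10 and 10-15 respectively), so a chunk whose
         * pixels are all transparent is skipped without touching dst.
         */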
        for(x=0;x+15<w;x+=16){
            __m128i mmsrc, mmtmp, mmtmpa, mmdst, res;
            __m128i mmsrca = _mm_load_si128((const __m128i *)(srca + x));
            int alpha = _mm_movemask_epi8(_mm_cmpeq_epi8(mmsrca, zero));
            if (alpha == 0xffff) { dst += 48; continue; }

            mmsrc = _mm_load_si128((const __m128i *)(src + x));

            if ((alpha & 0x3f) != 0x3f) {
                mmdst = _mm_loadu_si128((const __m128i *)dst);
                REPL3X(mmtmpa, l, 1, l, 2, mmsrca);
                REPL3X(mmtmp, l, 1, l, 2, mmsrc);
                res = muladd_src(mmdst, mmtmp, mmtmpa);
                res = alphamask(mmdst, res, mmtmpa);
                _mm_storeu_si128((__m128i *)dst, res);
            }
            dst += 16;

            mmsrca = _mm_srli_si128(mmsrca, 5);
            mmsrc = _mm_srli_si128(mmsrc, 5);

            if ((alpha & 0x7e0) != 0x7e0) {
                mmdst = _mm_loadu_si128((const __m128i *)dst);
                REPL3X(mmtmpa, l, 1, r, 1, mmsrca);
                REPL3X(mmtmp, l, 1, r, 1, mmsrc);
                res = muladd_src(mmdst, mmtmp, mmtmpa);
                res = alphamask(mmdst, res, mmtmpa);
                _mm_storeu_si128((__m128i *)dst, res);
            }
            dst += 16;

            mmsrc = _mm_srli_si128(mmsrc, 5);
            mmsrca = _mm_srli_si128(mmsrca, 5);

            if ((alpha & 0xfc00) != 0xfc00) {
                mmdst = _mm_loadu_si128((const __m128i *)dst);
                REPL3X(mmtmpa, r, 1, r, 2, mmsrca);
                REPL3X(mmtmp, r, 1, r, 2, mmsrc);
                res = muladd_src(mmdst, mmtmp, mmtmpa);
                res = alphamask(mmdst, res, mmtmpa);
                _mm_storeu_si128((__m128i *)dst, res);
            }
            dst += 16;
        }
        for(;x<w;x++){
            if(srca[x]){
                dst[0]=((dst[0]*srca[x])>>8)+src[x];
                dst[1]=((dst[1]*srca[x])>>8)+src[x];
                dst[2]=((dst[2]*srca[x])>>8)+src[x];
            }
            dst+=3; // 24bpp
        }
#else /* HAVE_SSE2 */
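    /*
     * 32-bit x86 fallback: each channel byte is multiplied by srca and
     * src[x] << 8 is added, so taking the high byte of the 16-bit result
     * (%%ah / %%ch) yields ((dst * srca) >> 8) + src in a single step.
     */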
    for(x=0;x<w;x++){
        if(srca[x]){
            __asm__ volatile(
                "movzbl (%0), %%ecx\n\t"
                "movzbl 1(%0), %%eax\n\t"

                "imull %1, %%ecx\n\t"
                "imull %1, %%eax\n\t"

                "addl %2, %%ecx\n\t"
                "addl %2, %%eax\n\t"

                "movb %%ch, (%0)\n\t"
                "movb %%ah, 1(%0)\n\t"

                "movzbl 2(%0), %%eax\n\t"
                "imull %1, %%eax\n\t"
                "addl %2, %%eax\n\t"
                "movb %%ah, 2(%0)\n\t"
                :
                :"D" (dst),
                 "r" ((unsigned)srca[x]),
                 "r" (((unsigned)src[x])<<8)
                :"%eax", "%ecx"
                );
        }
        dst += 3;
    }
#endif /* !HAVE_MMX */
#else /* non-x86 arch or x86_64 with MMX and SSE2 disabled */
        for(x=0;x<w;x++){
            if(srca[x]){
#ifdef FAST_OSD
                dst[0]=dst[1]=dst[2]=src[x];
#else
                dst[0]=((dst[0]*srca[x])>>8)+src[x];
                dst[1]=((dst[1]*srca[x])>>8)+src[x];
                dst[2]=((dst[2]*srca[x])>>8)+src[x];
#endif
            }
            dst+=3; // 24bpp
        }
#endif /* arch_x86 */
        src+=srcstride;
        srca+=srcstride;
        dstbase+=dststride;
    }
#if HAVE_MMX
    __asm__ volatile(EMMS:::"memory");
#endif
    return;
}

#if HAVE_SSE2
ATTR_TARGET_SSE2
#endif
RENAME(vo_draw_alpha_rgb32)571 static inline void RENAME(vo_draw_alpha_rgb32)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
572     int y;
573 #if HAVE_BIGENDIAN
574     dstbase++;
575 #endif
#if HAVE_MMX
#if HAVE_AMD3DNOW
    __asm__ volatile(
        "pxor %%mm7, %%mm7\n\t"
        "pcmpeqb %%mm6, %%mm6\n\t" // F..F
        ::);
#else /* HAVE_AMD3DNOW */
    __asm__ volatile(
        "pxor %%mm7, %%mm7\n\t"
        "pcmpeqb %%mm5, %%mm5\n\t" // F..F
        "movq %%mm5, %%mm4\n\t"
        "psllw $8, %%mm5\n\t" //FF00FF00FF00
        "psrlw $8, %%mm4\n\t" //00FF00FF00FF
        ::);
#endif /* HAVE_AMD3DNOW */
#endif /* HAVE_MMX */
    for(y=0;y<h;y++){
        register int x;
#if ARCH_X86 && (!ARCH_X86_64 || HAVE_MMX || HAVE_SSE2)
#if HAVE_MMX
#if HAVE_AMD3DNOW
    __asm__ volatile(
	PREFETCHW" %0\n\t"
	PREFETCH" %1\n\t"
	PREFETCH" %2\n\t"
	::"m"(*dstbase),"m"(*srca),"m"(*src):"memory");
    for(x=0;x<w;x+=2){
     if(srca[x] || srca[x+1])
	__asm__ volatile(
		PREFETCHW" 32%0\n\t"
		PREFETCH" 32%1\n\t"
		PREFETCH" 32%2\n\t"
		"movq	%0, %%mm0\n\t" // dstbase
		"movq	%%mm0, %%mm1\n\t"
		"punpcklbw %%mm7, %%mm0\n\t"
		"punpckhbw %%mm7, %%mm1\n\t"
		"movd	%1, %%mm2\n\t" // srca ABCD0000
		"paddb	%%mm6, %%mm2\n\t"
		"punpcklbw %%mm2, %%mm2\n\t" // srca AABBCCDD
		"punpcklbw %%mm2, %%mm2\n\t" // srca AAAABBBB
		"movq	%%mm2, %%mm3\n\t"
		"punpcklbw %%mm7, %%mm2\n\t" // srca 0A0A0A0A
		"punpckhbw %%mm7, %%mm3\n\t" // srca 0B0B0B0B
		"pmullw	%%mm2, %%mm0\n\t"
		"pmullw	%%mm3, %%mm1\n\t"
		"psrlw	$8, %%mm0\n\t"
		"psrlw	$8, %%mm1\n\t"
		"packuswb %%mm1, %%mm0\n\t"
		"movd %2, %%mm2	\n\t" // src ABCD0000
		"punpcklbw %%mm2, %%mm2\n\t" // src AABBCCDD
		"punpcklbw %%mm2, %%mm2\n\t" // src AAAABBBB
		"paddb	%%mm2, %%mm0\n\t"
		"movq	%%mm0, %0\n\t"
		:: "m" (dstbase[4*x]), "m" (srca[x]), "m" (src[x]));
	}
#else // this version is faster on Intel CPUs
    __asm__ volatile(
	PREFETCHW" %0\n\t"
	PREFETCH" %1\n\t"
	PREFETCH" %2\n\t"
	::"m"(*dstbase),"m"(*srca),"m"(*src):"memory");
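    /*
     * Non-3DNow MMX path: 4 pixels (16 dst bytes) per iteration; the
     * movl/orl/jz test on the 4 srca bytes skips fully transparent groups.
     */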
    for(x=0;x<w;x+=4){
	__asm__ volatile(
		"movl %1, %%eax\n\t"
		"orl %%eax, %%eax\n\t"
		" jz 1f\n\t"
		PREFETCHW" 32%0\n\t"
		PREFETCH" 32%1\n\t"
		PREFETCH" 32%2\n\t"
		"movq	%0, %%mm0\n\t" // dstbase
		"movq	%%mm0, %%mm1\n\t"
		"pand %%mm4, %%mm0\n\t"		//0R0B0R0B
		"psrlw $8, %%mm1\n\t"		//0?0G0?0G
		"movd	%%eax, %%mm2\n\t"	//srca 0000DCBA
		"paddb	%3, %%mm2\n\t"
		"punpcklbw %%mm2, %%mm2\n\t"	//srca DDCCBBAA
		"movq %%mm2, %%mm3\n\t"
		"punpcklbw %%mm7, %%mm2\n\t"	//srca 0B0B0A0A
		"pmullw	%%mm2, %%mm0\n\t"
		"pmullw	%%mm2, %%mm1\n\t"
		"psrlw	$8, %%mm0\n\t"
		"pand %%mm5, %%mm1\n\t"
		"por %%mm1, %%mm0\n\t"
		"movd %2, %%mm2	\n\t"		//src 0000DCBA
		"punpcklbw %%mm2, %%mm2\n\t"	//src DDCCBBAA
		"movq %%mm2, %%mm6\n\t"
		"punpcklbw %%mm2, %%mm2\n\t"	//src BBBBAAAA
		"paddb	%%mm2, %%mm0\n\t"
		"movq	%%mm0, %0\n\t"

		"movq	8%0, %%mm0\n\t" // dstbase
		"movq	%%mm0, %%mm1\n\t"
		"pand %%mm4, %%mm0\n\t"		//0R0B0R0B
		"psrlw $8, %%mm1\n\t"		//0?0G0?0G
		"punpckhbw %%mm7, %%mm3\n\t"	//srca 0D0D0C0C
		"pmullw	%%mm3, %%mm0\n\t"
		"pmullw	%%mm3, %%mm1\n\t"
		"psrlw	$8, %%mm0\n\t"
		"pand %%mm5, %%mm1\n\t"
		"por %%mm1, %%mm0\n\t"
		"punpckhbw %%mm6, %%mm6\n\t"	//src DDDDCCCC
		"paddb	%%mm6, %%mm0\n\t"
		"movq	%%mm0, 8%0\n\t"
		"1:\n\t"
		:: "m" (dstbase[4*x]), "m" (srca[x]), "m" (src[x]), "m" (bFF)
		: "%eax");
	}
#endif
#elif HAVE_SSE2
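        /*
         * 32bpp SSE2 path: 16 src/srca bytes are loaded once per four
         * iterations (whenever x is a multiple of 16) and shifted down by
         * four bytes each pass; the double unpack then replicates every
         * byte four times so one alpha/src value covers a whole 4-byte
         * pixel.
         */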
        __m128i zero = _mm_setzero_si128();
        __m128i mmsrca = _mm_setzero_si128();
        __m128i mmsrc = _mm_setzero_si128(); // like mmsrca, kept outside the loop so the shifted bytes carry over between iterations
        for(x=0;x<w;x+=4){
            __m128i mmdst, mmsrcexp, mmsrcaexp, res;
            if ((x & 15) == 0) {
                int alpha;
                mmsrca = _mm_load_si128((const __m128i *)(srca + x));
                alpha = _mm_movemask_epi8(_mm_cmpeq_epi8(mmsrca, zero));
                if (alpha == 0xffff) { x += 12; continue; }
                mmsrc = _mm_load_si128((const __m128i *)(src + x));
            }

            mmdst = _mm_loadu_si128((const __m128i *)(dstbase + 4*x));

            mmsrcaexp = _mm_unpacklo_epi8(mmsrca, mmsrca);
            mmsrcaexp = _mm_unpacklo_epi8(mmsrcaexp, mmsrcaexp);
            mmsrcexp = _mm_unpacklo_epi8(mmsrc, mmsrc);
            mmsrcexp = _mm_unpacklo_epi8(mmsrcexp, mmsrcexp);

            res = muladd_src(mmdst, mmsrcexp, mmsrcaexp);

            res = alphamask(mmdst, res, mmsrcaexp);
            _mm_storeu_si128((__m128i *)(dstbase + 4*x), res);

            mmsrca = _mm_srli_si128(mmsrca, 4);
            mmsrc = _mm_srli_si128(mmsrc, 4);
        }
        for(;x<w;x++){
            if(srca[x]){
                dstbase[4*x+0]=((dstbase[4*x+0]*srca[x])>>8)+src[x];
                dstbase[4*x+1]=((dstbase[4*x+1]*srca[x])>>8)+src[x];
                dstbase[4*x+2]=((dstbase[4*x+2]*srca[x])>>8)+src[x];
            }
        }
#else /* HAVE_SSE2 */
    for(x=0;x<w;x++){
        if(srca[x]){
            __asm__ volatile(
                "movzbl (%0), %%ecx\n\t"
                "movzbl 1(%0), %%eax\n\t"
                "movzbl 2(%0), %%edx\n\t"

                "imull %1, %%ecx\n\t"
                "imull %1, %%eax\n\t"
                "imull %1, %%edx\n\t"

                "addl %2, %%ecx\n\t"
                "addl %2, %%eax\n\t"
                "addl %2, %%edx\n\t"

                "movb %%ch, (%0)\n\t"
                "movb %%ah, 1(%0)\n\t"
                "movb %%dh, 2(%0)\n\t"

                :
                :"r" (&dstbase[4*x]),
                 "r" ((unsigned)srca[x]),
                 "r" (((unsigned)src[x])<<8)
                :"%eax", "%ecx", "%edx"
                );
        }
    }
#endif /* HAVE_MMX */
#else /* non-x86 arch or x86_64 with MMX and SSE2 disabled */
        for(x=0;x<w;x++){
            if(srca[x]){
#ifdef FAST_OSD
                dstbase[4*x+0]=dstbase[4*x+1]=dstbase[4*x+2]=src[x];
#else
                dstbase[4*x+0]=((dstbase[4*x+0]*srca[x])>>8)+src[x];
                dstbase[4*x+1]=((dstbase[4*x+1]*srca[x])>>8)+src[x];
                dstbase[4*x+2]=((dstbase[4*x+2]*srca[x])>>8)+src[x];
#endif
            }
        }
#endif /* arch_x86 */
        src+=srcstride;
        srca+=srcstride;
        dstbase+=dststride;
    }
#if HAVE_MMX
    __asm__ volatile(EMMS:::"memory");
#endif
    return;
}