/*
 * generic alpha renderers for all YUV modes and RGB depths
 * Optimized by Nick and Michael.
 *
 * This file is part of MPlayer.
 *
 * MPlayer is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * MPlayer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with MPlayer; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */
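
/*
 * All renderers below perform the same per-byte blend:
 *
 *     dst = ((dst * srca) >> 8) + src
 *
 * (chroma samples use a signed variant centered on 128). A srca of 0 marks
 * an untouched pixel: the code assumes srca == 0 implies src == 0 (see
 * alphamask() below), so such pixels can be skipped outright.
 */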

#undef PREFETCH
#undef EMMS
#undef PREFETCHW
#undef PAVGB

#if HAVE_AMD3DNOW
#define PREFETCH "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB "pavgusb"
#elif HAVE_MMX2
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB "pavgb"
#else
#define PREFETCH " # nop"
#define PREFETCHW " # nop"
#endif

#if HAVE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms maps directly to emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif

#if HAVE_SSE2
ATTR_TARGET_SSE2
static inline __m128i muladd_src_unpacked(__m128i dstlo, __m128i dsthi, __m128i src, __m128i srcalo, __m128i srcahi)
{
    // (mmhigh,mmlow) = (dst * (srca * 256)) / 65536 (= (dst * srca) >> 8)
    __m128i mmlow = _mm_mulhi_epu16(dstlo, srcalo);
    __m128i mmhigh = _mm_mulhi_epu16(dsthi, srcahi);

    __m128i res = _mm_packus_epi16(mmlow, mmhigh);

    return _mm_add_epi8(res, src);
}

ATTR_TARGET_SSE2
static inline __m128i muladd_src(__m128i dst, __m128i src, __m128i srca)
{
    __m128i zero = _mm_setzero_si128();
    __m128i dstlo = _mm_unpacklo_epi8(dst, zero);
    __m128i dsthi = _mm_unpackhi_epi8(dst, zero);
    __m128i srcalo = _mm_unpacklo_epi8(zero, srca);
    __m128i srcahi = _mm_unpackhi_epi8(zero, srca);
    return muladd_src_unpacked(dstlo, dsthi, src, srcalo, srcahi);
}
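
// Note: _mm_unpack{lo,hi}_epi8(zero, srca) above places srca in the HIGH
// byte of each 16-bit lane, i.e. pre-scales it by 256. _mm_mulhi_epu16
// returns (a * b) >> 16, so the product comes out as (dst * srca) >> 8 per
// byte, matching the scalar fallback dst = ((dst * srca) >> 8) + src.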

ATTR_TARGET_SSE2
static inline __m128i alphamask(__m128i orig, __m128i blended, __m128i srca)
{
    __m128i zero = _mm_setzero_si128();
    // if (!srca) res |= dst --- assumes srca == 0 implies src == 0,
    // thus no need to mask res
    __m128i mask = _mm_cmpeq_epi8(srca, zero);
    orig = _mm_and_si128(orig, mask);
    return _mm_or_si128(blended, orig);
}

// Special version that compares alpha in 16 bit chunks instead of bytewise
ATTR_TARGET_SSE2
static inline __m128i alphamask16(__m128i orig, __m128i blended, __m128i srca)
{
    __m128i zero = _mm_setzero_si128();
    // if (!srca) res |= dst --- assumes srca == 0 implies src == 0,
    // thus no need to mask res
    __m128i mask = _mm_cmpeq_epi16(srca, zero);
    orig = _mm_and_si128(orig, mask);
    return _mm_or_si128(blended, orig);
}
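
// alphamask16() serves the packed-YUV paths below, where srca has been
// widened with _mm_unpack*_epi8(zero, srca): a 16-bit Y/UV lane keeps its
// original value only when the corresponding alpha byte is zero.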
#endif

#if HAVE_SSE2
ATTR_TARGET_SSE2
#endif
static inline void RENAME(vo_draw_alpha_yv12)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
    int y;
#if defined(FAST_OSD) && !HAVE_MMX && !HAVE_SSE2
    w=w>>1;
#endif
#if HAVE_MMX
    __asm__ volatile(
        "pcmpeqb %%mm5, %%mm5\n\t" // F..F
        "movq %%mm5, %%mm4\n\t"
        "movq %%mm5, %%mm7\n\t"
        "psllw $8, %%mm5\n\t" //FF00FF00FF00
        "psrlw $8, %%mm4\n\t" //00FF00FF00FF
        ::);
#endif
    for(y=0;y<h;y++){
        register int x;
#if HAVE_MMX
        __asm__ volatile(
            PREFETCHW" %0\n\t"
            PREFETCH" %1\n\t"
            PREFETCH" %2\n\t"
            ::"m"(*dstbase),"m"(*srca),"m"(*src):"memory");
        for(x=0;x<w;x+=8){
            __asm__ volatile(
                "movl %1, %%eax\n\t"
                "orl 4%1, %%eax\n\t"
                " jz 1f\n\t"
                PREFETCHW" 32%0\n\t"
                PREFETCH" 32%1\n\t"
                PREFETCH" 32%2\n\t"
                "movq %0, %%mm0\n\t" // dstbase
                "movq %%mm0, %%mm1\n\t"
                "pand %%mm4, %%mm0\n\t" //0Y0Y0Y0Y
                "psrlw $8, %%mm1\n\t" //0Y0Y0Y0Y
                "movq %1, %%mm2\n\t" //srca HGFEDCBA
                "paddb %%mm7, %%mm2\n\t"
                "movq %%mm2, %%mm3\n\t"
                "pand %%mm4, %%mm2\n\t" //0G0E0C0A
                "psrlw $8, %%mm3\n\t" //0H0F0D0B
                "pmullw %%mm2, %%mm0\n\t"
                "pmullw %%mm3, %%mm1\n\t"
                "psrlw $8, %%mm0\n\t"
                "pand %%mm5, %%mm1\n\t"
                "por %%mm1, %%mm0\n\t"
                "paddb %2, %%mm0\n\t"
                "movq %%mm0, %0\n\t"
                "1:\n\t"
                :: "m" (dstbase[x]), "m" (srca[x]), "m" (src[x])
                : "%eax");
        }
#elif HAVE_SSE2
        __m128i zero = _mm_setzero_si128();
        for(x=0;x+15<w;x+=16){
            __m128i mmsrc, mmdst, res;
            __m128i mmsrca = _mm_load_si128((const __m128i *)(srca + x));

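            // all 16 alpha bytes zero -> nothing to draw in this chunk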
            int alpha = _mm_movemask_epi8(_mm_cmpeq_epi8(mmsrca, zero));
            if (alpha == 0xffff) continue;

            mmdst = _mm_loadu_si128((const __m128i *)(dstbase + x));
            mmsrc = _mm_load_si128((const __m128i *)(src + x));

            res = muladd_src(mmdst, mmsrc, mmsrca);

            // _mm_maskmoveu_si128 would be an alternative but slower
            res = alphamask(mmdst, res, mmsrca);
            _mm_storeu_si128((__m128i *)(dstbase + x), res);
        }
        for(;x<w;x++){
            if(srca[x]) dstbase[x]=((dstbase[x]*srca[x])>>8)+src[x];
        }
#else /* HAVE_SSE2 */
        for(x=0;x<w;x++){
#ifdef FAST_OSD
            if(srca[2*x+0]) dstbase[2*x+0]=src[2*x+0];
            if(srca[2*x+1]) dstbase[2*x+1]=src[2*x+1];
#else
            if(srca[x]) dstbase[x]=((dstbase[x]*srca[x])>>8)+src[x];
#endif
        }
#endif
        src+=srcstride;
        srca+=srcstride;
        dstbase+=dststride;
    }
#if HAVE_MMX
    __asm__ volatile(EMMS:::"memory");
#endif
    return;
}

#if HAVE_SSE2
ATTR_TARGET_SSE2
#endif
static inline void RENAME(vo_draw_alpha_yuy2)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
    int y;
#if defined(FAST_OSD) && !HAVE_MMX
    w=w>>1;
#endif
#if HAVE_MMX
    __asm__ volatile(
        "pxor %%mm7, %%mm7\n\t"
        "pcmpeqb %%mm5, %%mm5\n\t" // F..F
        "movq %%mm5, %%mm6\n\t"
        "movq %%mm5, %%mm4\n\t"
        "psllw $8, %%mm5\n\t" //FF00FF00FF00
        "psrlw $8, %%mm4\n\t" //00FF00FF00FF
        ::);
#endif
    for(y=0;y<h;y++){
        register int x;
#if HAVE_MMX
        __asm__ volatile(
            PREFETCHW" %0\n\t"
            PREFETCH" %1\n\t"
            PREFETCH" %2\n\t"
            ::"m"(*dstbase),"m"(*srca),"m"(*src));
        for(x=0;x<w;x+=4){
            __asm__ volatile(
                "movl %1, %%eax\n\t"
                "orl %%eax, %%eax\n\t"
                " jz 1f\n\t"
                PREFETCHW" 32%0\n\t"
                PREFETCH" 32%1\n\t"
                PREFETCH" 32%2\n\t"
                "movq %0, %%mm0\n\t" // dstbase
                "movq %%mm0, %%mm1\n\t"
                "pand %%mm4, %%mm0\n\t" //0Y0Y0Y0Y
                "movd %%eax, %%mm2\n\t" //srca 0000DCBA
                "paddb %%mm6, %%mm2\n\t"
                "punpcklbw %%mm7, %%mm2\n\t" //srca 0D0C0B0A
                "pmullw %%mm2, %%mm0\n\t"
                "psrlw $8, %%mm0\n\t"
                "pand %%mm5, %%mm1\n\t" //U0V0U0V0
                "movd %2, %%mm2\n\t" //src 0000DCBA
                "punpcklbw %%mm7, %%mm2\n\t" //src 0D0C0B0A
                "por %%mm1, %%mm0\n\t"
                "paddb %%mm2, %%mm0\n\t"
                "movq %%mm0, %0\n\t"
                "1:\n\t"
                :: "m" (dstbase[x*2]), "m" (srca[x]), "m" (src[x])
                : "%eax");
        }
#elif HAVE_SSE2
        __m128i zero = _mm_setzero_si128();
        __m128i ymask = _mm_set1_epi16(0xff);
        __m128i uvofs = _mm_set1_epi16(0x8000);
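        // YUY2 keeps Y in the low byte and U/V in the high byte of each
        // 16-bit pair: ymask extracts luma, and XORing with 0x8000 flips the
        // chroma sign bit so U/V can be blended as signed values centered
        // on zero.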
        for(x=0;x+15<w;x+=16){
            __m128i mmsrc, mmsrcalo, mmsrcahi, mmdst, mmdst2, mmlow, mmhigh, mmy, mmuv, res;
            __m128i mmsrca = _mm_load_si128((const __m128i *)(srca + x));
            int alpha = _mm_movemask_epi8(_mm_cmpeq_epi8(mmsrca, zero));
            if (alpha == 0xffff) continue;

            mmdst = _mm_loadu_si128((const __m128i *)(dstbase + 2*x));
            mmdst2 = _mm_loadu_si128((const __m128i *)(dstbase + 2*x + 16));

            // convert UV to signed
            mmdst = _mm_xor_si128(mmdst, uvofs);
            mmdst2 = _mm_xor_si128(mmdst2, uvofs);

            mmsrc = _mm_load_si128((const __m128i *)(src + x));
            mmsrcalo = _mm_unpacklo_epi8(zero, mmsrca);
            mmsrcahi = _mm_unpackhi_epi8(zero, mmsrca);

            mmy = muladd_src_unpacked(_mm_and_si128(mmdst, ymask), _mm_and_si128(mmdst2, ymask), mmsrc, mmsrcalo, mmsrcahi);

            // mmuv = ((dst(uv) ^ 128) * (srca * 256)) / 65536 ^ 128 (= (((dst - 128) * srca) >> 8) + 128)
            mmlow = _mm_srai_epi16(mmdst, 8);
            mmlow = _mm_mulhi_epi16(mmlow, mmsrcalo);
            mmhigh = _mm_srai_epi16(mmdst2, 8);
            mmhigh = _mm_mulhi_epi16(mmhigh, mmsrcahi);

            mmuv = _mm_packs_epi16(mmlow, mmhigh);

            res = _mm_unpacklo_epi8(mmy, mmuv);
            res = alphamask16(mmdst, res, mmsrcalo);
            // convert UV to unsigned
            res = _mm_xor_si128(res, uvofs);
            _mm_storeu_si128((__m128i *)(dstbase + 2 * x), res);

            res = _mm_unpackhi_epi8(mmy, mmuv);
            res = alphamask16(mmdst2, res, mmsrcahi);
            // convert UV to unsigned
            res = _mm_xor_si128(res, uvofs);
            _mm_storeu_si128((__m128i *)(dstbase + 2 * x + 16), res);
        }
        for(;x<w;x++){
            if(srca[x]) {
                dstbase[2*x]=((dstbase[2*x]*srca[x])>>8)+src[x];
                dstbase[2*x+1]=((((signed)dstbase[2*x+1]-128)*srca[x])>>8)+128;
            }
        }
#else /* HAVE_SSE2 */
        for(x=0;x<w;x++){
#ifdef FAST_OSD
            if(srca[2*x+0]) dstbase[4*x+0]=src[2*x+0];
            if(srca[2*x+1]) dstbase[4*x+2]=src[2*x+1];
#else
            if(srca[x]) {
                dstbase[2*x]=((dstbase[2*x]*srca[x])>>8)+src[x];
                dstbase[2*x+1]=((((signed)dstbase[2*x+1]-128)*srca[x])>>8)+128;
            }
#endif
        }
#endif
        src+=srcstride;
        srca+=srcstride;
        dstbase+=dststride;
    }
#if HAVE_MMX
    __asm__ volatile(EMMS:::"memory");
#endif
    return;
}

#if HAVE_SSE2
ATTR_TARGET_SSE2
#endif
static inline void RENAME(vo_draw_alpha_uyvy)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
    int y;
#if defined(FAST_OSD)
    w=w>>1;
#endif
    for(y=0;y<h;y++){
        register int x;
#if HAVE_SSE2
        __m128i zero = _mm_setzero_si128();
        __m128i uvofs = _mm_set1_epi16(0x80);
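        // UYVY is the byte-swapped counterpart of YUY2: U/V sit in the low
        // byte and Y in the high byte of each 16-bit pair, so the sign-bit
        // offset is 0x80 and luma is extracted with a shift rather than a
        // mask (compare the YUY2 renderer above).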
        for(x=0;x+15<w;x+=16){
            __m128i mmsrc, mmsrcalo, mmsrcahi, mmdst, mmdst2, mmlow, mmhigh, mmy, mmuv, res;
            __m128i mmsrca = _mm_load_si128((const __m128i *)(srca + x));
            int alpha = _mm_movemask_epi8(_mm_cmpeq_epi8(mmsrca, zero));
            if (alpha == 0xffff) continue;

            mmdst = _mm_loadu_si128((const __m128i *)(dstbase + 2*x));
            mmdst2 = _mm_loadu_si128((const __m128i *)(dstbase + 2*x + 16));

            // convert UV to signed
            mmdst = _mm_xor_si128(mmdst, uvofs);
            mmdst2 = _mm_xor_si128(mmdst2, uvofs);

            mmsrc = _mm_load_si128((const __m128i *)(src + x));
            mmsrcalo = _mm_unpacklo_epi8(zero, mmsrca);
            mmsrcahi = _mm_unpackhi_epi8(zero, mmsrca);

            mmy = muladd_src_unpacked(_mm_srli_epi16(mmdst, 8), _mm_srli_epi16(mmdst2, 8), mmsrc, mmsrcalo, mmsrcahi);

            // mmuv = ((dst(uv) ^ 128) * (srca * 256)) / 65536 ^ 128 (= (((dst - 128) * srca) >> 8) + 128)
            // sign-extend and multiply
            mmlow = _mm_slli_epi16(mmdst, 8);
            mmlow = _mm_srai_epi16(mmlow, 8);
            mmlow = _mm_mulhi_epi16(mmlow, mmsrcalo);
            mmhigh = _mm_slli_epi16(mmdst2, 8);
            mmhigh = _mm_srai_epi16(mmhigh, 8);
            mmhigh = _mm_mulhi_epi16(mmhigh, mmsrcahi);

            mmuv = _mm_packs_epi16(mmlow, mmhigh);

            res = _mm_unpacklo_epi8(mmuv, mmy);
            res = alphamask16(mmdst, res, mmsrcalo);
            // convert UV to unsigned
            res = _mm_xor_si128(res, uvofs);
            _mm_storeu_si128((__m128i *)(dstbase + 2 * x), res);

            res = _mm_unpackhi_epi8(mmuv, mmy);
            res = alphamask16(mmdst2, res, mmsrcahi);
            // convert UV to unsigned
            res = _mm_xor_si128(res, uvofs);
            _mm_storeu_si128((__m128i *)(dstbase + 2 * x + 16), res);
        }
        for(;x<w;x++){
            if(srca[x]) {
                dstbase[2*x+1]=((dstbase[2*x+1]*srca[x])>>8)+src[x];
                dstbase[2*x]=((((signed)dstbase[2*x]-128)*srca[x])>>8)+128;
            }
        }
#else /* HAVE_SSE2 */
        for(x=0;x<w;x++){
#ifdef FAST_OSD
            if(srca[2*x+0]) dstbase[4*x+2]=src[2*x+0];
            if(srca[2*x+1]) dstbase[4*x+0]=src[2*x+1];
#else
            if(srca[x]) {
                dstbase[2*x+1]=((dstbase[2*x+1]*srca[x])>>8)+src[x];
                dstbase[2*x]=((((signed)dstbase[2*x]-128)*srca[x])>>8)+128;
            }
#endif
        }
#endif
        src+=srcstride;
        srca+=srcstride;
        dstbase+=dststride;
    }
}

#define REPL3X(out, sd1, sa1, sd2, sa2, in) \
    do { \
        __m128i shuf012 = _mm_shufflelo_epi16(in, 0x40); \
        __m128i shuf345 = _mm_shufflelo_epi16(in, 0xA5); \
        __m128i repl3x_mmtmp = _mm_unpacklo_epi64(shuf012, shuf345); \
        repl3x_mmtmp = _mm_and_si128(repl3x_mmtmp, one_in_three_mask); \
        out = _mm_or_si128(_mm_or_si128(repl3x_mmtmp, _mm_s##sd1##li_si128(repl3x_mmtmp, sa1)), _mm_s##sd2##li_si128(repl3x_mmtmp, sa2)); \
    } while (0)
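
// REPL3X above expands the six low bytes b0..b5 of `in` into the 16-byte
// pattern b0 b0 b0 b1 b1 b1 ... so that one OSD byte covers all three
// channels of an RGB24 pixel: the two shuffles lay the bytes out word-wise,
// the AND with one_in_three_mask (a variable expected in the caller's scope)
// keeps every third byte, and the two byte-shifts fill in the remaining
// positions. The shift directions (sd1/sd2: l or r) and amounts (sa1/sa2)
// select the phase of the 3-byte pattern within each 16-byte third of a
// 48-byte run.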

#if HAVE_SSE2
ATTR_TARGET_SSE2
#endif
static inline void RENAME(vo_draw_alpha_rgb24)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
    int y;
#if HAVE_MMX
    __asm__ volatile(
        "pxor %%mm7, %%mm7\n\t"
        "pcmpeqb %%mm6, %%mm6\n\t" // F..F
        ::);
#endif
    for(y=0;y<h;y++){
        register unsigned char *dst = dstbase;
        register int x;
#if ARCH_X86 && (!ARCH_X86_64 || HAVE_MMX || HAVE_SSE2)
#if HAVE_MMX
        __asm__ volatile(
            PREFETCHW" %0\n\t"
            PREFETCH" %1\n\t"
            PREFETCH" %2\n\t"
            ::"m"(*dst),"m"(*srca),"m"(*src):"memory");
        for(x=0;x<w;x+=2){
            if(srca[x] || srca[x+1])
                __asm__ volatile(
                    PREFETCHW" 32%0\n\t"
                    PREFETCH" 32%1\n\t"
                    PREFETCH" 32%2\n\t"
                    "movq %0, %%mm0\n\t" // dstbase
                    "movq %%mm0, %%mm1\n\t"
                    "movq %%mm0, %%mm5\n\t"
                    "punpcklbw %%mm7, %%mm0\n\t"
                    "punpckhbw %%mm7, %%mm1\n\t"
                    "movd %1, %%mm2\n\t" // srca ABCD0000
                    "paddb %%mm6, %%mm2\n\t"
                    "punpcklbw %%mm2, %%mm2\n\t" // srca AABBCCDD
                    "punpcklbw %%mm2, %%mm2\n\t" // srca AAAABBBB
                    "psrlq $8, %%mm2\n\t" // srca AAABBBB0
                    "movq %%mm2, %%mm3\n\t"
                    "punpcklbw %%mm7, %%mm2\n\t" // srca 0A0A0A0B
                    "punpckhbw %%mm7, %%mm3\n\t" // srca 0B0B0B00
                    "pmullw %%mm2, %%mm0\n\t"
                    "pmullw %%mm3, %%mm1\n\t"
                    "psrlw $8, %%mm0\n\t"
                    "psrlw $8, %%mm1\n\t"
                    "packuswb %%mm1, %%mm0\n\t"
                    "movd %2, %%mm2 \n\t" // src ABCD0000
                    "punpcklbw %%mm2, %%mm2\n\t" // src AABBCCDD
                    "punpcklbw %%mm2, %%mm2\n\t" // src AAAABBBB
                    "psrlq $8, %%mm2\n\t" // src AAABBBB0
                    "paddb %%mm2, %%mm0\n\t"
                    "pand %4, %%mm5\n\t"
                    "pand %3, %%mm0\n\t"
                    "por %%mm0, %%mm5\n\t"
                    "movq %%mm5, %0\n\t"
                    :: "m" (dst[0]), "m" (srca[x]), "m" (src[x]), "m"(mask24hl), "m"(mask24lh));
            dst += 6;
        }
#elif HAVE_SSE2
        __m128i one_in_three_mask = _mm_set_epi32(0xff0000ffu, 0x0000ff00u, 0x00ff0000u, 0xff0000ffu);
        __m128i zero = _mm_setzero_si128();
        for(x=0;x+15<w;x+=16){
            __m128i mmsrc, mmtmp, mmtmpa, mmdst, res;
            __m128i mmsrca = _mm_load_si128((const __m128i *)(srca + x));
            int alpha = _mm_movemask_epi8(_mm_cmpeq_epi8(mmsrca, zero));
            if (alpha == 0xffff) { dst += 48; continue; }
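            // 16 OSD pixels expand to 48 bytes of RGB24, processed as three
            // 16-byte chunks below; alpha bits 0-5, 5-10 and 10-15 cover the
            // source pixels feeding each chunk (boundary pixels straddle
            // two), so a chunk whose bits are all set is left untouched.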

            mmsrc = _mm_load_si128((const __m128i *)(src + x));

            if ((alpha & 0x3f) != 0x3f) {
                mmdst = _mm_loadu_si128((const __m128i *)dst);
                REPL3X(mmtmpa, l, 1, l, 2, mmsrca);
                REPL3X(mmtmp, l, 1, l, 2, mmsrc);
                res = muladd_src(mmdst, mmtmp, mmtmpa);
                res = alphamask(mmdst, res, mmtmpa);
                _mm_storeu_si128((__m128i *)dst, res);
            }
            dst += 16;

            mmsrca = _mm_srli_si128(mmsrca, 5);
            mmsrc = _mm_srli_si128(mmsrc, 5);

            if ((alpha & 0x7e0) != 0x7e0) {
                mmdst = _mm_loadu_si128((const __m128i *)dst);
                REPL3X(mmtmpa, l, 1, r, 1, mmsrca);
                REPL3X(mmtmp, l, 1, r, 1, mmsrc);
                res = muladd_src(mmdst, mmtmp, mmtmpa);
                res = alphamask(mmdst, res, mmtmpa);
                _mm_storeu_si128((__m128i *)dst, res);
            }
            dst += 16;

            mmsrc = _mm_srli_si128(mmsrc, 5);
            mmsrca = _mm_srli_si128(mmsrca, 5);

            if ((alpha & 0xfc00) != 0xfc00) {
                mmdst = _mm_loadu_si128((const __m128i *)dst);
                REPL3X(mmtmpa, r, 1, r, 2, mmsrca);
                REPL3X(mmtmp, r, 1, r, 2, mmsrc);
                res = muladd_src(mmdst, mmtmp, mmtmpa);
                res = alphamask(mmdst, res, mmtmpa);
                _mm_storeu_si128((__m128i *)dst, res);
            }
            dst += 16;
        }
        for(;x<w;x++){
            if(srca[x]){
                dst[0]=((dst[0]*srca[x])>>8)+src[x];
                dst[1]=((dst[1]*srca[x])>>8)+src[x];
                dst[2]=((dst[2]*srca[x])>>8)+src[x];
            }
            dst+=3; // 24bpp
        }
#else /* HAVE_SSE2 */
        for(x=0;x<w;x++){
            if(srca[x]){
                __asm__ volatile(
                    "movzbl (%0), %%ecx\n\t"
                    "movzbl 1(%0), %%eax\n\t"

                    "imull %1, %%ecx\n\t"
                    "imull %1, %%eax\n\t"

                    "addl %2, %%ecx\n\t"
                    "addl %2, %%eax\n\t"

                    "movb %%ch, (%0)\n\t"
                    "movb %%ah, 1(%0)\n\t"

                    "movzbl 2(%0), %%eax\n\t"
                    "imull %1, %%eax\n\t"
                    "addl %2, %%eax\n\t"
                    "movb %%ah, 2(%0)\n\t"
                    :
                    :"D" (dst),
                     "r" ((unsigned)srca[x]),
                     "r" (((unsigned)src[x])<<8)
                    :"%eax", "%ecx"
                );
            }
            dst += 3;
        }
#endif /* !HAVE_MMX */
#else /*non x86 arch or x86_64 with MMX and SSE2 disabled */
        for(x=0;x<w;x++){
            if(srca[x]){
#ifdef FAST_OSD
                dst[0]=dst[1]=dst[2]=src[x];
#else
                dst[0]=((dst[0]*srca[x])>>8)+src[x];
                dst[1]=((dst[1]*srca[x])>>8)+src[x];
                dst[2]=((dst[2]*srca[x])>>8)+src[x];
#endif
            }
            dst+=3; // 24bpp
        }
#endif /* arch_x86 */
        src+=srcstride;
        srca+=srcstride;
        dstbase+=dststride;
    }
#if HAVE_MMX
    __asm__ volatile(EMMS:::"memory");
#endif
    return;
}

#if HAVE_SSE2
ATTR_TARGET_SSE2
#endif
static inline void RENAME(vo_draw_alpha_rgb32)(int w,int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase,int dststride){
    int y;
#if HAVE_BIGENDIAN
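    // 4 bytes per pixel, of which the loops below only touch the first
    // three; on big-endian layouts the unused byte comes first, so step
    // past it to reach the color components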
    dstbase++;
#endif
#if HAVE_MMX
#if HAVE_AMD3DNOW
    __asm__ volatile(
        "pxor %%mm7, %%mm7\n\t"
        "pcmpeqb %%mm6, %%mm6\n\t" // F..F
        ::);
#else /* HAVE_AMD3DNOW */
    __asm__ volatile(
        "pxor %%mm7, %%mm7\n\t"
        "pcmpeqb %%mm5, %%mm5\n\t" // F..F
        "movq %%mm5, %%mm4\n\t"
        "psllw $8, %%mm5\n\t" //FF00FF00FF00
        "psrlw $8, %%mm4\n\t" //00FF00FF00FF
        ::);
#endif /* HAVE_AMD3DNOW */
#endif /* HAVE_MMX */
    for(y=0;y<h;y++){
        register int x;
#if ARCH_X86 && (!ARCH_X86_64 || HAVE_MMX || HAVE_SSE2)
#if HAVE_MMX
#if HAVE_AMD3DNOW
        __asm__ volatile(
            PREFETCHW" %0\n\t"
            PREFETCH" %1\n\t"
            PREFETCH" %2\n\t"
            ::"m"(*dstbase),"m"(*srca),"m"(*src):"memory");
        for(x=0;x<w;x+=2){
            if(srca[x] || srca[x+1])
                __asm__ volatile(
                    PREFETCHW" 32%0\n\t"
                    PREFETCH" 32%1\n\t"
                    PREFETCH" 32%2\n\t"
                    "movq %0, %%mm0\n\t" // dstbase
                    "movq %%mm0, %%mm1\n\t"
                    "punpcklbw %%mm7, %%mm0\n\t"
                    "punpckhbw %%mm7, %%mm1\n\t"
                    "movd %1, %%mm2\n\t" // srca ABCD0000
                    "paddb %%mm6, %%mm2\n\t"
                    "punpcklbw %%mm2, %%mm2\n\t" // srca AABBCCDD
                    "punpcklbw %%mm2, %%mm2\n\t" // srca AAAABBBB
                    "movq %%mm2, %%mm3\n\t"
                    "punpcklbw %%mm7, %%mm2\n\t" // srca 0A0A0A0A
                    "punpckhbw %%mm7, %%mm3\n\t" // srca 0B0B0B0B
                    "pmullw %%mm2, %%mm0\n\t"
                    "pmullw %%mm3, %%mm1\n\t"
                    "psrlw $8, %%mm0\n\t"
                    "psrlw $8, %%mm1\n\t"
                    "packuswb %%mm1, %%mm0\n\t"
                    "movd %2, %%mm2 \n\t" // src ABCD0000
                    "punpcklbw %%mm2, %%mm2\n\t" // src AABBCCDD
                    "punpcklbw %%mm2, %%mm2\n\t" // src AAAABBBB
                    "paddb %%mm2, %%mm0\n\t"
                    "movq %%mm0, %0\n\t"
                    :: "m" (dstbase[4*x]), "m" (srca[x]), "m" (src[x]));
        }
#else // this is faster on Intel CPUs
        __asm__ volatile(
            PREFETCHW" %0\n\t"
            PREFETCH" %1\n\t"
            PREFETCH" %2\n\t"
            ::"m"(*dstbase),"m"(*srca),"m"(*src):"memory");
        for(x=0;x<w;x+=4){
            __asm__ volatile(
                "movl %1, %%eax\n\t"
                "orl %%eax, %%eax\n\t"
                " jz 1f\n\t"
                PREFETCHW" 32%0\n\t"
                PREFETCH" 32%1\n\t"
                PREFETCH" 32%2\n\t"
                "movq %0, %%mm0\n\t" // dstbase
                "movq %%mm0, %%mm1\n\t"
                "pand %%mm4, %%mm0\n\t" //0R0B0R0B
                "psrlw $8, %%mm1\n\t" //0?0G0?0G
                "movd %%eax, %%mm2\n\t" //srca 0000DCBA
                "paddb %3, %%mm2\n\t"
                "punpcklbw %%mm2, %%mm2\n\t" //srca DDCCBBAA
                "movq %%mm2, %%mm3\n\t"
                "punpcklbw %%mm7, %%mm2\n\t" //srca 0B0B0A0A
                "pmullw %%mm2, %%mm0\n\t"
                "pmullw %%mm2, %%mm1\n\t"
                "psrlw $8, %%mm0\n\t"
                "pand %%mm5, %%mm1\n\t"
                "por %%mm1, %%mm0\n\t"
                "movd %2, %%mm2 \n\t" //src 0000DCBA
                "punpcklbw %%mm2, %%mm2\n\t" //src DDCCBBAA
                "movq %%mm2, %%mm6\n\t"
                "punpcklbw %%mm2, %%mm2\n\t" //src BBBBAAAA
                "paddb %%mm2, %%mm0\n\t"
                "movq %%mm0, %0\n\t"

                "movq 8%0, %%mm0\n\t" // dstbase
                "movq %%mm0, %%mm1\n\t"
                "pand %%mm4, %%mm0\n\t" //0R0B0R0B
                "psrlw $8, %%mm1\n\t" //0?0G0?0G
                "punpckhbw %%mm7, %%mm3\n\t" //srca 0D0D0C0C
                "pmullw %%mm3, %%mm0\n\t"
                "pmullw %%mm3, %%mm1\n\t"
                "psrlw $8, %%mm0\n\t"
                "pand %%mm5, %%mm1\n\t"
                "por %%mm1, %%mm0\n\t"
                "punpckhbw %%mm6, %%mm6\n\t" //src DDDDCCCC
                "paddb %%mm6, %%mm0\n\t"
                "movq %%mm0, 8%0\n\t"
                "1:\n\t"
                :: "m" (dstbase[4*x]), "m" (srca[x]), "m" (src[x]), "m" (bFF)
                : "%eax");
        }
#endif
#elif HAVE_SSE2
        __m128i zero = _mm_setzero_si128();
        __m128i mmsrca = _mm_setzero_si128();
        // hoisted out of the loop: mmsrc is loaded once per 16-pixel block
        // and must keep its value across the following iterations
        __m128i mmsrc = _mm_setzero_si128();
        for(x=0;x<w;x+=4){
            __m128i mmdst, mmsrcexp, mmsrcaexp, res;
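            // reload alpha/src every fourth iteration; a block of 16 zero
            // alpha bytes is skipped wholesale (x += 12 plus the loop's + 4)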
689 if ((x & 15) == 0) {
690 int alpha;
691 mmsrca = _mm_load_si128((const __m128i *)(srca + x));
692 alpha = _mm_movemask_epi8(_mm_cmpeq_epi8(mmsrca, zero));
693 if (alpha == 0xffff) { x += 12; continue; }
694 mmsrc = _mm_load_si128((const __m128i *)(src + x));
695 }
696
697 mmdst = _mm_loadu_si128((const __m128i *)(dstbase + 4*x));
698
699 mmsrcaexp = _mm_unpacklo_epi8(mmsrca, mmsrca);
700 mmsrcaexp = _mm_unpacklo_epi8(mmsrcaexp, mmsrcaexp);
701 mmsrcexp = _mm_unpacklo_epi8(mmsrc, mmsrc);
702 mmsrcexp = _mm_unpacklo_epi8(mmsrcexp, mmsrcexp);
703
704 res = muladd_src(mmdst, mmsrcexp, mmsrcaexp);
705
706 res = alphamask(mmdst, res, mmsrcaexp);
707 _mm_storeu_si128((__m128i *)(dstbase + 4*x), res);
708
709 mmsrca = _mm_srli_si128(mmsrca, 4);
710 mmsrc = _mm_srli_si128(mmsrc, 4);
711 }
712 for(;x<w;x++){
713 if(srca[x]){
714 dstbase[4*x+0]=((dstbase[4*x+0]*srca[x])>>8)+src[x];
715 dstbase[4*x+1]=((dstbase[4*x+1]*srca[x])>>8)+src[x];
716 dstbase[4*x+2]=((dstbase[4*x+2]*srca[x])>>8)+src[x];
717 }
718 }
719 #else /* HAVE_SSE2 */
720 for(x=0;x<w;x++){
721 if(srca[x]){
722 __asm__ volatile(
723 "movzbl (%0), %%ecx\n\t"
724 "movzbl 1(%0), %%eax\n\t"
725 "movzbl 2(%0), %%edx\n\t"
726
727 "imull %1, %%ecx\n\t"
728 "imull %1, %%eax\n\t"
729 "imull %1, %%edx\n\t"
730
731 "addl %2, %%ecx\n\t"
732 "addl %2, %%eax\n\t"
733 "addl %2, %%edx\n\t"
734
735 "movb %%ch, (%0)\n\t"
736 "movb %%ah, 1(%0)\n\t"
737 "movb %%dh, 2(%0)\n\t"
738
739 :
740 :"r" (&dstbase[4*x]),
741 "r" ((unsigned)srca[x]),
742 "r" (((unsigned)src[x])<<8)
743 :"%eax", "%ecx", "%edx"
744 );
745 }
746 }
747 #endif /* HAVE_MMX */
748 #else /*non x86 arch or x86_64 with MMX disabled */
749 for(x=0;x<w;x++){
750 if(srca[x]){
751 #ifdef FAST_OSD
752 dstbase[4*x+0]=dstbase[4*x+1]=dstbase[4*x+2]=src[x];
753 #else
754 dstbase[4*x+0]=((dstbase[4*x+0]*srca[x])>>8)+src[x];
755 dstbase[4*x+1]=((dstbase[4*x+1]*srca[x])>>8)+src[x];
756 dstbase[4*x+2]=((dstbase[4*x+2]*srca[x])>>8)+src[x];
757 #endif
758 }
759 }
760 #endif /* arch_x86 */
761 src+=srcstride;
762 srca+=srcstride;
763 dstbase+=dststride;
764 }
765 #if HAVE_MMX
766 __asm__ volatile(EMMS:::"memory");
767 #endif
768 return;
769 }