// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4 
5 #include "yuv_row.h"
6 #include "mozilla/SSE.h"
7 
// Addresses of the U and V lookup tables, expressed as byte offsets from
// kCoefficientsRgbY. The assembly in this file indexes every table as
// [table + 8 * value] with a byte-sized value, so each 2048-byte step
// corresponds to 256 eight-byte entries.
// NOTE(review): intentionally left unparenthesized, as in the original; in
// this file the macros are only expanded inside [...] address expressions.
#define kCoefficientsRgbU kCoefficientsRgbY + 2048
#define kCoefficientsRgbV kCoefficientsRgbY + 4096
10 
11 extern "C" {
12 
13 #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
#if defined(__clang__)
// clang-cl has a bug where it does not mangle names referenced from inline
// asm, so the leading underscore is added by the preprocessor instead (ugh).
// A dummy extern declaration is still required so that the parser knows the
// underscored symbol.
extern void* _kCoefficientsRgbY;
#define kCoefficientsRgbY _kCoefficientsRgbY
#endif
21 
22 __declspec(naked)
FastConvertYUVToRGB32Row_SSE(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * rgb_buf,int width)23 void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
24                                   const uint8* u_buf,
25                                   const uint8* v_buf,
26                                   uint8* rgb_buf,
27                                   int width) {
28   __asm {
29     pushad
30     mov       edx, [esp + 32 + 4]   // Y
31     mov       edi, [esp + 32 + 8]   // U
32     mov       esi, [esp + 32 + 12]  // V
33     mov       ebp, [esp + 32 + 16]  // rgb
34     mov       ecx, [esp + 32 + 20]  // width
35     jmp       convertend
36 
37  convertloop :
38     movzx     eax, byte ptr [edi]
39     add       edi, 1
40     movzx     ebx, byte ptr [esi]
41     add       esi, 1
42     movq      mm0, [kCoefficientsRgbU + 8 * eax]
43     movzx     eax, byte ptr [edx]
44     paddsw    mm0, [kCoefficientsRgbV + 8 * ebx]
45     movzx     ebx, byte ptr [edx + 1]
46     movq      mm1, [kCoefficientsRgbY + 8 * eax]
47     add       edx, 2
48     movq      mm2, [kCoefficientsRgbY + 8 * ebx]
49     paddsw    mm1, mm0
50     paddsw    mm2, mm0
51     psraw     mm1, 6
52     psraw     mm2, 6
53     packuswb  mm1, mm2
54     movntq    [ebp], mm1
55     add       ebp, 8
56  convertend :
57     sub       ecx, 2
58     jns       convertloop
59 
60     and       ecx, 1  // odd number of pixels?
61     jz        convertdone
62 
63     movzx     eax, byte ptr [edi]
64     movq      mm0, [kCoefficientsRgbU + 8 * eax]
65     movzx     eax, byte ptr [esi]
66     paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
67     movzx     eax, byte ptr [edx]
68     movq      mm1, [kCoefficientsRgbY + 8 * eax]
69     paddsw    mm1, mm0
70     psraw     mm1, 6
71     packuswb  mm1, mm1
72     movd      [ebp], mm1
73  convertdone :
74 
75     popad
76     ret
77   }
78 }
79 
80 __declspec(naked)
ConvertYUVToRGB32Row_SSE(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * rgb_buf,int width,int step)81 void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
82                               const uint8* u_buf,
83                               const uint8* v_buf,
84                               uint8* rgb_buf,
85                               int width,
86                               int step) {
87   __asm {
88     pushad
89     mov       edx, [esp + 32 + 4]   // Y
90     mov       edi, [esp + 32 + 8]   // U
91     mov       esi, [esp + 32 + 12]  // V
92     mov       ebp, [esp + 32 + 16]  // rgb
93     mov       ecx, [esp + 32 + 20]  // width
94     mov       ebx, [esp + 32 + 24]  // step
95     jmp       wend
96 
97  wloop :
98     movzx     eax, byte ptr [edi]
99     add       edi, ebx
100     movq      mm0, [kCoefficientsRgbU + 8 * eax]
101     movzx     eax, byte ptr [esi]
102     add       esi, ebx
103     paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
104     movzx     eax, byte ptr [edx]
105     add       edx, ebx
106     movq      mm1, [kCoefficientsRgbY + 8 * eax]
107     movzx     eax, byte ptr [edx]
108     add       edx, ebx
109     movq      mm2, [kCoefficientsRgbY + 8 * eax]
110     paddsw    mm1, mm0
111     paddsw    mm2, mm0
112     psraw     mm1, 6
113     psraw     mm2, 6
114     packuswb  mm1, mm2
115     movntq    [ebp], mm1
116     add       ebp, 8
117  wend :
118     sub       ecx, 2
119     jns       wloop
120 
121     and       ecx, 1  // odd number of pixels?
122     jz        wdone
123 
124     movzx     eax, byte ptr [edi]
125     movq      mm0, [kCoefficientsRgbU + 8 * eax]
126     movzx     eax, byte ptr [esi]
127     paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
128     movzx     eax, byte ptr [edx]
129     movq      mm1, [kCoefficientsRgbY + 8 * eax]
130     paddsw    mm1, mm0
131     psraw     mm1, 6
132     packuswb  mm1, mm1
133     movd      [ebp], mm1
134  wdone :
135 
136     popad
137     ret
138   }
139 }
140 
141 __declspec(naked)
RotateConvertYUVToRGB32Row_SSE(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * rgb_buf,int width,int ystep,int uvstep)142 void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,
143                                     const uint8* u_buf,
144                                     const uint8* v_buf,
145                                     uint8* rgb_buf,
146                                     int width,
147                                     int ystep,
148                                     int uvstep) {
149   __asm {
150     pushad
151     mov       edx, [esp + 32 + 4]   // Y
152     mov       edi, [esp + 32 + 8]   // U
153     mov       esi, [esp + 32 + 12]  // V
154     mov       ebp, [esp + 32 + 16]  // rgb
155     mov       ecx, [esp + 32 + 20]  // width
156     jmp       wend
157 
158  wloop :
159     movzx     eax, byte ptr [edi]
160     mov       ebx, [esp + 32 + 28]  // uvstep
161     add       edi, ebx
162     movq      mm0, [kCoefficientsRgbU + 8 * eax]
163     movzx     eax, byte ptr [esi]
164     add       esi, ebx
165     paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
166     movzx     eax, byte ptr [edx]
167     mov       ebx, [esp + 32 + 24]  // ystep
168     add       edx, ebx
169     movq      mm1, [kCoefficientsRgbY + 8 * eax]
170     movzx     eax, byte ptr [edx]
171     add       edx, ebx
172     movq      mm2, [kCoefficientsRgbY + 8 * eax]
173     paddsw    mm1, mm0
174     paddsw    mm2, mm0
175     psraw     mm1, 6
176     psraw     mm2, 6
177     packuswb  mm1, mm2
178     movntq    [ebp], mm1
179     add       ebp, 8
180  wend :
181     sub       ecx, 2
182     jns       wloop
183 
184     and       ecx, 1  // odd number of pixels?
185     jz        wdone
186 
187     movzx     eax, byte ptr [edi]
188     movq      mm0, [kCoefficientsRgbU + 8 * eax]
189     movzx     eax, byte ptr [esi]
190     paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
191     movzx     eax, byte ptr [edx]
192     movq      mm1, [kCoefficientsRgbY + 8 * eax]
193     paddsw    mm1, mm0
194     psraw     mm1, 6
195     packuswb  mm1, mm1
196     movd      [ebp], mm1
197  wdone :
198 
199     popad
200     ret
201   }
202 }
203 
204 __declspec(naked)
DoubleYUVToRGB32Row_SSE(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * rgb_buf,int width)205 void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,
206                              const uint8* u_buf,
207                              const uint8* v_buf,
208                              uint8* rgb_buf,
209                              int width) {
210   __asm {
211     pushad
212     mov       edx, [esp + 32 + 4]   // Y
213     mov       edi, [esp + 32 + 8]   // U
214     mov       esi, [esp + 32 + 12]  // V
215     mov       ebp, [esp + 32 + 16]  // rgb
216     mov       ecx, [esp + 32 + 20]  // width
217     jmp       wend
218 
219  wloop :
220     movzx     eax, byte ptr [edi]
221     add       edi, 1
222     movzx     ebx, byte ptr [esi]
223     add       esi, 1
224     movq      mm0, [kCoefficientsRgbU + 8 * eax]
225     movzx     eax, byte ptr [edx]
226     paddsw    mm0, [kCoefficientsRgbV + 8 * ebx]
227     movq      mm1, [kCoefficientsRgbY + 8 * eax]
228     paddsw    mm1, mm0
229     psraw     mm1, 6
230     packuswb  mm1, mm1
231     punpckldq mm1, mm1
232     movntq    [ebp], mm1
233 
234     movzx     ebx, byte ptr [edx + 1]
235     add       edx, 2
236     paddsw    mm0, [kCoefficientsRgbY + 8 * ebx]
237     psraw     mm0, 6
238     packuswb  mm0, mm0
239     punpckldq mm0, mm0
240     movntq    [ebp+8], mm0
241     add       ebp, 16
242  wend :
243     sub       ecx, 4
244     jns       wloop
245 
246     add       ecx, 4
247     jz        wdone
248 
249     movzx     eax, byte ptr [edi]
250     movq      mm0, [kCoefficientsRgbU + 8 * eax]
251     movzx     eax, byte ptr [esi]
252     paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
253     movzx     eax, byte ptr [edx]
254     movq      mm1, [kCoefficientsRgbY + 8 * eax]
255     paddsw    mm1, mm0
256     psraw     mm1, 6
257     packuswb  mm1, mm1
258     jmp       wend1
259 
260  wloop1 :
261     movd      [ebp], mm1
262     add       ebp, 4
263  wend1 :
264     sub       ecx, 1
265     jns       wloop1
266  wdone :
267     popad
268     ret
269   }
270 }
271 
272 // This version does general purpose scaling by any amount, up or down.
273 // The only thing it cannot do is rotation by 90 or 270.
274 // For performance the chroma is under-sampled, reducing cost of a 3x
275 // 1080p scale from 8.4 ms to 5.4 ms.
276 __declspec(naked)
ScaleYUVToRGB32Row_SSE(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * rgb_buf,int width,int source_dx)277 void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
278                             const uint8* u_buf,
279                             const uint8* v_buf,
280                             uint8* rgb_buf,
281                             int width,
282                             int source_dx) {
283   __asm {
284     pushad
285     mov       edx, [esp + 32 + 4]   // Y
286     mov       edi, [esp + 32 + 8]   // U
287     mov       esi, [esp + 32 + 12]  // V
288     mov       ebp, [esp + 32 + 16]  // rgb
289     mov       ecx, [esp + 32 + 20]  // width
290     xor       ebx, ebx              // x
291     jmp       scaleend
292 
293  scaleloop :
294     mov       eax, ebx
295     sar       eax, 17
296     movzx     eax, byte ptr [edi + eax]
297     movq      mm0, [kCoefficientsRgbU + 8 * eax]
298     mov       eax, ebx
299     sar       eax, 17
300     movzx     eax, byte ptr [esi + eax]
301     paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
302     mov       eax, ebx
303     add       ebx, [esp + 32 + 24]  // x += source_dx
304     sar       eax, 16
305     movzx     eax, byte ptr [edx + eax]
306     movq      mm1, [kCoefficientsRgbY + 8 * eax]
307     mov       eax, ebx
308     add       ebx, [esp + 32 + 24]  // x += source_dx
309     sar       eax, 16
310     movzx     eax, byte ptr [edx + eax]
311     movq      mm2, [kCoefficientsRgbY + 8 * eax]
312     paddsw    mm1, mm0
313     paddsw    mm2, mm0
314     psraw     mm1, 6
315     psraw     mm2, 6
316     packuswb  mm1, mm2
317     movntq    [ebp], mm1
318     add       ebp, 8
319  scaleend :
320     sub       ecx, 2
321     jns       scaleloop
322 
323     and       ecx, 1  // odd number of pixels?
324     jz        scaledone
325 
326     mov       eax, ebx
327     sar       eax, 17
328     movzx     eax, byte ptr [edi + eax]
329     movq      mm0, [kCoefficientsRgbU + 8 * eax]
330     mov       eax, ebx
331     sar       eax, 17
332     movzx     eax, byte ptr [esi + eax]
333     paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
334     mov       eax, ebx
335     sar       eax, 16
336     movzx     eax, byte ptr [edx + eax]
337     movq      mm1, [kCoefficientsRgbY + 8 * eax]
338     paddsw    mm1, mm0
339     psraw     mm1, 6
340     packuswb  mm1, mm1
341     movd      [ebp], mm1
342 
343  scaledone :
344     popad
345     ret
346   }
347 }
348 
349 __declspec(naked)
LinearScaleYUVToRGB32Row_SSE(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * rgb_buf,int width,int source_dx)350 void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
351                                   const uint8* u_buf,
352                                   const uint8* v_buf,
353                                   uint8* rgb_buf,
354                                   int width,
355                                   int source_dx) {
356   __asm {
357     pushad
358     mov       edx, [esp + 32 + 4]  // Y
359     mov       edi, [esp + 32 + 8]  // U
360                 // [esp + 32 + 12] // V
361     mov       ebp, [esp + 32 + 16] // rgb
362     mov       ecx, [esp + 32 + 20] // width
363     imul      ecx, [esp + 32 + 24] // source_dx
364     mov       [esp + 32 + 20], ecx // source_width = width * source_dx
365     mov       ecx, [esp + 32 + 24] // source_dx
366     xor       ebx, ebx             // x = 0
367     cmp       ecx, 0x20000
368     jl        lscaleend
369     mov       ebx, 0x8000          // x = 0.5 for 1/2 or less
370     jmp       lscaleend
371 lscaleloop:
372     mov       eax, ebx
373     sar       eax, 0x11
374 
375     movzx     ecx, byte ptr [edi + eax]
376     movzx     esi, byte ptr [edi + eax + 1]
377     mov       eax, ebx
378     and       eax, 0x1fffe
379     imul      esi, eax
380     xor       eax, 0x1fffe
381     imul      ecx, eax
382     add       ecx, esi
383     shr       ecx, 17
384     movq      mm0, [kCoefficientsRgbU + 8 * ecx]
385 
386     mov       esi, [esp + 32 + 12]
387     mov       eax, ebx
388     sar       eax, 0x11
389 
390     movzx     ecx, byte ptr [esi + eax]
391     movzx     esi, byte ptr [esi + eax + 1]
392     mov       eax, ebx
393     and       eax, 0x1fffe
394     imul      esi, eax
395     xor       eax, 0x1fffe
396     imul      ecx, eax
397     add       ecx, esi
398     shr       ecx, 17
399     paddsw    mm0, [kCoefficientsRgbV + 8 * ecx]
400 
401     mov       eax, ebx
402     sar       eax, 0x10
403     movzx     ecx, byte ptr [edx + eax]
404     movzx     esi, byte ptr [1 + edx + eax]
405     mov       eax, ebx
406     add       ebx, [esp + 32 + 24]
407     and       eax, 0xffff
408     imul      esi, eax
409     xor       eax, 0xffff
410     imul      ecx, eax
411     add       ecx, esi
412     shr       ecx, 16
413     movq      mm1, [kCoefficientsRgbY + 8 * ecx]
414 
415     cmp       ebx, [esp + 32 + 20]
416     jge       lscalelastpixel
417 
418     mov       eax, ebx
419     sar       eax, 0x10
420     movzx     ecx, byte ptr [edx + eax]
421     movzx     esi, byte ptr [edx + eax + 1]
422     mov       eax, ebx
423     add       ebx, [esp + 32 + 24]
424     and       eax, 0xffff
425     imul      esi, eax
426     xor       eax, 0xffff
427     imul      ecx, eax
428     add       ecx, esi
429     shr       ecx, 16
430     movq      mm2, [kCoefficientsRgbY + 8 * ecx]
431 
432     paddsw    mm1, mm0
433     paddsw    mm2, mm0
434     psraw     mm1, 0x6
435     psraw     mm2, 0x6
436     packuswb  mm1, mm2
437     movntq    [ebp], mm1
438     add       ebp, 0x8
439 
440 lscaleend:
441     cmp       ebx, [esp + 32 + 20]
442     jl        lscaleloop
443     popad
444     ret
445 
446 lscalelastpixel:
447     paddsw    mm1, mm0
448     psraw     mm1, 6
449     packuswb  mm1, mm1
450     movd      [ebp], mm1
451     popad
452     ret
453   };
454 }
455 #endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
456 
FastConvertYUVToRGB32Row(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * rgb_buf,int width)457 void FastConvertYUVToRGB32Row(const uint8* y_buf,
458                               const uint8* u_buf,
459                               const uint8* v_buf,
460                               uint8* rgb_buf,
461                               int width) {
462 #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
463   if (mozilla::supports_sse()) {
464     FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
465     return;
466   }
467 #endif
468 
469   FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
470 }
471 
ScaleYUVToRGB32Row(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * rgb_buf,int width,int source_dx)472 void ScaleYUVToRGB32Row(const uint8* y_buf,
473                         const uint8* u_buf,
474                         const uint8* v_buf,
475                         uint8* rgb_buf,
476                         int width,
477                         int source_dx) {
478 
479 #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
480   if (mozilla::supports_sse()) {
481     ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
482     return;
483   }
484 #endif
485 
486   ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
487 }
488 
LinearScaleYUVToRGB32Row(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * rgb_buf,int width,int source_dx)489 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
490                               const uint8* u_buf,
491                               const uint8* v_buf,
492                               uint8* rgb_buf,
493                               int width,
494                               int source_dx) {
495 #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
496   if (mozilla::supports_sse()) {
497     LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
498                                  source_dx);
499     return;
500   }
501 #endif
502 
503   LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
504 }
505 
506 } // extern "C"
507