//*@@@+++@@@@******************************************************************
//
// Copyright © Microsoft Corp.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// • Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// • Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
//*@@@---@@@@******************************************************************
28 #include "strcodec.h"
29 #include "decode.h"
30 
31 #if defined(WMP_OPT_SSE2)
32 #include <emmintrin.h>
33 #include <windows.h>
34 
35 //================================================================
//================================================================
// SSE2 constant registers shared by the color-conversion kernels in this
// file. No initializers appear here, so they are presumably filled in by an
// init routine elsewhere in the codec — NOTE(review): confirm that the
// initialization path runs before any kernel below executes.
static __m128i g_const_d0;   // not referenced in this part of the file
static __m128i g_const_d1;   // per-dword rounding term used in "(b + 1) >> 1";
                             // presumably each dword == 1 — verify initializer

__m128i g_const_d3;          // exported; used by code outside this view
__m128i g_const_d4;          // exported; used by code outside this view
__m128i g_const_d0x80;       // per-dword bias added to Y before the inverse transform
__m128i g_const_w0x80;       // per-word bias subtracted when clipping to 8-bit range
__m128i g_const_b0x80;       // per-byte bias subtracted after packsswb
44 
45 //================================================================
46 #if defined(WMP_OPT_CC_DEC)
storeRGB24_5(U8 * pbYCoCg,size_t cbYCoCg,const U8 * pbRGB,size_t cbRGB,size_t cmb)47 __declspec(naked) void __stdcall storeRGB24_5(
48     U8* pbYCoCg,
49     size_t cbYCoCg,
50     const U8* pbRGB,
51     size_t cbRGB,
52     size_t cmb)
53 {
54 #define DISP 8
55     UNREFERENCED_PARAMETER( pbYCoCg );
56     UNREFERENCED_PARAMETER( cbYCoCg );
57     UNREFERENCED_PARAMETER( pbRGB );
58     UNREFERENCED_PARAMETER( cbRGB );
59     UNREFERENCED_PARAMETER( cmb );
60     __asm {
61         push ebp
62         push ebx
63         push esi
64         push edi
65 
66         mov ebx, [esp + 36]         // $ebx = cmb
67         mov edi, [esp + 28]         // $edi = pbRGB
68         lea ebx, [ebx + ebx * 2]    // $ebx = cmb * 3
69         mov edx, [esp + 32]         // $edx = cbRGB
70         shl ebx, 4                  // $ebx = cmb * 3 * 16
71         mov esi, [esp + 20]         // $esi = pbYCoCg
72         add edi, ebx                // $edi = pbRGB + 3 * 16 * cmb
73         mov ebp, [esp + 24]         // $ebp = cbYCoCg
74         neg ebx
75 
76         mov eax, esp
77         and esp, 0xffffff80
78         sub esp, 64 * 4 + DISP
79 
80         mov [esp], eax              // original $esp
81         mov [esp + 4], edi
82     }
83 Loop0:
84     __asm {
85         mov edi, [esp + 4]          // $edi = pbRGB + 3 * 16 * cmb
86 
87             // first 8 pixels
88             pxor xmm1, xmm1
89             pxor xmm5, xmm5
90             movdqa xmm0, [esi]
91             movdqa xmm4, [esi + 16]
92             psubd xmm1, [esi + ebp]
93             psubd xmm5, [esi + ebp + 16]
94             movdqa xmm2, [esi + ebp * 2]
95             movdqa xmm6, [esi + ebp * 2 + 16]
96 
97             paddd xmm0, [g_const_d0x80]
98             paddd xmm4, [g_const_d0x80]
99 
100             // ICC
101             movdqa xmm3, xmm1           // g -= r >> 1
102             movdqa xmm7, xmm5
103             psrad xmm3, 1
104             psrad xmm7, 1
105             psubd xmm0, xmm3
106             psubd xmm4, xmm7
107 
108             movdqa xmm3, [g_const_d1]   // r -= ((b + 1) >> 1) - g
109             movdqa xmm7, [g_const_d1]
110             paddd xmm3, xmm2
111             paddd xmm7, xmm6
112             paddd xmm1, xmm0
113             paddd xmm5, xmm4
114             psrad xmm3, 1
115             psrad xmm7, 1
116             psubd xmm1, xmm3
117             psubd xmm5, xmm7
118 
119             paddd xmm2, xmm1            // b += r
120             paddd xmm6, xmm5
121 
122             pslld xmm0, 8
123             pslld xmm2, 16
124             pslld xmm4, 8
125             pslld xmm6, 16
126             por xmm0, xmm1
127             por xmm4, xmm5
128             por xmm0, xmm2
129             por xmm4, xmm6
130 
131             movdqa [esp + DISP + 64 * 0 + 16 * 0], xmm0
132             pslld xmm0, 8
133             movdqa [esp + DISP + 64 * 0 + 16 * 1], xmm4
134             pslld xmm4, 8
135             movdqa [esp + DISP + 64 * 0 + 16 * 2], xmm0
136             movdqa [esp + DISP + 64 * 0 + 16 * 3], xmm4
137 
138             // second 8 pixels
139             pxor xmm1, xmm1
140             pxor xmm5, xmm5
141             movdqa xmm0, [esi + 32]
142             movdqa xmm4, [esi + 48]
143             psubd xmm1, [esi + ebp + 32]
144             psubd xmm5, [esi + ebp + 48]
145             movdqa xmm2, [esi + ebp * 2 + 32]
146             movdqa xmm6, [esi + ebp * 2 + 48]
147 
148             paddd xmm0, [g_const_d0x80]
149             paddd xmm4, [g_const_d0x80]
150 
151             // ICC
152             movdqa xmm3, xmm1           // g -= r >> 1
153             movdqa xmm7, xmm5
154             psrad xmm3, 1
155             psrad xmm7, 1
156             psubd xmm0, xmm3
157             psubd xmm4, xmm7
158 
159             movdqa xmm3, [g_const_d1]   // r -= ((b + 1) >> 1) - g
160             movdqa xmm7, [g_const_d1]
161             paddd xmm3, xmm2
162             paddd xmm7, xmm6
163             paddd xmm1, xmm0
164             paddd xmm5, xmm4
165             psrad xmm3, 1
166             psrad xmm7, 1
167             psubd xmm1, xmm3
168             psubd xmm5, xmm7
169 
170             paddd xmm2, xmm1            // b += r
171             paddd xmm6, xmm5
172 
173             pslld xmm0, 8
174             pslld xmm2, 16
175             pslld xmm4, 8
176             pslld xmm6, 16
177             por xmm0, xmm1
178             por xmm4, xmm5
179             por xmm0, xmm2
180             por xmm4, xmm6
181 
182             movdqa [esp + DISP + 64 * 1 + 16 * 0], xmm0
183             pslld xmm0, 8
184             movdqa [esp + DISP + 64 * 1 + 16 * 1], xmm4
185             pslld xmm4, 8
186             movdqa [esp + DISP + 64 * 1 + 16 * 2], xmm0
187             movdqa [esp + DISP + 64 * 1 + 16 * 3], xmm4
188 
189             //================
190             add esi, 64
191 
192             // first 8 pixels
193             pxor xmm1, xmm1
194             pxor xmm5, xmm5
195             movdqa xmm0, [esi]
196             movdqa xmm4, [esi + 16]
197             psubd xmm1, [esi + ebp]
198             psubd xmm5, [esi + ebp + 16]
199             movdqa xmm2, [esi + ebp * 2]
200             movdqa xmm6, [esi + ebp * 2 + 16]
201 
202             paddd xmm0, [g_const_d0x80]
203             paddd xmm4, [g_const_d0x80]
204 
205             // ICC
206             movdqa xmm3, xmm1           // g -= r >> 1
207             movdqa xmm7, xmm5
208             psrad xmm3, 1
209             psrad xmm7, 1
210             psubd xmm0, xmm3
211             psubd xmm4, xmm7
212 
213             movdqa xmm3, [g_const_d1]   // r -= ((b + 1) >> 1) - g
214             movdqa xmm7, [g_const_d1]
215             paddd xmm3, xmm2
216             paddd xmm7, xmm6
217             paddd xmm1, xmm0
218             paddd xmm5, xmm4
219             psrad xmm3, 1
220             psrad xmm7, 1
221             psubd xmm1, xmm3
222             psubd xmm5, xmm7
223 
224             paddd xmm2, xmm1            // b += r
225             paddd xmm6, xmm5
226 
227             pslld xmm0, 8
228             pslld xmm2, 16
229             pslld xmm4, 8
230             pslld xmm6, 16
231 
232             por xmm0, xmm1
233             por xmm4, xmm5
234             por xmm0, xmm2
235             por xmm4, xmm6
236 
237             movdqa [esp + DISP + 64 * 2 + 16 * 0], xmm0
238             pslld xmm0, 8
239             movdqa [esp + DISP + 64 * 2 + 16 * 1], xmm4
240             pslld xmm4, 8
241             movdqa [esp + DISP + 64 * 2 + 16 * 2], xmm0
242             movdqa [esp + DISP + 64 * 2 + 16 * 3], xmm4
243 
244             // second 8 pixels
245             pxor xmm1, xmm1
246             pxor xmm5, xmm5
247             movdqa xmm0, [esi + 32]
248             movdqa xmm4, [esi + 48]
249             psubd xmm1, [esi + ebp + 32]
250             psubd xmm5, [esi + ebp + 48]
251             movdqa xmm2, [esi + ebp * 2 + 32]
252             movdqa xmm6, [esi + ebp * 2 + 48]
253 
254             paddd xmm0, [g_const_d0x80]
255             paddd xmm4, [g_const_d0x80]
256 
257             // ICC
258             movdqa xmm3, xmm1           // g -= r >> 1
259             movdqa xmm7, xmm5
260             psrad xmm3, 1
261             psrad xmm7, 1
262             psubd xmm0, xmm3
263             psubd xmm4, xmm7
264 
265             movdqa xmm3, [g_const_d1]   // r -= ((b + 1) >> 1) - g
266             movdqa xmm7, [g_const_d1]
267             paddd xmm3, xmm2
268             paddd xmm7, xmm6
269             paddd xmm1, xmm0
270             paddd xmm5, xmm4
271             psrad xmm3, 1
272             psrad xmm7, 1
273             psubd xmm1, xmm3
274             psubd xmm5, xmm7
275 
276             paddd xmm2, xmm1            // b += r
277             paddd xmm6, xmm5
278 
279             pslld xmm0, 8
280             pslld xmm2, 16
281             pslld xmm4, 8
282             pslld xmm6, 16
283             por xmm0, xmm1
284             por xmm4, xmm5
285             por xmm0, xmm2
286             por xmm4, xmm6
287 
288             movdqa [esp + DISP + 64 * 3 + 16 * 0], xmm0
289             pslld xmm0, 8
290             movdqa [esp + DISP + 64 * 3 + 16 * 1], xmm4
291             pslld xmm4, 8
292             movdqa [esp + DISP + 64 * 3 + 16 * 2], xmm0
293             movdqa [esp + DISP + 64 * 3 + 16 * 3], xmm4
294 
295             //================================
296             // RGBX32 -> RGB24
297             mov eax, [esp + DISP + 64 * 0 + 4]      // ..B1G1R1
298             mov ecx, [esp + DISP + 64 * 0 + 32]     // B0G0R0..
299             shld eax, ecx, 24                       // R1B0G0R0
300             mov [edi + ebx + 0], eax
301             mov eax, [esp + DISP + 64 * 0 + 20]     // ..B5G5R5
302             mov ecx, [esp + DISP + 64 * 0 + 36]     // B1G1R1..
303             shld eax, ecx, 16                       // G5R5B1G1
304             mov [edi + ebx + 4], eax
305             mov eax, [esp + DISP + 64 * 0 + 16]     // ..B4G4R4
306             mov ecx, [esp + DISP + 64 * 0 + 52]     // B5G5R5..
307             shld eax, ecx, 8                        // B4G4R4B5
308             mov [edi + ebx + 8], eax
309             add edi, edx                // $edi = pbRGB += cbRGB
310 
311             mov eax, [esp + DISP + 64 * 0 + 4 + 8]  // ..B3G3R3
312             mov ecx, [esp + DISP + 64 * 0 + 32 + 8] // B2G2R2..
313             shld eax, ecx, 24                       // R3B2G2R2
314             mov [edi + ebx + 0], eax
315             mov eax, [esp + DISP + 64 * 0 + 20 + 8] // ..B7G7R7
316             mov ecx, [esp + DISP + 64 * 0 + 36 + 8] // B3G3R3..
317             shld eax, ecx, 16                       // G7R7B3G3
318             mov [edi + ebx + 4], eax
319             mov eax, [esp + DISP + 64 * 0 + 16 + 8] // ..B6G6R6
320             mov ecx, [esp + DISP + 64 * 0 + 52 + 8] // B7G7R7..
321             shld eax, ecx, 8                        // B6G6R6B7
322             mov [edi + ebx + 8], eax
323             add edi, edx                // $edi = pbRGB += cbRGB
324 
325             // RGBX32 -> RGB24
326             mov eax, [esp + DISP + 64 * 1 + 4 + 8]  // ..B3G3R3
327             mov ecx, [esp + DISP + 64 * 1 + 32 + 8] // B2G2R2..
328             shld eax, ecx, 24                       // R3B2G2R2
329             mov [edi + ebx + 0], eax
330             mov eax, [esp + DISP + 64 * 1 + 20 + 8] // ..B7G7R7
331             mov ecx, [esp + DISP + 64 * 1 + 36 + 8] // B3G3R3..
332             shld eax, ecx, 16                       // G7R7B3G3
333             mov [edi + ebx + 4], eax
334             mov eax, [esp + DISP + 64 * 1 + 16 + 8] // ..B6G6R6
335             mov ecx, [esp + DISP + 64 * 1 + 52 + 8] // B7G7R7..
336             shld eax, ecx, 8                        // B6G6R6B7
337             mov [edi + ebx + 8], eax
338             add edi, edx                // $edi = pbRGB += cbRGB
339 
340             mov eax, [esp + DISP + 64 * 1 + 4]  // ..B1G1R1
341             mov ecx, [esp + DISP + 64 * 1 + 32] // B0G0R0..
342             shld eax, ecx, 24                   // R1B0G0R0
343             mov [edi + ebx + 0], eax
344             mov eax, [esp + DISP + 64 * 1 + 20] // ..B5G5R5
345             mov ecx, [esp + DISP + 64 * 1 + 36] // B1G1R1..
346             shld eax, ecx, 16                   // G5R5B1G1
347             mov [edi + ebx + 4], eax
348             mov eax, [esp + DISP + 64 * 1 + 16] // ..B4G4R4
349             mov ecx, [esp + DISP + 64 * 1 + 52] // B5G5R5..
350             shld eax, ecx, 8                    // B4G4R4B5
351             mov [edi + ebx + 8], eax
352             add edi, edx                // $edi = pbRGB += cbRGB
353 
354             // RGBX32 -> RGB24
355             mov eax, [esp + DISP + 64 * 2 + 4]  // ..B1G1R1
356             mov ecx, [esp + DISP + 64 * 2 + 32] // B0G0R0..
357             shld eax, ecx, 24                   // R1B0G0R0
358             mov [edi + ebx + 0], eax
359             mov eax, [esp + DISP + 64 * 2 + 20] // ..B5G5R5
360             mov ecx, [esp + DISP + 64 * 2 + 36] // B1G1R1..
361             shld eax, ecx, 16                   // G5R5B1G1
362             mov [edi + ebx + 4], eax
363             mov eax, [esp + DISP + 64 * 2 + 16] // ..B4G4R4
364             mov ecx, [esp + DISP + 64 * 2 + 52] // B5G5R5..
365             shld eax, ecx, 8                    // B4G4R4B5
366             mov [edi + ebx + 8], eax
367             add edi, edx                // $edi = pbRGB += cbRGB
368 
369             mov eax, [esp + DISP + 64 * 2 + 4 + 8]  // ..B3G3R3
370             mov ecx, [esp + DISP + 64 * 2 + 32 + 8] // B2G2R2..
371             shld eax, ecx, 24                       // R3B2G2R2
372             mov [edi + ebx + 0], eax
373             mov eax, [esp + DISP + 64 * 2 + 20 + 8] // ..B7G7R7
374             mov ecx, [esp + DISP + 64 * 2 + 36 + 8] // B3G3R3..
375             shld eax, ecx, 16                       // G7R7B3G3
376             mov [edi + ebx + 4], eax
377             mov eax, [esp + DISP + 64 * 2 + 16 + 8] // ..B6G6R6
378             mov ecx, [esp + DISP + 64 * 2 + 52 + 8] // B7G7R7..
379             shld eax, ecx, 8                        // B6G6R6B7
380             mov [edi + ebx + 8], eax
381             add edi, edx                // $edi = pbRGB += cbRGB
382 
383             // RGBX32 -> RGB24
384             mov eax, [esp + DISP + 64 * 3 + 4 + 8]  // ..B3G3R3
385             mov ecx, [esp + DISP + 64 * 3 + 32 + 8] // B2G2R2..
386             shld eax, ecx, 24                       // R3B2G2R2
387             mov [edi + ebx + 0], eax
388             mov eax, [esp + DISP + 64 * 3 + 20 + 8] // ..B7G7R7
389             mov ecx, [esp + DISP + 64 * 3 + 36 + 8] // B3G3R3..
390             shld eax, ecx, 16                       // G7R7B3G3
391             mov [edi + ebx + 4], eax
392             mov eax, [esp + DISP + 64 * 3 + 16 + 8] // ..B6G6R6
393             mov ecx, [esp + DISP + 64 * 3 + 52 + 8] // B7G7R7..
394             shld eax, ecx, 8                        // B6G6R6B7
395             mov [edi + ebx + 8], eax
396             add edi, edx                // $edi = pbRGB += cbRGB
397 
398             mov eax, [esp + DISP + 64 * 3 + 4]      // ..B1G1R1
399             mov ecx, [esp + DISP + 64 * 3 + 32]     // B0G0R0..
400             shld eax, ecx, 24                       // R1B0G0R0
401             mov [edi + ebx + 0], eax
402             mov eax, [esp + DISP + 64 * 3 + 20]     // ..B5G5R5
403             mov ecx, [esp + DISP + 64 * 3 + 36]     // B1G1R1..
404             shld eax, ecx, 16                       // G5R5B1G1
405             mov [edi + ebx + 4], eax
406             mov eax, [esp + DISP + 64 * 3 + 16]     // ..B4G4R4
407             mov ecx, [esp + DISP + 64 * 3 + 52]     // B5G5R5..
408             shld eax, ecx, 8                        // B4G4R4B5
409             mov [edi + ebx + 8], eax
410 
411         //================================
412         add esi, 256 - 64
413         add ebx, 12
414         jnz Loop0
415 
416         //================
417         pop esp
418         pop edi
419         pop esi
420         pop ebx
421         pop ebp
422         ret 20
423     }
424 }
425 
outputMBRow_RGB24_Lossless_1(CWMImageStrCodec * pSC)426 Int outputMBRow_RGB24_Lossless_1(CWMImageStrCodec* pSC)
427 {
428 #ifdef REENTRANT_MODE
429     const size_t cHeight = min((pSC->m_Dparam->cROIBottomY + 1) - (pSC->cRow - 1) * 16, 16);
430     const size_t iFirstRow = ((pSC->cRow - 1) * 16 > pSC->m_Dparam->cROITopY ? 0 : (pSC->m_Dparam->cROITopY & 0xf));
431 #endif
432     const size_t cbRGB = pSC->WMIBI.cbStride;
433     const U8* const pbRGB = (U8*)pSC->WMIBI.pv + cbRGB * (pSC->cRow - 1) * 16;
434 
435     U8* const pbY = (U8*)pSC->a0MBbuffer[0];
436     U8* const pbU = (U8*)pSC->a0MBbuffer[1];
437     // U8* const pbV = (U8*)pSC->a0MBbuffer[2];
438 
439     const size_t cmbColumn = (pSC->WMII.cWidth + 15) / 16;
440 
441     assert(BD_8 == pSC->WMII.bdBitDepth);
442     assert(CF_RGB == pSC->WMII.cfColorFormat);
443     assert(24 == pSC->WMII.cBitsPerUnit);
444     assert(pSC->WMII.bRGB);
445     assert(O_NONE == pSC->WMII.oOrientation);
446 
447     assert(YUV_444 == pSC->m_param.cfColorFormat);
448     assert(!pSC->m_param.bScaledArith);
449 
450     assert(pSC->m_Dparam->bDecodeFullFrame);
451 
452     storeRGB24_5(pbY + 64 * 0, pbU - pbY, pbRGB + cbRGB *  0, cbRGB, cmbColumn);
453     storeRGB24_5(pbY + 64 * 2, pbU - pbY, pbRGB + cbRGB *  8, cbRGB, cmbColumn);
454 
455 #ifdef REENTRANT_MODE
456     pSC->WMIBI.cLinesDecoded = cHeight - iFirstRow;
457 #endif
458     return ICERR_OK;
459 }
460 
461 
storeRGB24_3(U8 * pbYCoCg,size_t cbYCoCg,const U8 * pbRGB,size_t cbRGB,size_t cmb,const U8 * Shift)462 __declspec(naked) void __stdcall storeRGB24_3(
463     U8* pbYCoCg,
464     size_t cbYCoCg,
465     const U8* pbRGB,
466     size_t cbRGB,
467     size_t cmb,
468     const U8* Shift)
469 {
470     UNREFERENCED_PARAMETER( pbYCoCg );
471     UNREFERENCED_PARAMETER( cbYCoCg );
472     UNREFERENCED_PARAMETER( pbRGB );
473     UNREFERENCED_PARAMETER( cbRGB );
474     UNREFERENCED_PARAMETER( cmb );
475     UNREFERENCED_PARAMETER( Shift );
476     __asm {
477         push ebp
478         push ebx
479         push esi
480         push edi
481 
482         mov ecx, [esp + 40]         // $ecx = Shift
483         mov ebx, [esp + 36]         // $ebx = cmb
484         mov edi, [esp + 28]         // $edi = pbRGB
485         lea ebx, [ebx + ebx * 2]    // $ebx = cmb * 3
486         mov edx, [esp + 32]         // $edx = cbRGB
487         shl ebx, 4                  // $ebx = cmb * 3 * 16
488         mov esi, [esp + 20]         // $esi = pbYCoCg
489         add edi, ebx                // $edi = pbRGB + 3 * 16 * cmb
490         mov ebp, [esp + 24]         // $ebp = cbYCoCg
491         neg ebx
492 
493         mov eax, esp
494         and esp, 0xffffff80
495         sub esp, 320
496 
497         mov [esp], eax              // original $esp
498         mov [esp + 4], edi
499         mov [esp + 8], ecx
500     }
501 Loop0:
502     __asm {
503         mov edi, [esp + 4]          // $edi = pbRGB + 3 * 16 * cmb
504 
505             //================
506             // first 8 pixels
507             movdqa xmm0, [esi]
508             movdqa xmm4, [esi + 16]
509             movdqa xmm3, [esi + ebp]
510             movdqa xmm7, [esi + ebp + 16]
511             movdqa xmm2, [esi + ebp * 2]
512             movdqa xmm6, [esi + ebp * 2 + 16]
513 
514             mov ecx, [esp + 8]
515             movdqa xmm1, [ecx]
516             movdqa xmm5, [g_const_d0x80]
517             pslld xmm5, xmm1
518             paddd xmm5, xmm1
519             paddd xmm0, xmm5            // bias
520             paddd xmm4, xmm5            // bias
521             pxor xmm1, xmm1
522             pxor xmm5, xmm5
523             psubd xmm1, xmm3
524             psubd xmm5, xmm7
525 
526             // ICC
527             movdqa xmm3, xmm1           // g -= r >> 1
528             movdqa xmm7, xmm5
529             psrad xmm3, 1
530             psrad xmm7, 1
531             psubd xmm0, xmm3
532             psubd xmm4, xmm7
533 
534             movdqa xmm3, [g_const_d1]   // r -= ((b + 1) >> 1) - g
535             movdqa xmm7, [g_const_d1]
536             paddd xmm3, xmm2
537             paddd xmm7, xmm6
538             paddd xmm1, xmm0
539             paddd xmm5, xmm4
540             psrad xmm3, 1
541             psrad xmm7, 1
542             psubd xmm1, xmm3
543             psubd xmm5, xmm7
544 
545             paddd xmm2, xmm1            // b += r
546             paddd xmm6, xmm5
547 
548             // clip
549             movdqa xmm3, [g_const_w0x80]
550             packssdw xmm0, xmm4
551             packssdw xmm1, xmm5
552             packssdw xmm2, xmm6
553 
554             mov ecx, [esp + 8]
555             movdqa xmm4, [ecx]
556             psraw xmm0, xmm4
557             psraw xmm1, xmm4
558             psraw xmm2, xmm4
559 
560             psubw xmm0, xmm3
561             psubw xmm1, xmm3
562             psubw xmm2, xmm3
563 
564             movdqa [esp + 16], xmm0
565             movdqa [esp + 32], xmm1
566             movdqa [esp + 48], xmm2
567 
568             //================
569             // second 8 pixels
570             movdqa xmm0, [esi + 32]
571             movdqa xmm4, [esi + 48]
572             movdqa xmm3, [esi + ebp + 32]
573             movdqa xmm7, [esi + ebp + 48]
574             movdqa xmm2, [esi + ebp * 2 + 32]
575             movdqa xmm6, [esi + ebp * 2 + 48]
576 
577             mov ecx, [esp + 8]
578             movdqa xmm1, [ecx]
579             movdqa xmm5, [g_const_d0x80]
580             pslld xmm5, xmm1
581             paddd xmm5, xmm1
582             paddd xmm0, xmm5            // bias
583             paddd xmm4, xmm5            // bias
584             pxor xmm1, xmm1
585             pxor xmm5, xmm5
586             psubd xmm1, xmm3
587             psubd xmm5, xmm7
588 
589             // ICC
590             movdqa xmm3, xmm1           // g -= r >> 1
591             movdqa xmm7, xmm5
592             psrad xmm3, 1
593             psrad xmm7, 1
594             psubd xmm0, xmm3
595             psubd xmm4, xmm7
596 
597             movdqa xmm3, [g_const_d1]   // r -= ((b + 1) >> 1) - g
598             movdqa xmm7, [g_const_d1]
599             paddd xmm3, xmm2
600             paddd xmm7, xmm6
601             paddd xmm1, xmm0
602             paddd xmm5, xmm4
603             psrad xmm3, 1
604             psrad xmm7, 1
605             psubd xmm1, xmm3
606             psubd xmm5, xmm7
607 
608             paddd xmm2, xmm1            // b += r
609             paddd xmm6, xmm5
610 
611             // clip
612             movdqa xmm3, [g_const_w0x80]
613             packssdw xmm0, xmm4
614             packssdw xmm1, xmm5
615             packssdw xmm2, xmm6
616 
617             mov ecx, [esp + 8]
618             movdqa xmm4, [ecx]
619             psraw xmm0, xmm4
620             psraw xmm1, xmm4
621             psraw xmm2, xmm4
622 
623             psubw xmm0, xmm3
624             psubw xmm1, xmm3
625             psubw xmm2, xmm3
626 
627             //================
628             // 16 pixels
629             movdqa xmm3, [g_const_b0x80]
630             packsswb xmm0, [esp + 16]
631             packsswb xmm1, [esp + 32]
632             packsswb xmm2, [esp + 48]
633 
634             psubb xmm0, xmm3
635             psubb xmm1, xmm3
636             psubb xmm2, xmm3
637 
638             pxor xmm7, xmm7
639             movdqa xmm4, xmm0
640             movdqa xmm5, xmm1
641             movdqa xmm6, xmm2
642 
643             punpckhbw xmm0, xmm7
644             punpckhbw xmm1, xmm7
645             punpckhbw xmm2, xmm7
646             punpcklbw xmm4, xmm7
647             punpcklbw xmm5, xmm7
648             punpcklbw xmm6, xmm7
649 
650             // spill second 8 pixels
651             movdqa [esp + 16], xmm4
652             movdqa [esp + 32], xmm5
653             movdqa [esp + 48], xmm6
654 
655             // first 8 pixels
656             movdqa xmm4, xmm0
657             movdqa xmm5, xmm1
658             movdqa xmm6, xmm2
659 
660             punpcklwd xmm0, xmm7
661             punpcklwd xmm1, xmm7
662             punpcklwd xmm2, xmm7
663 
664             punpckhwd xmm4, xmm7
665             punpckhwd xmm5, xmm7
666             punpckhwd xmm6, xmm7
667 
668             pslld xmm0, 8
669             pslld xmm2, 16
670             pslld xmm4, 8
671             pslld xmm6, 16
672 
673             por xmm0, xmm1
674             por xmm4, xmm5
675             por xmm0, xmm2
676             por xmm4, xmm6
677 
678             movdqa [esp + 64], xmm0
679             pslld xmm0, 8
680             movdqa [esp + 80], xmm4
681             pslld xmm4, 8
682             movdqa [esp + 96], xmm0
683             movdqa [esp + 112], xmm4
684 
685             // second 8 pixels
686             movdqa xmm0, [esp + 16]
687             movdqa xmm1, [esp + 32]
688             movdqa xmm2, [esp + 48]
689             movdqa xmm4, xmm0
690             movdqa xmm5, xmm1
691             movdqa xmm6, xmm2
692 
693             punpcklwd xmm0, xmm7
694             punpcklwd xmm1, xmm7
695             punpcklwd xmm2, xmm7
696             punpckhwd xmm4, xmm7
697             punpckhwd xmm5, xmm7
698             punpckhwd xmm6, xmm7
699 
700             pslld xmm0, 8
701             pslld xmm2, 16
702             pslld xmm4, 8
703             pslld xmm6, 16
704             por xmm0, xmm1
705             por xmm4, xmm5
706             por xmm0, xmm2
707             por xmm4, xmm6
708 
709             movdqa [esp + 128], xmm0
710             pslld xmm0, 8
711             movdqa [esp + 144], xmm4
712             pslld xmm4, 8
713             movdqa [esp + 160], xmm0
714             movdqa [esp + 176], xmm4
715 
716         //================================
717         add esi, 64
718 
719             //================
720             // first 8 pixels
721             movdqa xmm0, [esi]
722             movdqa xmm4, [esi + 16]
723             movdqa xmm3, [esi + ebp]
724             movdqa xmm7, [esi + ebp + 16]
725             movdqa xmm2, [esi + ebp * 2]
726             movdqa xmm6, [esi + ebp * 2 + 16]
727 
728             mov ecx, [esp + 8]
729             movdqa xmm1, [ecx]
730             movdqa xmm5, [g_const_d0x80]
731             pslld xmm5, xmm1
732             paddd xmm5, xmm1
733             paddd xmm0, xmm5            // bias
734             paddd xmm4, xmm5            // bias
735             pxor xmm1, xmm1
736             pxor xmm5, xmm5
737             psubd xmm1, xmm3
738             psubd xmm5, xmm7
739 
740             // ICC
741             movdqa xmm3, xmm1           // g -= r >> 1
742             movdqa xmm7, xmm5
743             psrad xmm3, 1
744             psrad xmm7, 1
745             psubd xmm0, xmm3
746             psubd xmm4, xmm7
747 
748             movdqa xmm3, [g_const_d1]   // r -= ((b + 1) >> 1) - g
749             movdqa xmm7, [g_const_d1]
750             paddd xmm3, xmm2
751             paddd xmm7, xmm6
752             paddd xmm1, xmm0
753             paddd xmm5, xmm4
754             psrad xmm3, 1
755             psrad xmm7, 1
756             psubd xmm1, xmm3
757             psubd xmm5, xmm7
758 
759             paddd xmm2, xmm1            // b += r
760             paddd xmm6, xmm5
761 
762             // clip
763             movdqa xmm3, [g_const_w0x80]
764             packssdw xmm0, xmm4
765             packssdw xmm1, xmm5
766             packssdw xmm2, xmm6
767 
768             mov ecx, [esp + 8]
769             movdqa xmm4, [ecx]
770             psraw xmm0, xmm4
771             psraw xmm1, xmm4
772             psraw xmm2, xmm4
773 
774             psubw xmm0, xmm3
775             psubw xmm1, xmm3
776             psubw xmm2, xmm3
777 
778             movdqa [esp + 16], xmm0
779             movdqa [esp + 32], xmm1
780             movdqa [esp + 48], xmm2
781 
782             //================
783             // second 8 pixels
784             movdqa xmm0, [esi + 32]
785             movdqa xmm4, [esi + 48]
786             movdqa xmm3, [esi + ebp + 32]
787             movdqa xmm7, [esi + ebp + 48]
788             movdqa xmm2, [esi + ebp * 2 + 32]
789             movdqa xmm6, [esi + ebp * 2 + 48]
790 
791             mov ecx, [esp + 8]
792             movdqa xmm1, [ecx]
793             movdqa xmm5, [g_const_d0x80]
794             pslld xmm5, xmm1
795             paddd xmm5, xmm1
796             paddd xmm0, xmm5            // bias
797             paddd xmm4, xmm5            // bias
798             pxor xmm1, xmm1
799             pxor xmm5, xmm5
800             psubd xmm1, xmm3
801             psubd xmm5, xmm7
802 
803             // ICC
804             movdqa xmm3, xmm1           // g -= r >> 1
805             movdqa xmm7, xmm5
806             psrad xmm3, 1
807             psrad xmm7, 1
808             psubd xmm0, xmm3
809             psubd xmm4, xmm7
810 
811             movdqa xmm3, [g_const_d1]   // r -= ((b + 1) >> 1) - g
812             movdqa xmm7, [g_const_d1]
813             paddd xmm3, xmm2
814             paddd xmm7, xmm6
815             paddd xmm1, xmm0
816             paddd xmm5, xmm4
817             psrad xmm3, 1
818             psrad xmm7, 1
819             psubd xmm1, xmm3
820             psubd xmm5, xmm7
821 
822             paddd xmm2, xmm1            // b += r
823             paddd xmm6, xmm5
824 
825             // clip
826             movdqa xmm3, [g_const_w0x80]
827             packssdw xmm0, xmm4
828             packssdw xmm1, xmm5
829             packssdw xmm2, xmm6
830 
831             mov ecx, [esp + 8]
832             movdqa xmm4, [ecx]
833             psraw xmm0, xmm4
834             psraw xmm1, xmm4
835             psraw xmm2, xmm4
836 
837             psubw xmm0, xmm3
838             psubw xmm1, xmm3
839             psubw xmm2, xmm3
840 
841             //================
842             // 16 pixels
843             movdqa xmm3, [g_const_b0x80]
844             packsswb xmm0, [esp + 16]
845             packsswb xmm1, [esp + 32]
846             packsswb xmm2, [esp + 48]
847 
848             psubb xmm0, xmm3
849             psubb xmm1, xmm3
850             psubb xmm2, xmm3
851 
852             pxor xmm7, xmm7
853             movdqa xmm4, xmm0
854             movdqa xmm5, xmm1
855             movdqa xmm6, xmm2
856 
857             punpckhbw xmm0, xmm7
858             punpckhbw xmm1, xmm7
859             punpckhbw xmm2, xmm7
860             punpcklbw xmm4, xmm7
861             punpcklbw xmm5, xmm7
862             punpcklbw xmm6, xmm7
863 
864             // spill second 8 pixels
865             movdqa [esp + 16], xmm4
866             movdqa [esp + 32], xmm5
867             movdqa [esp + 48], xmm6
868 
869             // first 8 pixels
870             movdqa xmm4, xmm0
871             movdqa xmm5, xmm1
872             movdqa xmm6, xmm2
873 
874             punpcklwd xmm0, xmm7
875             punpcklwd xmm1, xmm7
876             punpcklwd xmm2, xmm7
877 
878             punpckhwd xmm4, xmm7
879             punpckhwd xmm5, xmm7
880             punpckhwd xmm6, xmm7
881 
882             pslld xmm0, 8
883             pslld xmm2, 16
884             pslld xmm4, 8
885             pslld xmm6, 16
886 
887             por xmm0, xmm1
888             por xmm4, xmm5
889             por xmm0, xmm2
890             por xmm4, xmm6
891 
892             movdqa [esp + 192], xmm0
893             pslld xmm0, 8
894             movdqa [esp + 208], xmm4
895             pslld xmm4, 8
896             movdqa [esp + 224], xmm0
897             movdqa [esp + 240], xmm4
898 
899             // second 8 pixels
900             movdqa xmm0, [esp + 16]
901             movdqa xmm1, [esp + 32]
902             movdqa xmm2, [esp + 48]
903             movdqa xmm4, xmm0
904             movdqa xmm5, xmm1
905             movdqa xmm6, xmm2
906 
907             punpcklwd xmm0, xmm7
908             punpcklwd xmm1, xmm7
909             punpcklwd xmm2, xmm7
910             punpckhwd xmm4, xmm7
911             punpckhwd xmm5, xmm7
912             punpckhwd xmm6, xmm7
913 
914             pslld xmm0, 8
915             pslld xmm2, 16
916             pslld xmm4, 8
917             pslld xmm6, 16
918             por xmm0, xmm1
919             por xmm4, xmm5
920             por xmm0, xmm2
921             por xmm4, xmm6
922 
923             movdqa [esp + 256], xmm0
924             pslld xmm0, 8
925             movdqa [esp + 272], xmm4
926             pslld xmm4, 8
927             movdqa [esp + 288], xmm0
928             movdqa [esp + 304], xmm4
929 
930             // RGBX32 -> RGB24
931             mov eax, [esp + 68]         // ..B1G1R1
932             mov ecx, [esp + 96]         // B0G0R0..
933             shld eax, ecx, 24           // R1B0G0R0
934             mov [edi + ebx + 0], eax
935             mov eax, [esp + 84]         // ..B5G5R5
936             mov ecx, [esp + 100]        // B1G1R1..
937             shld eax, ecx, 16           // G5R5B1G1
938             mov [edi + ebx + 4], eax
939             mov eax, [esp + 80]         // ..B4G4R4
940             mov ecx, [esp + 116]        // B5G5R5..
941             shld eax, ecx, 8            // B4G4R4B5
942             mov [edi + ebx + 8], eax
943             add edi, edx                // $edi = pbRGB += cbRGB
944 
945             mov eax, [esp + 76]         // ..B3G3R3
946             mov ecx, [esp + 104]        // B2G2R2..
947             shld eax, ecx, 24           // R3B2G2R2
948             mov [edi + ebx + 0], eax
949             mov eax, [esp + 92]         // ..B7G7R7
950             mov ecx, [esp + 108]        // B3G3R3..
951             shld eax, ecx, 16           // G7R7B3G3
952             mov [edi + ebx + 4], eax
953             mov eax, [esp + 88]         // ..B6G6R6
954             mov ecx, [esp + 124]        // B7G7R7..
955             shld eax, ecx, 8            // B6G6R6B7
956             mov [edi + ebx + 8], eax
957             add edi, edx                // $edi = pbRGB += cbRGB
958 
959             // RGBX32 -> RGB24
960             mov eax, [esp + 140]        // ..B3G3R3
961             mov ecx, [esp + 168]        // B2G2R2..
962             shld eax, ecx, 24           // R3B2G2R2
963             mov [edi + ebx + 0], eax
964             mov eax, [esp + 156]        // ..B7G7R7
965             mov ecx, [esp + 172]        // B3G3R3..
966             shld eax, ecx, 16           // G7R7B3G3
967             mov [edi + ebx + 4], eax
968             mov eax, [esp + 152]        // ..B6G6R6
969             mov ecx, [esp + 188]        // B7G7R7..
970             shld eax, ecx, 8            // B6G6R6B7
971             mov [edi + ebx + 8], eax
972             add edi, edx                // $edi = pbRGB += cbRGB
973 
974             mov eax, [esp + 132]        // ..B1G1R1
975             mov ecx, [esp + 160]        // B0G0R0..
976             shld eax, ecx, 24           // R1B0G0R0
977             mov [edi + ebx + 0], eax
978             mov eax, [esp + 148]        // ..B5G5R5
979             mov ecx, [esp + 164]        // B1G1R1..
980             shld eax, ecx, 16           // G5R5B1G1
981             mov [edi + ebx + 4], eax
982             mov eax, [esp + 144]        // ..B4G4R4
983             mov ecx, [esp + 180]        // B5G5R5..
984             shld eax, ecx, 8            // B4G4R4B5
985             mov [edi + ebx + 8], eax
986             add edi, edx                // $edi = pbRGB += cbRGB
987 
988             // RGBX32 -> RGB24
989             mov eax, [esp + 196]        // ..B1G1R1
990             mov ecx, [esp + 224]        // B0G0R0..
991             shld eax, ecx, 24           // R1B0G0R0
992             mov [edi + ebx + 0], eax
993             mov eax, [esp + 212]        // ..B5G5R5
994             mov ecx, [esp + 228]        // B1G1R1..
995             shld eax, ecx, 16           // G5R5B1G1
996             mov [edi + ebx + 4], eax
997             mov eax, [esp + 208]        // ..B4G4R4
998             mov ecx, [esp + 244]        // B5G5R5..
999             shld eax, ecx, 8            // B4G4R4B5
1000             mov [edi + ebx + 8], eax
1001             add edi, edx                // $edi = pbRGB += cbRGB
1002 
1003             mov eax, [esp + 204]        // ..B3G3R3
1004             mov ecx, [esp + 232]        // B2G2R2..
1005             shld eax, ecx, 24           // R3B2G2R2
1006             mov [edi + ebx + 0], eax
1007             mov eax, [esp + 220]        // ..B7G7R7
1008             mov ecx, [esp + 236]        // B3G3R3..
1009             shld eax, ecx, 16           // G7R7B3G3
1010             mov [edi + ebx + 4], eax
1011             mov eax, [esp + 216]        // ..B6G6R6
1012             mov ecx, [esp + 252]        // B7G7R7..
1013             shld eax, ecx, 8            // B6G6R6B7
1014             mov [edi + ebx + 8], eax
1015             add edi, edx                // $edi = pbRGB += cbRGB
1016 
1017             // RGBX32 -> RGB24
1018             mov eax, [esp + 268]         // ..B3G3R3
1019             mov ecx, [esp + 296]        // B2G2R2..
1020             shld eax, ecx, 24           // R3B2G2R2
1021             mov [edi + ebx + 0], eax
1022             mov eax, [esp + 284]         // ..B7G7R7
1023             mov ecx, [esp + 300]        // B3G3R3..
1024             shld eax, ecx, 16           // G7R7B3G3
1025             mov [edi + ebx + 4], eax
1026             mov eax, [esp + 280]         // ..B6G6R6
1027             mov ecx, [esp + 316]        // B7G7R7..
1028             shld eax, ecx, 8            // B6G6R6B7
1029             mov [edi + ebx + 8], eax
1030             add edi, edx                // $edi = pbRGB += cbRGB
1031 
1032             mov eax, [esp + 260]         // ..B1G1R1
1033             mov ecx, [esp + 288]         // B0G0R0..
1034             shld eax, ecx, 24           // R1B0G0R0
1035             mov [edi + ebx + 0], eax
1036             mov eax, [esp + 276]         // ..B5G5R5
1037             mov ecx, [esp + 292]        // B1G1R1..
1038             shld eax, ecx, 16           // G5R5B1G1
1039             mov [edi + ebx + 4], eax
1040             mov eax, [esp + 272]         // ..B4G4R4
1041             mov ecx, [esp + 308]        // B5G5R5..
1042             shld eax, ecx, 8            // B4G4R4B5
1043             mov [edi + ebx + 8], eax
1044             add edi, edx                // $edi = pbRGB += cbRGB
1045 
1046         //================================
1047         add esi, 256 - 64
1048         add ebx, 12
1049         jnz Loop0
1050 
1051         //================
1052         pop esp
1053         pop edi
1054         pop esi
1055         pop ebx
1056         pop ebp
1057         ret 24
1058     }
1059 }
1060 
outputMBRow_RGB24_Lossy_3(CWMImageStrCodec * pSC)1061 Int outputMBRow_RGB24_Lossy_3(CWMImageStrCodec* pSC)
1062 {
1063 #ifdef REENTRANT_MODE
1064     const size_t cHeight = min((pSC->m_Dparam->cROIBottomY + 1) - (pSC->cRow - 1) * 16, 16);
1065     const size_t iFirstRow = ((pSC->cRow - 1) * 16 > pSC->m_Dparam->cROITopY ? 0 : (pSC->m_Dparam->cROITopY & 0xf));
1066 #endif
1067     const size_t cbRGB = pSC->WMIBI.cbStride;
1068     const U8* const pbRGB = (U8*)pSC->WMIBI.pv + cbRGB * (pSC->cRow - 1) * 16;
1069 
1070     U8* const pbY = (U8*)pSC->a0MBbuffer[0];
1071     U8* const pbU = (U8*)pSC->a0MBbuffer[1];
1072     // U8* const pbV = (U8*)pSC->a0MBbuffer[2];
1073 
1074     const size_t cmbColumn = (pSC->WMII.cWidth + 15) / 16;
1075 
1076     __declspec(align(16)) U8 Shift[16];
1077 
1078     assert(BD_8 == pSC->WMII.bdBitDepth);
1079     assert(CF_RGB == pSC->WMII.cfColorFormat);
1080     assert(24 == pSC->WMII.cBitsPerUnit);
1081     assert(pSC->WMII.bRGB);
1082     assert(O_NONE == pSC->WMII.oOrientation);
1083 
1084     assert(YUV_444 == pSC->m_param.cfColorFormat);
1085 
1086     assert(pSC->m_Dparam->bDecodeFullFrame);
1087 
1088     _mm_store_si128((__m128i *) Shift, pSC->m_param.bScaledArith ? g_const_d3 : g_const_d0);
1089     storeRGB24_3(pbY + 64 * 0, pbU - pbY, pbRGB + cbRGB *  0, cbRGB, cmbColumn,
1090         Shift);
1091     storeRGB24_3(pbY + 64 * 2, pbU - pbY, pbRGB + cbRGB *  8, cbRGB, cmbColumn,
1092         Shift);
1093 
1094 #ifdef REENTRANT_MODE
1095     pSC->WMIBI.cLinesDecoded = cHeight - iFirstRow;
1096 #endif
1097     return ICERR_OK;
1098 }
1099 #endif
1100 
1101 //================================================================
1102 #if defined(WMP_OPT_TRFM_DEC)
strDCT2x2up_OPT(PixelI * pa,PixelI * pb,PixelI * pc,PixelI * pd)1103 FORCE_INLINE Void strDCT2x2up_OPT(PixelI *pa, PixelI *pb, PixelI *pc, PixelI *pd)
1104 {
1105     PixelI a, b, c, d, C, t;
1106     a = *pa;
1107     b = *pb;
1108     C = *pc;
1109     d = *pd;
1110 
1111     a += d;
1112     b -= C;
1113     t = ((a - b + 1) >> 1);
1114     c = t - d;
1115     d = t - C;
1116     a -= d;
1117     b += c;
1118 
1119     *pa = a;
1120     *pb = b;
1121     *pc = c;
1122     *pd = d;
1123 }
1124 
invOdd_OPT(PixelI * pa,PixelI * pb,PixelI * pc,PixelI * pd)1125 FORCE_INLINE Void invOdd_OPT(PixelI *pa, PixelI *pb, PixelI *pc, PixelI *pd)
1126 {
1127     PixelI a, b, c, d;
1128     a = *pa;
1129     b = *pb;
1130     c = *pc;
1131     d = *pd;
1132 
1133     /** butterflies **/
1134     b += d;
1135     a -= c;
1136     d -= (b) >> 1;
1137     c += (a + 1) >> 1;
1138 
1139     /** rotate pi/8 **/
1140 #define IROTATE2(a, b) (a) -= (((b)*3 + 4) >> 3), (b) += (((a)*3 + 4) >> 3)
1141     IROTATE2(a, b);
1142     IROTATE2(c, d);
1143 
1144     /** butterflies **/
1145     c -= (b + 1) >> 1;
1146     d = ((a + 1) >> 1) - d;
1147     b += c;
1148     a -= d;
1149 
1150     *pa = a;
1151     *pb = b;
1152     *pc = c;
1153     *pd = d;
1154 }
1155 
invOddOdd_OPT(PixelI * pa,PixelI * pb,PixelI * pc,PixelI * pd)1156 FORCE_INLINE Void invOddOdd_OPT(PixelI* pa, PixelI* pb, PixelI* pc, PixelI* pd)
1157 {
1158     PixelI a, b, c, d, t1, t2;
1159     a = *pa;
1160     b = *pb;
1161     c = *pc;
1162     d = *pd;
1163 
1164     /** butterflies **/
1165     d += a;
1166     c -= b;
1167     a -= (t1 = d >> 1);
1168     b += (t2 = c >> 1);
1169 
1170     /** rotate pi/4 **/
1171     a -= (b * 3 + 3) >> 3;
1172     b += (a * 3 + 3) >> 2;
1173     a -= (b * 3 + 4) >> 3;
1174 
1175     /** butterflies **/
1176     b -= t2;
1177     a += t1;
1178     c += b;
1179     d -= a;
1180 
1181     /** sign flips **/
1182     *pa = a;
1183     *pb = -b;
1184     *pc = -c;
1185     *pd = d;
1186 }
1187 
strDCT2x2dn_SSE2_1(PixelI * p)1188 FORCE_INLINE Void strDCT2x2dn_SSE2_1(PixelI* p)
1189 {
1190     __m128i* const pdq = (__m128i*)p;
1191     __m128i a = pdq[0];
1192     __m128i b = pdq[1];
1193     const __m128i C = pdq[2];
1194     __m128i d = pdq[3];
1195     __m128i t;
1196     __m128i c;
1197 
1198     a = _mm_add_epi32(a, d);
1199     b = _mm_sub_epi32(b, C);
1200     t = _mm_sub_epi32(a, b);
1201     t = _mm_srai_epi32(t, 1);
1202     c = _mm_sub_epi32(t, d);
1203     d = _mm_sub_epi32(t, C);
1204     a = _mm_sub_epi32(a, d);
1205     b = _mm_add_epi32(b, c);
1206 
1207     pdq[0] = a;
1208     pdq[1] = b;
1209     pdq[2] = c;
1210     pdq[3] = d;
1211 }
1212 
strIDCT4x4Stage1_OPT_H1(PixelI * p)1213 Void strIDCT4x4Stage1_OPT_H1(PixelI* p)
1214 {
1215     /** top left corner, butterfly => butterfly **/
1216     strDCT2x2up_OPT(p + 0, p + 1, p + 2, p + 3);
1217 
1218     /** top right corner, -pi/8 rotation => butterfly **/
1219     invOdd_OPT(p + 5, p + 4, p + 7, p + 6);
1220 
1221     /** bottom left corner, butterfly => -pi/8 rotation **/
1222     invOdd_OPT(p + 10, p + 8, p + 11, p + 9);
1223 
1224     /** bottom right corner, -pi/8 rotation => -pi/8 rotation **/
1225     invOddOdd_OPT(p + 15, p + 14, p + 13, p + 12);
1226 }
1227 
// Stage-1 helper, second phase: one 2x2 "down" butterfly applied across
// four 32-bit lanes at a time over the 16-coefficient tile at p.
// Thin wrapper so the two phases of strIDCT4x4Stage1_OPT5 read uniformly.
FORCE_INLINE Void strIDCT4x4Stage1_OPT_H2(PixelI* p)
{
    /** butterfly **/
    strDCT2x2dn_SSE2_1(p);
}
1233 
// First-level stage-1 inverse transform over one macroblock span.
// Phase 1 runs the scalar 2x2 corner kernels (H1) on sixteen 4x4 tiles
// split between the p0 and p1 buffers; phase 2 runs the SSE2 "down"
// butterfly (H2) on the same sixteen tiles.
// The prefetches are deliberately interleaved with the H1 work to pull
// upcoming tile data into cache ahead of use -- keep the statement order.
// NOTE(review): the fixed tile offsets (in PixelI units, 16 per tile)
// encode the macroblock buffer layout assumed by the caller; confirm
// against the buffer allocation before changing any of them.
Void strIDCT4x4Stage1_OPT5(PixelI* p0, PixelI* p1)
{
    _mm_prefetch((char*)(p0 - 96 + 256), _MM_HINT_T0);
    strIDCT4x4Stage1_OPT_H1(p0 - 96);
    strIDCT4x4Stage1_OPT_H1(p0 - 80);
    strIDCT4x4Stage1_OPT_H1(p0 - 32);
    strIDCT4x4Stage1_OPT_H1(p0 - 16);

    _mm_prefetch((char*)(p0 - 32 + 256), _MM_HINT_T0);
    strIDCT4x4Stage1_OPT_H1(p0 + 32);
    strIDCT4x4Stage1_OPT_H1(p0 + 48);
    strIDCT4x4Stage1_OPT_H1(p0 + 96);
    strIDCT4x4Stage1_OPT_H1(p0 + 112);

    _mm_prefetch((char*)(p0 + 32 + 256), _MM_HINT_T0);
    strIDCT4x4Stage1_OPT_H1(p1 - 128);
    strIDCT4x4Stage1_OPT_H1(p1 - 112);
    strIDCT4x4Stage1_OPT_H1(p1 - 64);
    strIDCT4x4Stage1_OPT_H1(p1 - 48);

    _mm_prefetch((char*)(p0 + 96 + 256), _MM_HINT_T0);
    strIDCT4x4Stage1_OPT_H1(p1 + 0);
    strIDCT4x4Stage1_OPT_H1(p1 + 16);
    strIDCT4x4Stage1_OPT_H1(p1 + 64);
    strIDCT4x4Stage1_OPT_H1(p1 + 80);

    // Phase 2: same sixteen tiles, vectorized butterfly.
    strIDCT4x4Stage1_OPT_H2(p0 - 96);
    strIDCT4x4Stage1_OPT_H2(p0 - 80);
    strIDCT4x4Stage1_OPT_H2(p0 - 32);
    strIDCT4x4Stage1_OPT_H2(p0 - 16);
    strIDCT4x4Stage1_OPT_H2(p0 + 32);
    strIDCT4x4Stage1_OPT_H2(p0 + 48);
    strIDCT4x4Stage1_OPT_H2(p0 + 96);
    strIDCT4x4Stage1_OPT_H2(p0 + 112);

    strIDCT4x4Stage1_OPT_H2(p1 - 128);
    strIDCT4x4Stage1_OPT_H2(p1 - 112);
    strIDCT4x4Stage1_OPT_H2(p1 - 64);
    strIDCT4x4Stage1_OPT_H2(p1 - 48);
    strIDCT4x4Stage1_OPT_H2(p1 + 0);
    strIDCT4x4Stage1_OPT_H2(p1 + 16);
    strIDCT4x4Stage1_OPT_H2(p1 + 64);
    strIDCT4x4Stage1_OPT_H2(p1 + 80);
}
1278 
1279 //================================
//================================
// strPost4x4Stage1_alternate_ASM5
// First-level inverse overlap post-filter (4x4 stage 1), hand-written
// x86/SSE2 inline assembly. __declspec(naked): no compiler prologue or
// epilogue is emitted, so the routine saves/restores registers itself and
// clears its two __stdcall pointer arguments with `ret 4 * 2`.
// It first builds a 128-byte table of 16 pointer pairs on the stack, then
// loops over the pairs, filtering four 4-lane coefficient vectors per
// pair. NOTE(review): the exact push order below encodes which tile pairs
// are filtered -- do not reorder any push/sub in the table build.
__declspec(naked) void __stdcall strPost4x4Stage1_alternate_ASM5(PixelI* p0, PixelI* p1)
{
    UNREFERENCED_PARAMETER( p0 );
    UNREFERENCED_PARAMETER( p1 );
    __asm {
        // save callee-saved registers (4 pushes => args land at esp+20/24)
        push ebp
        push ebx
        push esi
        push edi

        //================
        // pointer array
        mov eax, [esp + 20]     // $eax = p0
        mov edx, [esp + 24]     // $edx = p1
        mov ecx, 4 * 16         // small stride between table entries (bytes)
        mov ebx, 4 * 48         // large stride between entry groups (bytes)

        prefetcht0 [eax + 512]
        prefetcht0 [eax + 768]
        prefetcht0 [eax + 1024]
        prefetcht0 [eax + 1280]

        add edx, ecx
        add eax, ebx

        // Build the work list on the stack: 32 dwords = 16 (p0,p1) pairs.
        push edx
        sub edx, ecx
        push edx
        push edx
        sub edx, ebx
        push eax
        push eax
        sub eax, ecx
        push eax
        push eax
        sub eax, ecx
        push eax
        sub eax, ecx

        push edx
        sub edx, ecx
        push edx
        sub eax, ecx
        push edx
        sub edx, ebx
        push eax
        push eax
        sub eax, ecx
        push eax
        push eax
        sub eax, ecx
        push eax
        sub eax, ecx

        push edx
        sub edx, ecx
        push edx
        sub eax, ecx
        push edx
        sub edx, ebx
        push eax
        push eax
        sub eax, ecx
        push eax
        push eax
        sub eax, ecx
        push eax
        sub eax, ecx

        push edx
        sub edx, ecx
        push edx
        sub eax, ecx
        push edx
        push eax
        push eax
        sub eax, ecx
        push eax
        push eax
        sub eax, ecx
        push eax

        // Loop counter: -(128) counts up by 8 per pair (16 iterations);
        // it lives at [esp] so ebp is free as scratch inside the loop.
        mov ebp, (4 + 4) * -16
        push ebp
    }
Loop0:
    __asm {
        // fetch this iteration's pointer pair from the table
        mov esi, [esp + (4 + 4) * 16 + 4 + ebp ]    // $esi = p0
        mov edi, [esp + (4 + 4) * 16 + 4 + ebp + 4] // $edi = p1

        //================
        movdqa xmm2, [esi + 4 * 12] // a = xmm2
        movdqa xmm1, [esi + 4 * 72] // b = xmm1
        movdqa xmm6, [edi + 4 * 4]  // c = xmm6
        movdqa xmm7, [edi + 4 * 64] // d = xmm7

        //================
        // butterfly
        paddd xmm2, xmm7
        psubd xmm1, xmm6

        movdqa xmm0, xmm2           // a = xmm0
        psubd xmm2, xmm1
        psrad xmm2, 1
        movdqa xmm3, xmm2

        psubd xmm2, xmm7            // c = xmm2
        psubd xmm3, xmm6            // d = xmm3
        paddd xmm1, xmm2
        psubd xmm0, xmm3

        //================
        // bottom right corner: -pi/8 rotation => -pi/8 rotation
        // Scalar pipeline: the four lanes of xmm3 are extracted into
        // eax/ebx/ecx/edx and run through an invOddOdd-style lifting
        // kernel (note the rounding constants differ from invOddOdd_OPT;
        // this is the "alternate" variant).
        pshufd xmm7, xmm3, 0x3
        movd eax, xmm3
        movd edx, xmm7
        pshufd xmm7, xmm3, 0x1
        movd ebx, xmm7
        pshufd xmm7, xmm3, 0x2
        movd ecx, xmm7

        add edx, eax
        sub ecx, ebx
        mov esi, edx
        sar esi, 1
        mov edi, ecx
        sar edi, 1
        sub eax, esi
        add ebx, edi

        // three lifting shears (x*3 via lea, rounded, then arithmetic shift)
        lea ebp, [ebx + ebx * 2 + 6]
        sar ebp, 3
        sub eax, ebp
        lea ebp, [eax + eax * 2 + 2]
        sar ebp, 2
        add ebx, ebp
        lea ebp, [ebx + ebx * 2 + 4]
        sar ebp, 3
        sub eax, ebp

        mov ebp, [esp]              // reload loop counter (ebp was scratch)

        sub ebx, edi
        add eax, esi
        add ecx, ebx
        sub edx, eax

        // esi/edi were clobbered above; refetch the pointer pair
        mov esi, [esp + (4 + 4) * 16 + 4 + ebp ]    // $esi = p0
        mov edi, [esp + (4 + 4) * 16 + 4 + ebp + 4] // $edi = p1

        // repack the four scalar results into xmm3
        movd xmm3, eax
        movd xmm4, ebx
        movd xmm5, ecx
        movd xmm6, edx
        punpckldq xmm3, xmm4
        punpckldq xmm5, xmm6
        punpcklqdq xmm3, xmm5

        //================
        // anti diagonal corners: rotation by -pi/8
        movdqa xmm5, g_const_d1
        movdqa xmm6, g_const_d1

        pshufd xmm2, xmm2, 0xd8 //  7,  5,  6,  4
        movdqa xmm4, xmm1       // 75, 74, 73, 72
        punpckhqdq xmm1, xmm2   //  7,  5, 75, 74
        punpcklqdq xmm4, xmm2   //  6,  4, 73, 72

        paddd xmm5, xmm1
        psrad xmm5, 1
        psubd xmm4, xmm5

        paddd xmm6, xmm4
        psrad xmm6, 1
        paddd xmm1, xmm6

        movdqa xmm2, xmm4       //  6,  4, 73, 72
        punpckhqdq xmm4, xmm1   //  7,  5,  6,  4
        punpcklqdq xmm2, xmm1   // 75, 74, 73, 72
        pshufd xmm4, xmm4, 0xd8 //  7,  6,  5,  4

        //================
        // butterfly
        // a = xmm0, b = xmm2, c = xmm4, d = xmm3
        paddd xmm0, xmm3
        movdqa xmm1, xmm0   // a = xmm1
        psrad xmm0, 1
        psubd xmm0, xmm3    // d = xmm0

        movdqa xmm3, xmm0   // d = xmm3
        paddd xmm0, xmm0
        paddd xmm0, xmm3
        psrad xmm0, 3
        paddd xmm1, xmm0

        movdqa xmm0, xmm1   // a = xmm0
        paddd xmm1, xmm1
        paddd xmm1, xmm0
        psrad xmm1, 4
        paddd xmm3, xmm1

        movdqa xmm5, xmm0   // a
        psrad xmm5, 7
        paddd xmm3, xmm5    // d += (a >> 7)
        psrad xmm5, 3
        psubd xmm3, xmm5    // d -= (a >> 10)

        movdqa xmm5, [g_const_d4]
        movdqa xmm1, xmm3   // d = xmm1
        psubd xmm2, xmm4
        paddd xmm5, xmm3
        paddd xmm3, xmm3
        paddd xmm3, xmm5
        psrad xmm3, 3
        paddd xmm0, xmm3

        movdqa xmm3, xmm2   // b = xmm3
        psrad xmm2, 1
        psubd xmm1, xmm2

        movdqa xmm2, xmm0   // a = xmm2
        psubd xmm0, xmm3
        psrad xmm0, 1
        psubd xmm0, xmm4    // c = xmm0

        paddd xmm3, xmm1
        psubd xmm2, xmm0

        //================
        // store the filtered vectors back to the same tile positions
        movdqa [edi + 4 * 4], xmm1
        movdqa [edi + 4 * 64], xmm0
        movdqa [esi + 4 * 12], xmm2
        movdqa [esi + 4 * 72], xmm3

        // next pointer pair; counter hits zero when all 16 are done
        add ebp, 8
        mov [esp], ebp
        jnz Loop0

        //================
        // drop pointer table + counter, restore callee-saved registers,
        // and pop the two stdcall arguments
        add esp, (4 + 4) * 16 + 4
        pop edi
        pop esi
        pop ebx
        pop ebp
        ret 4 * 2
    }
}
1527 
invTransformMacroblock_YUV444_Center5(CWMImageStrCodec * pSC)1528 Int invTransformMacroblock_YUV444_Center5(CWMImageStrCodec * pSC)
1529 {
1530     const OVERLAP olOverlap = pSC->WMISCP.olOverlap;
1531     int i = 0;
1532 
1533     assert(0 < pSC->cRow && pSC->cRow < pSC->cmbHeight);
1534     assert(0 < pSC->cColumn && pSC->cColumn < pSC->cmbWidth);
1535 
1536     assert(0 == pSC->WMII.cPostProcStrength);
1537 
1538     assert(YUV_444 == pSC->m_param.cfColorFormat);
1539     assert(3 == pSC->m_param.cNumChannels);
1540 
1541     assert(pSC->m_Dparam->bDecodeFullWidth);
1542     assert(1 == pSC->m_Dparam->cThumbnailScale);
1543 
1544     for (i = 0; i < 3; ++i)
1545     {
1546         PixelI* const p0 = pSC->p0MBbuffer[i];
1547         PixelI* const p1 = pSC->p1MBbuffer[i];
1548 
1549         //================================
1550         // second level inverse transform
1551         strIDCT4x4Stage2(p1);
1552         if (pSC->m_param.bScaledArith) {
1553             strNormalizeDec(p1, (i != 0));
1554         }
1555 
1556         //================================
1557         // second level inverse overlap
1558         if (OL_TWO <= olOverlap)
1559         {
1560             strPost4x4Stage2Split_alternate(p0, p1);
1561         }
1562 
1563         //================================
1564         // first level inverse transform
1565         strIDCT4x4Stage1_OPT5(p0, p1);
1566 
1567         //================================
1568         // first level inverse overlap
1569         if (OL_ONE <= olOverlap)
1570         {
1571             strPost4x4Stage1_alternate_ASM5(p0, p1);
1572         }
1573     }
1574 
1575     return ICERR_OK;
1576 }
1577 #endif
1578 #endif
1579 
1580 //================================================================
// Install SSE2-optimized decoder paths when the CPU supports them and the
// stream/output configuration matches what the fast routines assume.
void StrDecOpt(CWMImageStrCodec* pSC)
{
#if defined(WMP_OPT_SSE2)
    if (IsProcessorFeaturePresent(PF_XMMI64_INSTRUCTIONS_AVAILABLE))
    {
        CWMImageInfo* pII = &pSC->WMII;

        // Broadcast the small constants shared by the SSE2 kernels.
        g_const_d0 = _mm_setzero_si128();
        g_const_d1 = _mm_set1_epi32(1);
        g_const_d3 = _mm_set1_epi32(3);
        g_const_d4 = _mm_set1_epi32(4);

        g_const_d0x80 = _mm_set1_epi32(0x80);
        g_const_w0x80 = _mm_set1_epi16(0x80);
        g_const_b0x80 = _mm_set1_epi8((char)0x80);

        // Fast color-conversion output: padded 8bpc RGB24 buffer, no
        // rotation, YUV 4:4:4 with evenly spaced channel buffers, and a
        // full-frame decode. (Upstream also gated on 16-aligned width and
        // output pointer, but those checks are disabled.)
        if (pII->fPaddedUserBuffer &&
            BD_8 == pII->bdBitDepth &&
            CF_RGB == pII->cfColorFormat &&
            24 == pII->cBitsPerUnit &&
            pII->bRGB &&
            O_NONE == pII->oOrientation &&
            YUV_444 == pSC->m_param.cfColorFormat &&
            pSC->p1MBbuffer[1] - pSC->p1MBbuffer[0] == pSC->p1MBbuffer[2] - pSC->p1MBbuffer[1] &&
            pSC->m_Dparam->bDecodeFullFrame)
        {
#if defined(WMP_OPT_CC_DEC)
            // Scaled arithmetic or any overlap filtering implies the
            // lossy output routine; otherwise the lossless one applies.
            if (pSC->m_param.bScaledArith || pSC->WMISCP.olOverlap != OL_NONE)
                pSC->Load = outputMBRow_RGB24_Lossy_3;
            else
                pSC->Load = outputMBRow_RGB24_Lossless_1;
#endif // WMP_OPT_CC_DEC
        }

        // Fast inverse transform for interior macroblocks.
        if (YUV_444 == pSC->m_param.cfColorFormat &&
            pSC->p1MBbuffer[1] - pSC->p1MBbuffer[0] == pSC->p1MBbuffer[2] - pSC->p1MBbuffer[1] &&
            pSC->m_Dparam->bDecodeFullWidth &&
            pSC->m_param.cSubVersion == CODEC_SUBVERSION_NEWSCALING_SOFT_TILES &&
            1 == pSC->m_Dparam->cThumbnailScale)
        {
#if defined(WMP_OPT_TRFM_DEC)
            pSC->TransformCenter = invTransformMacroblock_YUV444_Center5;
#endif
        }
    }
#else
    UNREFERENCED_PARAMETER( pSC );
#endif
}
1640 
1641