1 //*@@@+++@@@@******************************************************************
2 //
// Copyright © Microsoft Corp.
4 // All rights reserved.
5 //
6 // Redistribution and use in source and binary forms, with or without
7 // modification, are permitted provided that the following conditions are met:
8 //
// • Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// • Redistributions in binary form must reproduce the above copyright notice,
12 // this list of conditions and the following disclaimer in the documentation
13 // and/or other materials provided with the distribution.
14 //
15 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
19 // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20 // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21 // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22 // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23 // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25 // POSSIBILITY OF SUCH DAMAGE.
26 //
27 //*@@@---@@@@******************************************************************
28 #include "strcodec.h"
29 #include "decode.h"
30
31 #if defined(WMP_OPT_SSE2)
32 #include <emmintrin.h>
33 #include <windows.h>
34
35 //================================================================
36 static __m128i g_const_d0;
37 static __m128i g_const_d1;
38
39 __m128i g_const_d3;
40 __m128i g_const_d4;
41 __m128i g_const_d0x80;
42 __m128i g_const_w0x80;
43 __m128i g_const_b0x80;
44
45 //================================================================
46 #if defined(WMP_OPT_CC_DEC)
storeRGB24_5(U8 * pbYCoCg,size_t cbYCoCg,const U8 * pbRGB,size_t cbRGB,size_t cmb)47 __declspec(naked) void __stdcall storeRGB24_5(
48 U8* pbYCoCg,
49 size_t cbYCoCg,
50 const U8* pbRGB,
51 size_t cbRGB,
52 size_t cmb)
53 {
54 #define DISP 8
55 UNREFERENCED_PARAMETER( pbYCoCg );
56 UNREFERENCED_PARAMETER( cbYCoCg );
57 UNREFERENCED_PARAMETER( pbRGB );
58 UNREFERENCED_PARAMETER( cbRGB );
59 UNREFERENCED_PARAMETER( cmb );
60 __asm {
61 push ebp
62 push ebx
63 push esi
64 push edi
65
66 mov ebx, [esp + 36] // $ebx = cmb
67 mov edi, [esp + 28] // $edi = pbRGB
68 lea ebx, [ebx + ebx * 2] // $ebx = cmb * 3
69 mov edx, [esp + 32] // $edx = cbRGB
70 shl ebx, 4 // $ebx = cmb * 3 * 16
71 mov esi, [esp + 20] // $esi = pbYCoCg
72 add edi, ebx // $edi = pbRGB + 3 * 16 * cmb
73 mov ebp, [esp + 24] // $ebp = cbYCoCg
74 neg ebx
75
76 mov eax, esp
77 and esp, 0xffffff80
78 sub esp, 64 * 4 + DISP
79
80 mov [esp], eax // original $esp
81 mov [esp + 4], edi
82 }
83 Loop0:
84 __asm {
85 mov edi, [esp + 4] // $edi = pbRGB + 3 * 16 * cmb
86
87 // first 8 pixels
88 pxor xmm1, xmm1
89 pxor xmm5, xmm5
90 movdqa xmm0, [esi]
91 movdqa xmm4, [esi + 16]
92 psubd xmm1, [esi + ebp]
93 psubd xmm5, [esi + ebp + 16]
94 movdqa xmm2, [esi + ebp * 2]
95 movdqa xmm6, [esi + ebp * 2 + 16]
96
97 paddd xmm0, [g_const_d0x80]
98 paddd xmm4, [g_const_d0x80]
99
100 // ICC
101 movdqa xmm3, xmm1 // g -= r >> 1
102 movdqa xmm7, xmm5
103 psrad xmm3, 1
104 psrad xmm7, 1
105 psubd xmm0, xmm3
106 psubd xmm4, xmm7
107
108 movdqa xmm3, [g_const_d1] // r -= ((b + 1) >> 1) - g
109 movdqa xmm7, [g_const_d1]
110 paddd xmm3, xmm2
111 paddd xmm7, xmm6
112 paddd xmm1, xmm0
113 paddd xmm5, xmm4
114 psrad xmm3, 1
115 psrad xmm7, 1
116 psubd xmm1, xmm3
117 psubd xmm5, xmm7
118
119 paddd xmm2, xmm1 // b += r
120 paddd xmm6, xmm5
121
122 pslld xmm0, 8
123 pslld xmm2, 16
124 pslld xmm4, 8
125 pslld xmm6, 16
126 por xmm0, xmm1
127 por xmm4, xmm5
128 por xmm0, xmm2
129 por xmm4, xmm6
130
131 movdqa [esp + DISP + 64 * 0 + 16 * 0], xmm0
132 pslld xmm0, 8
133 movdqa [esp + DISP + 64 * 0 + 16 * 1], xmm4
134 pslld xmm4, 8
135 movdqa [esp + DISP + 64 * 0 + 16 * 2], xmm0
136 movdqa [esp + DISP + 64 * 0 + 16 * 3], xmm4
137
138 // second 8 pixels
139 pxor xmm1, xmm1
140 pxor xmm5, xmm5
141 movdqa xmm0, [esi + 32]
142 movdqa xmm4, [esi + 48]
143 psubd xmm1, [esi + ebp + 32]
144 psubd xmm5, [esi + ebp + 48]
145 movdqa xmm2, [esi + ebp * 2 + 32]
146 movdqa xmm6, [esi + ebp * 2 + 48]
147
148 paddd xmm0, [g_const_d0x80]
149 paddd xmm4, [g_const_d0x80]
150
151 // ICC
152 movdqa xmm3, xmm1 // g -= r >> 1
153 movdqa xmm7, xmm5
154 psrad xmm3, 1
155 psrad xmm7, 1
156 psubd xmm0, xmm3
157 psubd xmm4, xmm7
158
159 movdqa xmm3, [g_const_d1] // r -= ((b + 1) >> 1) - g
160 movdqa xmm7, [g_const_d1]
161 paddd xmm3, xmm2
162 paddd xmm7, xmm6
163 paddd xmm1, xmm0
164 paddd xmm5, xmm4
165 psrad xmm3, 1
166 psrad xmm7, 1
167 psubd xmm1, xmm3
168 psubd xmm5, xmm7
169
170 paddd xmm2, xmm1 // b += r
171 paddd xmm6, xmm5
172
173 pslld xmm0, 8
174 pslld xmm2, 16
175 pslld xmm4, 8
176 pslld xmm6, 16
177 por xmm0, xmm1
178 por xmm4, xmm5
179 por xmm0, xmm2
180 por xmm4, xmm6
181
182 movdqa [esp + DISP + 64 * 1 + 16 * 0], xmm0
183 pslld xmm0, 8
184 movdqa [esp + DISP + 64 * 1 + 16 * 1], xmm4
185 pslld xmm4, 8
186 movdqa [esp + DISP + 64 * 1 + 16 * 2], xmm0
187 movdqa [esp + DISP + 64 * 1 + 16 * 3], xmm4
188
189 //================
190 add esi, 64
191
192 // first 8 pixels
193 pxor xmm1, xmm1
194 pxor xmm5, xmm5
195 movdqa xmm0, [esi]
196 movdqa xmm4, [esi + 16]
197 psubd xmm1, [esi + ebp]
198 psubd xmm5, [esi + ebp + 16]
199 movdqa xmm2, [esi + ebp * 2]
200 movdqa xmm6, [esi + ebp * 2 + 16]
201
202 paddd xmm0, [g_const_d0x80]
203 paddd xmm4, [g_const_d0x80]
204
205 // ICC
206 movdqa xmm3, xmm1 // g -= r >> 1
207 movdqa xmm7, xmm5
208 psrad xmm3, 1
209 psrad xmm7, 1
210 psubd xmm0, xmm3
211 psubd xmm4, xmm7
212
213 movdqa xmm3, [g_const_d1] // r -= ((b + 1) >> 1) - g
214 movdqa xmm7, [g_const_d1]
215 paddd xmm3, xmm2
216 paddd xmm7, xmm6
217 paddd xmm1, xmm0
218 paddd xmm5, xmm4
219 psrad xmm3, 1
220 psrad xmm7, 1
221 psubd xmm1, xmm3
222 psubd xmm5, xmm7
223
224 paddd xmm2, xmm1 // b += r
225 paddd xmm6, xmm5
226
227 pslld xmm0, 8
228 pslld xmm2, 16
229 pslld xmm4, 8
230 pslld xmm6, 16
231
232 por xmm0, xmm1
233 por xmm4, xmm5
234 por xmm0, xmm2
235 por xmm4, xmm6
236
237 movdqa [esp + DISP + 64 * 2 + 16 * 0], xmm0
238 pslld xmm0, 8
239 movdqa [esp + DISP + 64 * 2 + 16 * 1], xmm4
240 pslld xmm4, 8
241 movdqa [esp + DISP + 64 * 2 + 16 * 2], xmm0
242 movdqa [esp + DISP + 64 * 2 + 16 * 3], xmm4
243
244 // second 8 pixels
245 pxor xmm1, xmm1
246 pxor xmm5, xmm5
247 movdqa xmm0, [esi + 32]
248 movdqa xmm4, [esi + 48]
249 psubd xmm1, [esi + ebp + 32]
250 psubd xmm5, [esi + ebp + 48]
251 movdqa xmm2, [esi + ebp * 2 + 32]
252 movdqa xmm6, [esi + ebp * 2 + 48]
253
254 paddd xmm0, [g_const_d0x80]
255 paddd xmm4, [g_const_d0x80]
256
257 // ICC
258 movdqa xmm3, xmm1 // g -= r >> 1
259 movdqa xmm7, xmm5
260 psrad xmm3, 1
261 psrad xmm7, 1
262 psubd xmm0, xmm3
263 psubd xmm4, xmm7
264
265 movdqa xmm3, [g_const_d1] // r -= ((b + 1) >> 1) - g
266 movdqa xmm7, [g_const_d1]
267 paddd xmm3, xmm2
268 paddd xmm7, xmm6
269 paddd xmm1, xmm0
270 paddd xmm5, xmm4
271 psrad xmm3, 1
272 psrad xmm7, 1
273 psubd xmm1, xmm3
274 psubd xmm5, xmm7
275
276 paddd xmm2, xmm1 // b += r
277 paddd xmm6, xmm5
278
279 pslld xmm0, 8
280 pslld xmm2, 16
281 pslld xmm4, 8
282 pslld xmm6, 16
283 por xmm0, xmm1
284 por xmm4, xmm5
285 por xmm0, xmm2
286 por xmm4, xmm6
287
288 movdqa [esp + DISP + 64 * 3 + 16 * 0], xmm0
289 pslld xmm0, 8
290 movdqa [esp + DISP + 64 * 3 + 16 * 1], xmm4
291 pslld xmm4, 8
292 movdqa [esp + DISP + 64 * 3 + 16 * 2], xmm0
293 movdqa [esp + DISP + 64 * 3 + 16 * 3], xmm4
294
295 //================================
296 // RGBX32 -> RGB24
297 mov eax, [esp + DISP + 64 * 0 + 4] // ..B1G1R1
298 mov ecx, [esp + DISP + 64 * 0 + 32] // B0G0R0..
299 shld eax, ecx, 24 // R1B0G0R0
300 mov [edi + ebx + 0], eax
301 mov eax, [esp + DISP + 64 * 0 + 20] // ..B5G5R5
302 mov ecx, [esp + DISP + 64 * 0 + 36] // B1G1R1..
303 shld eax, ecx, 16 // G5R5B1G1
304 mov [edi + ebx + 4], eax
305 mov eax, [esp + DISP + 64 * 0 + 16] // ..B4G4R4
306 mov ecx, [esp + DISP + 64 * 0 + 52] // B5G5R5..
307 shld eax, ecx, 8 // B4G4R4B5
308 mov [edi + ebx + 8], eax
309 add edi, edx // $edi = pbRGB += cbRGB
310
311 mov eax, [esp + DISP + 64 * 0 + 4 + 8] // ..B3G3R3
312 mov ecx, [esp + DISP + 64 * 0 + 32 + 8] // B2G2R2..
313 shld eax, ecx, 24 // R3B2G2R2
314 mov [edi + ebx + 0], eax
315 mov eax, [esp + DISP + 64 * 0 + 20 + 8] // ..B7G7R7
316 mov ecx, [esp + DISP + 64 * 0 + 36 + 8] // B3G3R3..
317 shld eax, ecx, 16 // G7R7B3G3
318 mov [edi + ebx + 4], eax
319 mov eax, [esp + DISP + 64 * 0 + 16 + 8] // ..B6G6R6
320 mov ecx, [esp + DISP + 64 * 0 + 52 + 8] // B7G7R7..
321 shld eax, ecx, 8 // B6G6R6B7
322 mov [edi + ebx + 8], eax
323 add edi, edx // $edi = pbRGB += cbRGB
324
325 // RGBX32 -> RGB24
326 mov eax, [esp + DISP + 64 * 1 + 4 + 8] // ..B3G3R3
327 mov ecx, [esp + DISP + 64 * 1 + 32 + 8] // B2G2R2..
328 shld eax, ecx, 24 // R3B2G2R2
329 mov [edi + ebx + 0], eax
330 mov eax, [esp + DISP + 64 * 1 + 20 + 8] // ..B7G7R7
331 mov ecx, [esp + DISP + 64 * 1 + 36 + 8] // B3G3R3..
332 shld eax, ecx, 16 // G7R7B3G3
333 mov [edi + ebx + 4], eax
334 mov eax, [esp + DISP + 64 * 1 + 16 + 8] // ..B6G6R6
335 mov ecx, [esp + DISP + 64 * 1 + 52 + 8] // B7G7R7..
336 shld eax, ecx, 8 // B6G6R6B7
337 mov [edi + ebx + 8], eax
338 add edi, edx // $edi = pbRGB += cbRGB
339
340 mov eax, [esp + DISP + 64 * 1 + 4] // ..B1G1R1
341 mov ecx, [esp + DISP + 64 * 1 + 32] // B0G0R0..
342 shld eax, ecx, 24 // R1B0G0R0
343 mov [edi + ebx + 0], eax
344 mov eax, [esp + DISP + 64 * 1 + 20] // ..B5G5R5
345 mov ecx, [esp + DISP + 64 * 1 + 36] // B1G1R1..
346 shld eax, ecx, 16 // G5R5B1G1
347 mov [edi + ebx + 4], eax
348 mov eax, [esp + DISP + 64 * 1 + 16] // ..B4G4R4
349 mov ecx, [esp + DISP + 64 * 1 + 52] // B5G5R5..
350 shld eax, ecx, 8 // B4G4R4B5
351 mov [edi + ebx + 8], eax
352 add edi, edx // $edi = pbRGB += cbRGB
353
354 // RGBX32 -> RGB24
355 mov eax, [esp + DISP + 64 * 2 + 4] // ..B1G1R1
356 mov ecx, [esp + DISP + 64 * 2 + 32] // B0G0R0..
357 shld eax, ecx, 24 // R1B0G0R0
358 mov [edi + ebx + 0], eax
359 mov eax, [esp + DISP + 64 * 2 + 20] // ..B5G5R5
360 mov ecx, [esp + DISP + 64 * 2 + 36] // B1G1R1..
361 shld eax, ecx, 16 // G5R5B1G1
362 mov [edi + ebx + 4], eax
363 mov eax, [esp + DISP + 64 * 2 + 16] // ..B4G4R4
364 mov ecx, [esp + DISP + 64 * 2 + 52] // B5G5R5..
365 shld eax, ecx, 8 // B4G4R4B5
366 mov [edi + ebx + 8], eax
367 add edi, edx // $edi = pbRGB += cbRGB
368
369 mov eax, [esp + DISP + 64 * 2 + 4 + 8] // ..B3G3R3
370 mov ecx, [esp + DISP + 64 * 2 + 32 + 8] // B2G2R2..
371 shld eax, ecx, 24 // R3B2G2R2
372 mov [edi + ebx + 0], eax
373 mov eax, [esp + DISP + 64 * 2 + 20 + 8] // ..B7G7R7
374 mov ecx, [esp + DISP + 64 * 2 + 36 + 8] // B3G3R3..
375 shld eax, ecx, 16 // G7R7B3G3
376 mov [edi + ebx + 4], eax
377 mov eax, [esp + DISP + 64 * 2 + 16 + 8] // ..B6G6R6
378 mov ecx, [esp + DISP + 64 * 2 + 52 + 8] // B7G7R7..
379 shld eax, ecx, 8 // B6G6R6B7
380 mov [edi + ebx + 8], eax
381 add edi, edx // $edi = pbRGB += cbRGB
382
383 // RGBX32 -> RGB24
384 mov eax, [esp + DISP + 64 * 3 + 4 + 8] // ..B3G3R3
385 mov ecx, [esp + DISP + 64 * 3 + 32 + 8] // B2G2R2..
386 shld eax, ecx, 24 // R3B2G2R2
387 mov [edi + ebx + 0], eax
388 mov eax, [esp + DISP + 64 * 3 + 20 + 8] // ..B7G7R7
389 mov ecx, [esp + DISP + 64 * 3 + 36 + 8] // B3G3R3..
390 shld eax, ecx, 16 // G7R7B3G3
391 mov [edi + ebx + 4], eax
392 mov eax, [esp + DISP + 64 * 3 + 16 + 8] // ..B6G6R6
393 mov ecx, [esp + DISP + 64 * 3 + 52 + 8] // B7G7R7..
394 shld eax, ecx, 8 // B6G6R6B7
395 mov [edi + ebx + 8], eax
396 add edi, edx // $edi = pbRGB += cbRGB
397
398 mov eax, [esp + DISP + 64 * 3 + 4] // ..B1G1R1
399 mov ecx, [esp + DISP + 64 * 3 + 32] // B0G0R0..
400 shld eax, ecx, 24 // R1B0G0R0
401 mov [edi + ebx + 0], eax
402 mov eax, [esp + DISP + 64 * 3 + 20] // ..B5G5R5
403 mov ecx, [esp + DISP + 64 * 3 + 36] // B1G1R1..
404 shld eax, ecx, 16 // G5R5B1G1
405 mov [edi + ebx + 4], eax
406 mov eax, [esp + DISP + 64 * 3 + 16] // ..B4G4R4
407 mov ecx, [esp + DISP + 64 * 3 + 52] // B5G5R5..
408 shld eax, ecx, 8 // B4G4R4B5
409 mov [edi + ebx + 8], eax
410
411 //================================
412 add esi, 256 - 64
413 add ebx, 12
414 jnz Loop0
415
416 //================
417 pop esp
418 pop edi
419 pop esi
420 pop ebx
421 pop ebp
422 ret 20
423 }
424 }
425
outputMBRow_RGB24_Lossless_1(CWMImageStrCodec * pSC)426 Int outputMBRow_RGB24_Lossless_1(CWMImageStrCodec* pSC)
427 {
428 #ifdef REENTRANT_MODE
429 const size_t cHeight = min((pSC->m_Dparam->cROIBottomY + 1) - (pSC->cRow - 1) * 16, 16);
430 const size_t iFirstRow = ((pSC->cRow - 1) * 16 > pSC->m_Dparam->cROITopY ? 0 : (pSC->m_Dparam->cROITopY & 0xf));
431 #endif
432 const size_t cbRGB = pSC->WMIBI.cbStride;
433 const U8* const pbRGB = (U8*)pSC->WMIBI.pv + cbRGB * (pSC->cRow - 1) * 16;
434
435 U8* const pbY = (U8*)pSC->a0MBbuffer[0];
436 U8* const pbU = (U8*)pSC->a0MBbuffer[1];
437 // U8* const pbV = (U8*)pSC->a0MBbuffer[2];
438
439 const size_t cmbColumn = (pSC->WMII.cWidth + 15) / 16;
440
441 assert(BD_8 == pSC->WMII.bdBitDepth);
442 assert(CF_RGB == pSC->WMII.cfColorFormat);
443 assert(24 == pSC->WMII.cBitsPerUnit);
444 assert(pSC->WMII.bRGB);
445 assert(O_NONE == pSC->WMII.oOrientation);
446
447 assert(YUV_444 == pSC->m_param.cfColorFormat);
448 assert(!pSC->m_param.bScaledArith);
449
450 assert(pSC->m_Dparam->bDecodeFullFrame);
451
452 storeRGB24_5(pbY + 64 * 0, pbU - pbY, pbRGB + cbRGB * 0, cbRGB, cmbColumn);
453 storeRGB24_5(pbY + 64 * 2, pbU - pbY, pbRGB + cbRGB * 8, cbRGB, cmbColumn);
454
455 #ifdef REENTRANT_MODE
456 pSC->WMIBI.cLinesDecoded = cHeight - iFirstRow;
457 #endif
458 return ICERR_OK;
459 }
460
461
storeRGB24_3(U8 * pbYCoCg,size_t cbYCoCg,const U8 * pbRGB,size_t cbRGB,size_t cmb,const U8 * Shift)462 __declspec(naked) void __stdcall storeRGB24_3(
463 U8* pbYCoCg,
464 size_t cbYCoCg,
465 const U8* pbRGB,
466 size_t cbRGB,
467 size_t cmb,
468 const U8* Shift)
469 {
470 UNREFERENCED_PARAMETER( pbYCoCg );
471 UNREFERENCED_PARAMETER( cbYCoCg );
472 UNREFERENCED_PARAMETER( pbRGB );
473 UNREFERENCED_PARAMETER( cbRGB );
474 UNREFERENCED_PARAMETER( cmb );
475 UNREFERENCED_PARAMETER( Shift );
476 __asm {
477 push ebp
478 push ebx
479 push esi
480 push edi
481
482 mov ecx, [esp + 40] // $ecx = Shift
483 mov ebx, [esp + 36] // $ebx = cmb
484 mov edi, [esp + 28] // $edi = pbRGB
485 lea ebx, [ebx + ebx * 2] // $ebx = cmb * 3
486 mov edx, [esp + 32] // $edx = cbRGB
487 shl ebx, 4 // $ebx = cmb * 3 * 16
488 mov esi, [esp + 20] // $esi = pbYCoCg
489 add edi, ebx // $edi = pbRGB + 3 * 16 * cmb
490 mov ebp, [esp + 24] // $ebp = cbYCoCg
491 neg ebx
492
493 mov eax, esp
494 and esp, 0xffffff80
495 sub esp, 320
496
497 mov [esp], eax // original $esp
498 mov [esp + 4], edi
499 mov [esp + 8], ecx
500 }
501 Loop0:
502 __asm {
503 mov edi, [esp + 4] // $edi = pbRGB + 3 * 16 * cmb
504
505 //================
506 // first 8 pixels
507 movdqa xmm0, [esi]
508 movdqa xmm4, [esi + 16]
509 movdqa xmm3, [esi + ebp]
510 movdqa xmm7, [esi + ebp + 16]
511 movdqa xmm2, [esi + ebp * 2]
512 movdqa xmm6, [esi + ebp * 2 + 16]
513
514 mov ecx, [esp + 8]
515 movdqa xmm1, [ecx]
516 movdqa xmm5, [g_const_d0x80]
517 pslld xmm5, xmm1
518 paddd xmm5, xmm1
519 paddd xmm0, xmm5 // bias
520 paddd xmm4, xmm5 // bias
521 pxor xmm1, xmm1
522 pxor xmm5, xmm5
523 psubd xmm1, xmm3
524 psubd xmm5, xmm7
525
526 // ICC
527 movdqa xmm3, xmm1 // g -= r >> 1
528 movdqa xmm7, xmm5
529 psrad xmm3, 1
530 psrad xmm7, 1
531 psubd xmm0, xmm3
532 psubd xmm4, xmm7
533
534 movdqa xmm3, [g_const_d1] // r -= ((b + 1) >> 1) - g
535 movdqa xmm7, [g_const_d1]
536 paddd xmm3, xmm2
537 paddd xmm7, xmm6
538 paddd xmm1, xmm0
539 paddd xmm5, xmm4
540 psrad xmm3, 1
541 psrad xmm7, 1
542 psubd xmm1, xmm3
543 psubd xmm5, xmm7
544
545 paddd xmm2, xmm1 // b += r
546 paddd xmm6, xmm5
547
548 // clip
549 movdqa xmm3, [g_const_w0x80]
550 packssdw xmm0, xmm4
551 packssdw xmm1, xmm5
552 packssdw xmm2, xmm6
553
554 mov ecx, [esp + 8]
555 movdqa xmm4, [ecx]
556 psraw xmm0, xmm4
557 psraw xmm1, xmm4
558 psraw xmm2, xmm4
559
560 psubw xmm0, xmm3
561 psubw xmm1, xmm3
562 psubw xmm2, xmm3
563
564 movdqa [esp + 16], xmm0
565 movdqa [esp + 32], xmm1
566 movdqa [esp + 48], xmm2
567
568 //================
569 // second 8 pixels
570 movdqa xmm0, [esi + 32]
571 movdqa xmm4, [esi + 48]
572 movdqa xmm3, [esi + ebp + 32]
573 movdqa xmm7, [esi + ebp + 48]
574 movdqa xmm2, [esi + ebp * 2 + 32]
575 movdqa xmm6, [esi + ebp * 2 + 48]
576
577 mov ecx, [esp + 8]
578 movdqa xmm1, [ecx]
579 movdqa xmm5, [g_const_d0x80]
580 pslld xmm5, xmm1
581 paddd xmm5, xmm1
582 paddd xmm0, xmm5 // bias
583 paddd xmm4, xmm5 // bias
584 pxor xmm1, xmm1
585 pxor xmm5, xmm5
586 psubd xmm1, xmm3
587 psubd xmm5, xmm7
588
589 // ICC
590 movdqa xmm3, xmm1 // g -= r >> 1
591 movdqa xmm7, xmm5
592 psrad xmm3, 1
593 psrad xmm7, 1
594 psubd xmm0, xmm3
595 psubd xmm4, xmm7
596
597 movdqa xmm3, [g_const_d1] // r -= ((b + 1) >> 1) - g
598 movdqa xmm7, [g_const_d1]
599 paddd xmm3, xmm2
600 paddd xmm7, xmm6
601 paddd xmm1, xmm0
602 paddd xmm5, xmm4
603 psrad xmm3, 1
604 psrad xmm7, 1
605 psubd xmm1, xmm3
606 psubd xmm5, xmm7
607
608 paddd xmm2, xmm1 // b += r
609 paddd xmm6, xmm5
610
611 // clip
612 movdqa xmm3, [g_const_w0x80]
613 packssdw xmm0, xmm4
614 packssdw xmm1, xmm5
615 packssdw xmm2, xmm6
616
617 mov ecx, [esp + 8]
618 movdqa xmm4, [ecx]
619 psraw xmm0, xmm4
620 psraw xmm1, xmm4
621 psraw xmm2, xmm4
622
623 psubw xmm0, xmm3
624 psubw xmm1, xmm3
625 psubw xmm2, xmm3
626
627 //================
628 // 16 pixels
629 movdqa xmm3, [g_const_b0x80]
630 packsswb xmm0, [esp + 16]
631 packsswb xmm1, [esp + 32]
632 packsswb xmm2, [esp + 48]
633
634 psubb xmm0, xmm3
635 psubb xmm1, xmm3
636 psubb xmm2, xmm3
637
638 pxor xmm7, xmm7
639 movdqa xmm4, xmm0
640 movdqa xmm5, xmm1
641 movdqa xmm6, xmm2
642
643 punpckhbw xmm0, xmm7
644 punpckhbw xmm1, xmm7
645 punpckhbw xmm2, xmm7
646 punpcklbw xmm4, xmm7
647 punpcklbw xmm5, xmm7
648 punpcklbw xmm6, xmm7
649
650 // spill second 8 pixels
651 movdqa [esp + 16], xmm4
652 movdqa [esp + 32], xmm5
653 movdqa [esp + 48], xmm6
654
655 // first 8 pixels
656 movdqa xmm4, xmm0
657 movdqa xmm5, xmm1
658 movdqa xmm6, xmm2
659
660 punpcklwd xmm0, xmm7
661 punpcklwd xmm1, xmm7
662 punpcklwd xmm2, xmm7
663
664 punpckhwd xmm4, xmm7
665 punpckhwd xmm5, xmm7
666 punpckhwd xmm6, xmm7
667
668 pslld xmm0, 8
669 pslld xmm2, 16
670 pslld xmm4, 8
671 pslld xmm6, 16
672
673 por xmm0, xmm1
674 por xmm4, xmm5
675 por xmm0, xmm2
676 por xmm4, xmm6
677
678 movdqa [esp + 64], xmm0
679 pslld xmm0, 8
680 movdqa [esp + 80], xmm4
681 pslld xmm4, 8
682 movdqa [esp + 96], xmm0
683 movdqa [esp + 112], xmm4
684
685 // second 8 pixels
686 movdqa xmm0, [esp + 16]
687 movdqa xmm1, [esp + 32]
688 movdqa xmm2, [esp + 48]
689 movdqa xmm4, xmm0
690 movdqa xmm5, xmm1
691 movdqa xmm6, xmm2
692
693 punpcklwd xmm0, xmm7
694 punpcklwd xmm1, xmm7
695 punpcklwd xmm2, xmm7
696 punpckhwd xmm4, xmm7
697 punpckhwd xmm5, xmm7
698 punpckhwd xmm6, xmm7
699
700 pslld xmm0, 8
701 pslld xmm2, 16
702 pslld xmm4, 8
703 pslld xmm6, 16
704 por xmm0, xmm1
705 por xmm4, xmm5
706 por xmm0, xmm2
707 por xmm4, xmm6
708
709 movdqa [esp + 128], xmm0
710 pslld xmm0, 8
711 movdqa [esp + 144], xmm4
712 pslld xmm4, 8
713 movdqa [esp + 160], xmm0
714 movdqa [esp + 176], xmm4
715
716 //================================
717 add esi, 64
718
719 //================
720 // first 8 pixels
721 movdqa xmm0, [esi]
722 movdqa xmm4, [esi + 16]
723 movdqa xmm3, [esi + ebp]
724 movdqa xmm7, [esi + ebp + 16]
725 movdqa xmm2, [esi + ebp * 2]
726 movdqa xmm6, [esi + ebp * 2 + 16]
727
728 mov ecx, [esp + 8]
729 movdqa xmm1, [ecx]
730 movdqa xmm5, [g_const_d0x80]
731 pslld xmm5, xmm1
732 paddd xmm5, xmm1
733 paddd xmm0, xmm5 // bias
734 paddd xmm4, xmm5 // bias
735 pxor xmm1, xmm1
736 pxor xmm5, xmm5
737 psubd xmm1, xmm3
738 psubd xmm5, xmm7
739
740 // ICC
741 movdqa xmm3, xmm1 // g -= r >> 1
742 movdqa xmm7, xmm5
743 psrad xmm3, 1
744 psrad xmm7, 1
745 psubd xmm0, xmm3
746 psubd xmm4, xmm7
747
748 movdqa xmm3, [g_const_d1] // r -= ((b + 1) >> 1) - g
749 movdqa xmm7, [g_const_d1]
750 paddd xmm3, xmm2
751 paddd xmm7, xmm6
752 paddd xmm1, xmm0
753 paddd xmm5, xmm4
754 psrad xmm3, 1
755 psrad xmm7, 1
756 psubd xmm1, xmm3
757 psubd xmm5, xmm7
758
759 paddd xmm2, xmm1 // b += r
760 paddd xmm6, xmm5
761
762 // clip
763 movdqa xmm3, [g_const_w0x80]
764 packssdw xmm0, xmm4
765 packssdw xmm1, xmm5
766 packssdw xmm2, xmm6
767
768 mov ecx, [esp + 8]
769 movdqa xmm4, [ecx]
770 psraw xmm0, xmm4
771 psraw xmm1, xmm4
772 psraw xmm2, xmm4
773
774 psubw xmm0, xmm3
775 psubw xmm1, xmm3
776 psubw xmm2, xmm3
777
778 movdqa [esp + 16], xmm0
779 movdqa [esp + 32], xmm1
780 movdqa [esp + 48], xmm2
781
782 //================
783 // second 8 pixels
784 movdqa xmm0, [esi + 32]
785 movdqa xmm4, [esi + 48]
786 movdqa xmm3, [esi + ebp + 32]
787 movdqa xmm7, [esi + ebp + 48]
788 movdqa xmm2, [esi + ebp * 2 + 32]
789 movdqa xmm6, [esi + ebp * 2 + 48]
790
791 mov ecx, [esp + 8]
792 movdqa xmm1, [ecx]
793 movdqa xmm5, [g_const_d0x80]
794 pslld xmm5, xmm1
795 paddd xmm5, xmm1
796 paddd xmm0, xmm5 // bias
797 paddd xmm4, xmm5 // bias
798 pxor xmm1, xmm1
799 pxor xmm5, xmm5
800 psubd xmm1, xmm3
801 psubd xmm5, xmm7
802
803 // ICC
804 movdqa xmm3, xmm1 // g -= r >> 1
805 movdqa xmm7, xmm5
806 psrad xmm3, 1
807 psrad xmm7, 1
808 psubd xmm0, xmm3
809 psubd xmm4, xmm7
810
811 movdqa xmm3, [g_const_d1] // r -= ((b + 1) >> 1) - g
812 movdqa xmm7, [g_const_d1]
813 paddd xmm3, xmm2
814 paddd xmm7, xmm6
815 paddd xmm1, xmm0
816 paddd xmm5, xmm4
817 psrad xmm3, 1
818 psrad xmm7, 1
819 psubd xmm1, xmm3
820 psubd xmm5, xmm7
821
822 paddd xmm2, xmm1 // b += r
823 paddd xmm6, xmm5
824
825 // clip
826 movdqa xmm3, [g_const_w0x80]
827 packssdw xmm0, xmm4
828 packssdw xmm1, xmm5
829 packssdw xmm2, xmm6
830
831 mov ecx, [esp + 8]
832 movdqa xmm4, [ecx]
833 psraw xmm0, xmm4
834 psraw xmm1, xmm4
835 psraw xmm2, xmm4
836
837 psubw xmm0, xmm3
838 psubw xmm1, xmm3
839 psubw xmm2, xmm3
840
841 //================
842 // 16 pixels
843 movdqa xmm3, [g_const_b0x80]
844 packsswb xmm0, [esp + 16]
845 packsswb xmm1, [esp + 32]
846 packsswb xmm2, [esp + 48]
847
848 psubb xmm0, xmm3
849 psubb xmm1, xmm3
850 psubb xmm2, xmm3
851
852 pxor xmm7, xmm7
853 movdqa xmm4, xmm0
854 movdqa xmm5, xmm1
855 movdqa xmm6, xmm2
856
857 punpckhbw xmm0, xmm7
858 punpckhbw xmm1, xmm7
859 punpckhbw xmm2, xmm7
860 punpcklbw xmm4, xmm7
861 punpcklbw xmm5, xmm7
862 punpcklbw xmm6, xmm7
863
864 // spill second 8 pixels
865 movdqa [esp + 16], xmm4
866 movdqa [esp + 32], xmm5
867 movdqa [esp + 48], xmm6
868
869 // first 8 pixels
870 movdqa xmm4, xmm0
871 movdqa xmm5, xmm1
872 movdqa xmm6, xmm2
873
874 punpcklwd xmm0, xmm7
875 punpcklwd xmm1, xmm7
876 punpcklwd xmm2, xmm7
877
878 punpckhwd xmm4, xmm7
879 punpckhwd xmm5, xmm7
880 punpckhwd xmm6, xmm7
881
882 pslld xmm0, 8
883 pslld xmm2, 16
884 pslld xmm4, 8
885 pslld xmm6, 16
886
887 por xmm0, xmm1
888 por xmm4, xmm5
889 por xmm0, xmm2
890 por xmm4, xmm6
891
892 movdqa [esp + 192], xmm0
893 pslld xmm0, 8
894 movdqa [esp + 208], xmm4
895 pslld xmm4, 8
896 movdqa [esp + 224], xmm0
897 movdqa [esp + 240], xmm4
898
899 // second 8 pixels
900 movdqa xmm0, [esp + 16]
901 movdqa xmm1, [esp + 32]
902 movdqa xmm2, [esp + 48]
903 movdqa xmm4, xmm0
904 movdqa xmm5, xmm1
905 movdqa xmm6, xmm2
906
907 punpcklwd xmm0, xmm7
908 punpcklwd xmm1, xmm7
909 punpcklwd xmm2, xmm7
910 punpckhwd xmm4, xmm7
911 punpckhwd xmm5, xmm7
912 punpckhwd xmm6, xmm7
913
914 pslld xmm0, 8
915 pslld xmm2, 16
916 pslld xmm4, 8
917 pslld xmm6, 16
918 por xmm0, xmm1
919 por xmm4, xmm5
920 por xmm0, xmm2
921 por xmm4, xmm6
922
923 movdqa [esp + 256], xmm0
924 pslld xmm0, 8
925 movdqa [esp + 272], xmm4
926 pslld xmm4, 8
927 movdqa [esp + 288], xmm0
928 movdqa [esp + 304], xmm4
929
930 // RGBX32 -> RGB24
931 mov eax, [esp + 68] // ..B1G1R1
932 mov ecx, [esp + 96] // B0G0R0..
933 shld eax, ecx, 24 // R1B0G0R0
934 mov [edi + ebx + 0], eax
935 mov eax, [esp + 84] // ..B5G5R5
936 mov ecx, [esp + 100] // B1G1R1..
937 shld eax, ecx, 16 // G5R5B1G1
938 mov [edi + ebx + 4], eax
939 mov eax, [esp + 80] // ..B4G4R4
940 mov ecx, [esp + 116] // B5G5R5..
941 shld eax, ecx, 8 // B4G4R4B5
942 mov [edi + ebx + 8], eax
943 add edi, edx // $edi = pbRGB += cbRGB
944
945 mov eax, [esp + 76] // ..B3G3R3
946 mov ecx, [esp + 104] // B2G2R2..
947 shld eax, ecx, 24 // R3B2G2R2
948 mov [edi + ebx + 0], eax
949 mov eax, [esp + 92] // ..B7G7R7
950 mov ecx, [esp + 108] // B3G3R3..
951 shld eax, ecx, 16 // G7R7B3G3
952 mov [edi + ebx + 4], eax
953 mov eax, [esp + 88] // ..B6G6R6
954 mov ecx, [esp + 124] // B7G7R7..
955 shld eax, ecx, 8 // B6G6R6B7
956 mov [edi + ebx + 8], eax
957 add edi, edx // $edi = pbRGB += cbRGB
958
959 // RGBX32 -> RGB24
960 mov eax, [esp + 140] // ..B3G3R3
961 mov ecx, [esp + 168] // B2G2R2..
962 shld eax, ecx, 24 // R3B2G2R2
963 mov [edi + ebx + 0], eax
964 mov eax, [esp + 156] // ..B7G7R7
965 mov ecx, [esp + 172] // B3G3R3..
966 shld eax, ecx, 16 // G7R7B3G3
967 mov [edi + ebx + 4], eax
968 mov eax, [esp + 152] // ..B6G6R6
969 mov ecx, [esp + 188] // B7G7R7..
970 shld eax, ecx, 8 // B6G6R6B7
971 mov [edi + ebx + 8], eax
972 add edi, edx // $edi = pbRGB += cbRGB
973
974 mov eax, [esp + 132] // ..B1G1R1
975 mov ecx, [esp + 160] // B0G0R0..
976 shld eax, ecx, 24 // R1B0G0R0
977 mov [edi + ebx + 0], eax
978 mov eax, [esp + 148] // ..B5G5R5
979 mov ecx, [esp + 164] // B1G1R1..
980 shld eax, ecx, 16 // G5R5B1G1
981 mov [edi + ebx + 4], eax
982 mov eax, [esp + 144] // ..B4G4R4
983 mov ecx, [esp + 180] // B5G5R5..
984 shld eax, ecx, 8 // B4G4R4B5
985 mov [edi + ebx + 8], eax
986 add edi, edx // $edi = pbRGB += cbRGB
987
988 // RGBX32 -> RGB24
989 mov eax, [esp + 196] // ..B1G1R1
990 mov ecx, [esp + 224] // B0G0R0..
991 shld eax, ecx, 24 // R1B0G0R0
992 mov [edi + ebx + 0], eax
993 mov eax, [esp + 212] // ..B5G5R5
994 mov ecx, [esp + 228] // B1G1R1..
995 shld eax, ecx, 16 // G5R5B1G1
996 mov [edi + ebx + 4], eax
997 mov eax, [esp + 208] // ..B4G4R4
998 mov ecx, [esp + 244] // B5G5R5..
999 shld eax, ecx, 8 // B4G4R4B5
1000 mov [edi + ebx + 8], eax
1001 add edi, edx // $edi = pbRGB += cbRGB
1002
1003 mov eax, [esp + 204] // ..B3G3R3
1004 mov ecx, [esp + 232] // B2G2R2..
1005 shld eax, ecx, 24 // R3B2G2R2
1006 mov [edi + ebx + 0], eax
1007 mov eax, [esp + 220] // ..B7G7R7
1008 mov ecx, [esp + 236] // B3G3R3..
1009 shld eax, ecx, 16 // G7R7B3G3
1010 mov [edi + ebx + 4], eax
1011 mov eax, [esp + 216] // ..B6G6R6
1012 mov ecx, [esp + 252] // B7G7R7..
1013 shld eax, ecx, 8 // B6G6R6B7
1014 mov [edi + ebx + 8], eax
1015 add edi, edx // $edi = pbRGB += cbRGB
1016
1017 // RGBX32 -> RGB24
1018 mov eax, [esp + 268] // ..B3G3R3
1019 mov ecx, [esp + 296] // B2G2R2..
1020 shld eax, ecx, 24 // R3B2G2R2
1021 mov [edi + ebx + 0], eax
1022 mov eax, [esp + 284] // ..B7G7R7
1023 mov ecx, [esp + 300] // B3G3R3..
1024 shld eax, ecx, 16 // G7R7B3G3
1025 mov [edi + ebx + 4], eax
1026 mov eax, [esp + 280] // ..B6G6R6
1027 mov ecx, [esp + 316] // B7G7R7..
1028 shld eax, ecx, 8 // B6G6R6B7
1029 mov [edi + ebx + 8], eax
1030 add edi, edx // $edi = pbRGB += cbRGB
1031
1032 mov eax, [esp + 260] // ..B1G1R1
1033 mov ecx, [esp + 288] // B0G0R0..
1034 shld eax, ecx, 24 // R1B0G0R0
1035 mov [edi + ebx + 0], eax
1036 mov eax, [esp + 276] // ..B5G5R5
1037 mov ecx, [esp + 292] // B1G1R1..
1038 shld eax, ecx, 16 // G5R5B1G1
1039 mov [edi + ebx + 4], eax
1040 mov eax, [esp + 272] // ..B4G4R4
1041 mov ecx, [esp + 308] // B5G5R5..
1042 shld eax, ecx, 8 // B4G4R4B5
1043 mov [edi + ebx + 8], eax
1044 add edi, edx // $edi = pbRGB += cbRGB
1045
1046 //================================
1047 add esi, 256 - 64
1048 add ebx, 12
1049 jnz Loop0
1050
1051 //================
1052 pop esp
1053 pop edi
1054 pop esi
1055 pop ebx
1056 pop ebp
1057 ret 24
1058 }
1059 }
1060
outputMBRow_RGB24_Lossy_3(CWMImageStrCodec * pSC)1061 Int outputMBRow_RGB24_Lossy_3(CWMImageStrCodec* pSC)
1062 {
1063 #ifdef REENTRANT_MODE
1064 const size_t cHeight = min((pSC->m_Dparam->cROIBottomY + 1) - (pSC->cRow - 1) * 16, 16);
1065 const size_t iFirstRow = ((pSC->cRow - 1) * 16 > pSC->m_Dparam->cROITopY ? 0 : (pSC->m_Dparam->cROITopY & 0xf));
1066 #endif
1067 const size_t cbRGB = pSC->WMIBI.cbStride;
1068 const U8* const pbRGB = (U8*)pSC->WMIBI.pv + cbRGB * (pSC->cRow - 1) * 16;
1069
1070 U8* const pbY = (U8*)pSC->a0MBbuffer[0];
1071 U8* const pbU = (U8*)pSC->a0MBbuffer[1];
1072 // U8* const pbV = (U8*)pSC->a0MBbuffer[2];
1073
1074 const size_t cmbColumn = (pSC->WMII.cWidth + 15) / 16;
1075
1076 __declspec(align(16)) U8 Shift[16];
1077
1078 assert(BD_8 == pSC->WMII.bdBitDepth);
1079 assert(CF_RGB == pSC->WMII.cfColorFormat);
1080 assert(24 == pSC->WMII.cBitsPerUnit);
1081 assert(pSC->WMII.bRGB);
1082 assert(O_NONE == pSC->WMII.oOrientation);
1083
1084 assert(YUV_444 == pSC->m_param.cfColorFormat);
1085
1086 assert(pSC->m_Dparam->bDecodeFullFrame);
1087
1088 _mm_store_si128((__m128i *) Shift, pSC->m_param.bScaledArith ? g_const_d3 : g_const_d0);
1089 storeRGB24_3(pbY + 64 * 0, pbU - pbY, pbRGB + cbRGB * 0, cbRGB, cmbColumn,
1090 Shift);
1091 storeRGB24_3(pbY + 64 * 2, pbU - pbY, pbRGB + cbRGB * 8, cbRGB, cmbColumn,
1092 Shift);
1093
1094 #ifdef REENTRANT_MODE
1095 pSC->WMIBI.cLinesDecoded = cHeight - iFirstRow;
1096 #endif
1097 return ICERR_OK;
1098 }
1099 #endif
1100
1101 //================================================================
1102 #if defined(WMP_OPT_TRFM_DEC)
strDCT2x2up_OPT(PixelI * pa,PixelI * pb,PixelI * pc,PixelI * pd)1103 FORCE_INLINE Void strDCT2x2up_OPT(PixelI *pa, PixelI *pb, PixelI *pc, PixelI *pd)
1104 {
1105 PixelI a, b, c, d, C, t;
1106 a = *pa;
1107 b = *pb;
1108 C = *pc;
1109 d = *pd;
1110
1111 a += d;
1112 b -= C;
1113 t = ((a - b + 1) >> 1);
1114 c = t - d;
1115 d = t - C;
1116 a -= d;
1117 b += c;
1118
1119 *pa = a;
1120 *pb = b;
1121 *pc = c;
1122 *pd = d;
1123 }
1124
invOdd_OPT(PixelI * pa,PixelI * pb,PixelI * pc,PixelI * pd)1125 FORCE_INLINE Void invOdd_OPT(PixelI *pa, PixelI *pb, PixelI *pc, PixelI *pd)
1126 {
1127 PixelI a, b, c, d;
1128 a = *pa;
1129 b = *pb;
1130 c = *pc;
1131 d = *pd;
1132
1133 /** butterflies **/
1134 b += d;
1135 a -= c;
1136 d -= (b) >> 1;
1137 c += (a + 1) >> 1;
1138
1139 /** rotate pi/8 **/
1140 #define IROTATE2(a, b) (a) -= (((b)*3 + 4) >> 3), (b) += (((a)*3 + 4) >> 3)
1141 IROTATE2(a, b);
1142 IROTATE2(c, d);
1143
1144 /** butterflies **/
1145 c -= (b + 1) >> 1;
1146 d = ((a + 1) >> 1) - d;
1147 b += c;
1148 a -= d;
1149
1150 *pa = a;
1151 *pb = b;
1152 *pc = c;
1153 *pd = d;
1154 }
1155
invOddOdd_OPT(PixelI * pa,PixelI * pb,PixelI * pc,PixelI * pd)1156 FORCE_INLINE Void invOddOdd_OPT(PixelI* pa, PixelI* pb, PixelI* pc, PixelI* pd)
1157 {
1158 PixelI a, b, c, d, t1, t2;
1159 a = *pa;
1160 b = *pb;
1161 c = *pc;
1162 d = *pd;
1163
1164 /** butterflies **/
1165 d += a;
1166 c -= b;
1167 a -= (t1 = d >> 1);
1168 b += (t2 = c >> 1);
1169
1170 /** rotate pi/4 **/
1171 a -= (b * 3 + 3) >> 3;
1172 b += (a * 3 + 3) >> 2;
1173 a -= (b * 3 + 4) >> 3;
1174
1175 /** butterflies **/
1176 b -= t2;
1177 a += t1;
1178 c += b;
1179 d -= a;
1180
1181 /** sign flips **/
1182 *pa = a;
1183 *pb = -b;
1184 *pc = -c;
1185 *pd = d;
1186 }
1187
strDCT2x2dn_SSE2_1(PixelI * p)1188 FORCE_INLINE Void strDCT2x2dn_SSE2_1(PixelI* p)
1189 {
1190 __m128i* const pdq = (__m128i*)p;
1191 __m128i a = pdq[0];
1192 __m128i b = pdq[1];
1193 const __m128i C = pdq[2];
1194 __m128i d = pdq[3];
1195 __m128i t;
1196 __m128i c;
1197
1198 a = _mm_add_epi32(a, d);
1199 b = _mm_sub_epi32(b, C);
1200 t = _mm_sub_epi32(a, b);
1201 t = _mm_srai_epi32(t, 1);
1202 c = _mm_sub_epi32(t, d);
1203 d = _mm_sub_epi32(t, C);
1204 a = _mm_sub_epi32(a, d);
1205 b = _mm_add_epi32(b, c);
1206
1207 pdq[0] = a;
1208 pdq[1] = b;
1209 pdq[2] = c;
1210 pdq[3] = d;
1211 }
1212
strIDCT4x4Stage1_OPT_H1(PixelI * p)1213 Void strIDCT4x4Stage1_OPT_H1(PixelI* p)
1214 {
1215 /** top left corner, butterfly => butterfly **/
1216 strDCT2x2up_OPT(p + 0, p + 1, p + 2, p + 3);
1217
1218 /** top right corner, -pi/8 rotation => butterfly **/
1219 invOdd_OPT(p + 5, p + 4, p + 7, p + 6);
1220
1221 /** bottom left corner, butterfly => -pi/8 rotation **/
1222 invOdd_OPT(p + 10, p + 8, p + 11, p + 9);
1223
1224 /** bottom right corner, -pi/8 rotation => -pi/8 rotation **/
1225 invOddOdd_OPT(p + 15, p + 14, p + 13, p + 12);
1226 }
1227
// Second half of the 4x4 first-level inverse DCT: the cross-row butterfly,
// vectorized over all four rows with SSE2. Split from _H1 so the scalar and
// SIMD phases can be scheduled separately by the caller (see strIDCT4x4Stage1_OPT5).
FORCE_INLINE Void strIDCT4x4Stage1_OPT_H2(PixelI* p)
{
    /** butterfly **/
    strDCT2x2dn_SSE2_1(p);
}
1233
// First-level inverse DCT over sixteen 4x4 blocks spanning the p0/p1
// macroblock row buffers. All scalar corner passes (_H1) run first, then all
// SSE2 butterfly passes (_H2), so the SIMD phase streams over data the scalar
// phase has already touched. The prefetches are interleaved with the _H1 calls
// to pull upcoming cache lines (+256 bytes ahead) in ahead of use.
// NOTE(review): the block offsets (-96, -80, -32, ... in PixelI units) encode
// the buffer interleaving established by the caller — confirm against the
// macroblock buffer layout before changing any of them.
Void strIDCT4x4Stage1_OPT5(PixelI* p0, PixelI* p1)
{
    _mm_prefetch((char*)(p0 - 96 + 256), _MM_HINT_T0);
    strIDCT4x4Stage1_OPT_H1(p0 - 96);
    strIDCT4x4Stage1_OPT_H1(p0 - 80);
    strIDCT4x4Stage1_OPT_H1(p0 - 32);
    strIDCT4x4Stage1_OPT_H1(p0 - 16);

    _mm_prefetch((char*)(p0 - 32 + 256), _MM_HINT_T0);
    strIDCT4x4Stage1_OPT_H1(p0 + 32);
    strIDCT4x4Stage1_OPT_H1(p0 + 48);
    strIDCT4x4Stage1_OPT_H1(p0 + 96);
    strIDCT4x4Stage1_OPT_H1(p0 + 112);

    _mm_prefetch((char*)(p0 + 32 + 256), _MM_HINT_T0);
    strIDCT4x4Stage1_OPT_H1(p1 - 128);
    strIDCT4x4Stage1_OPT_H1(p1 - 112);
    strIDCT4x4Stage1_OPT_H1(p1 - 64);
    strIDCT4x4Stage1_OPT_H1(p1 - 48);

    _mm_prefetch((char*)(p0 + 96 + 256), _MM_HINT_T0);
    strIDCT4x4Stage1_OPT_H1(p1 + 0);
    strIDCT4x4Stage1_OPT_H1(p1 + 16);
    strIDCT4x4Stage1_OPT_H1(p1 + 64);
    strIDCT4x4Stage1_OPT_H1(p1 + 80);

    // SIMD butterfly phase — must run after every _H1 above has completed
    // on its block.
    strIDCT4x4Stage1_OPT_H2(p0 - 96);
    strIDCT4x4Stage1_OPT_H2(p0 - 80);
    strIDCT4x4Stage1_OPT_H2(p0 - 32);
    strIDCT4x4Stage1_OPT_H2(p0 - 16);
    strIDCT4x4Stage1_OPT_H2(p0 + 32);
    strIDCT4x4Stage1_OPT_H2(p0 + 48);
    strIDCT4x4Stage1_OPT_H2(p0 + 96);
    strIDCT4x4Stage1_OPT_H2(p0 + 112);

    strIDCT4x4Stage1_OPT_H2(p1 - 128);
    strIDCT4x4Stage1_OPT_H2(p1 - 112);
    strIDCT4x4Stage1_OPT_H2(p1 - 64);
    strIDCT4x4Stage1_OPT_H2(p1 - 48);
    strIDCT4x4Stage1_OPT_H2(p1 + 0);
    strIDCT4x4Stage1_OPT_H2(p1 + 16);
    strIDCT4x4Stage1_OPT_H2(p1 + 64);
    strIDCT4x4Stage1_OPT_H2(p1 + 80);
}
1278
1279 //================================
// First-level inverse overlap post-filter (alternate variant), hand-written
// 32-bit MSVC inline asm. Naked + __stdcall: no compiler prologue; args are at
// [esp+20]/[esp+24] after the four register pushes, and `ret 4*2` pops them.
//
// Structure:
//   1. Build a table of 16 (p0-like, p1-like) pointer pairs on the stack by
//      repeatedly offsetting the two incoming pointers by 4*16 (ecx) and
//      4*48 (ebx) bytes, then push a negative loop counter ebp = -(8*16).
//   2. Loop0 processes one pointer pair per iteration: a butterfly in SSE2,
//      a scalar -pi/8 => -pi/8 corner rotation in GP registers (eax..edx),
//      the anti-diagonal pi/8 rotations in SSE2, a closing butterfly, then
//      stores back. Counter ebp steps by 8 (one pair) until it hits zero.
//   3. Pop the table (4+4)*16+4 bytes, restore callee-saved regs, return.
__declspec(naked) void __stdcall strPost4x4Stage1_alternate_ASM5(PixelI* p0, PixelI* p1)
{
    UNREFERENCED_PARAMETER( p0 );
    UNREFERENCED_PARAMETER( p1 );
    __asm {
        push ebp
        push ebx
        push esi
        push edi

        //================
        // build the pointer-pair table on the stack
        mov eax, [esp + 20] // eax = p0
        mov edx, [esp + 24] // edx = p1
        mov ecx, 4 * 16     // ecx = small step (16 PixelI)
        mov ebx, 4 * 48     // ebx = large step (48 PixelI)

        prefetcht0 [eax + 512]
        prefetcht0 [eax + 768]
        prefetcht0 [eax + 1024]
        prefetcht0 [eax + 1280]

        add edx, ecx
        add eax, ebx

        push edx
        sub edx, ecx
        push edx
        push edx
        sub edx, ebx
        push eax
        push eax
        sub eax, ecx
        push eax
        push eax
        sub eax, ecx
        push eax
        sub eax, ecx

        push edx
        sub edx, ecx
        push edx
        sub eax, ecx
        push edx
        sub edx, ebx
        push eax
        push eax
        sub eax, ecx
        push eax
        push eax
        sub eax, ecx
        push eax
        sub eax, ecx

        push edx
        sub edx, ecx
        push edx
        sub eax, ecx
        push edx
        sub edx, ebx
        push eax
        push eax
        sub eax, ecx
        push eax
        push eax
        sub eax, ecx
        push eax
        sub eax, ecx

        push edx
        sub edx, ecx
        push edx
        sub eax, ecx
        push edx
        push eax
        push eax
        sub eax, ecx
        push eax
        push eax
        sub eax, ecx
        push eax

        // loop counter: -(number of table bytes); reaches 0 after 16 pairs
        mov ebp, (4 + 4) * -16
        push ebp
    }
    Loop0:
    __asm {
        // fetch the current (p0, p1) pair from the table; [esp] holds the counter
        mov esi, [esp + (4 + 4) * 16 + 4 + ebp ]    // esi = current p0
        mov edi, [esp + (4 + 4) * 16 + 4 + ebp + 4] // edi = current p1

        //================
        movdqa xmm2, [esi + 4 * 12] // a = xmm2
        movdqa xmm1, [esi + 4 * 72] // b = xmm1
        movdqa xmm6, [edi + 4 * 4]  // c = xmm6
        movdqa xmm7, [edi + 4 * 64] // d = xmm7

        //================
        // butterfly
        paddd xmm2, xmm7
        psubd xmm1, xmm6

        movdqa xmm0, xmm2 // a = xmm0
        psubd xmm2, xmm1
        psrad xmm2, 1
        movdqa xmm3, xmm2

        psubd xmm2, xmm7 // c = xmm2
        psubd xmm3, xmm6 // d = xmm3
        paddd xmm1, xmm2
        psubd xmm0, xmm3

        //================
        // bottom right corner: -pi/8 rotation => -pi/8 rotation
        // (scalar, in GP regs; ebp is temporarily reused as scratch)
        pshufd xmm7, xmm3, 0x3
        movd eax, xmm3
        movd edx, xmm7
        pshufd xmm7, xmm3, 0x1
        movd ebx, xmm7
        pshufd xmm7, xmm3, 0x2
        movd ecx, xmm7

        add edx, eax
        sub ecx, ebx
        mov esi, edx
        sar esi, 1
        mov edi, ecx
        sar edi, 1
        sub eax, esi
        add ebx, edi

        lea ebp, [ebx + ebx * 2 + 6] // (b*3 + 6) >> 3 lifting step
        sar ebp, 3
        sub eax, ebp
        lea ebp, [eax + eax * 2 + 2] // (a*3 + 2) >> 2 lifting step
        sar ebp, 2
        add ebx, ebp
        lea ebp, [ebx + ebx * 2 + 4] // (b*3 + 4) >> 3 lifting step
        sar ebp, 3
        sub eax, ebp

        mov ebp, [esp] // restore loop counter into ebp

        sub ebx, edi
        add eax, esi
        add ecx, ebx
        sub edx, eax

        // reload pointer pair (esi/edi were clobbered as scalar temps)
        mov esi, [esp + (4 + 4) * 16 + 4 + ebp ]    // esi = current p0
        mov edi, [esp + (4 + 4) * 16 + 4 + ebp + 4] // edi = current p1

        movd xmm3, eax
        movd xmm4, ebx
        movd xmm5, ecx
        movd xmm6, edx
        punpckldq xmm3, xmm4
        punpckldq xmm5, xmm6
        punpcklqdq xmm3, xmm5

        //================
        // anti diagonal corners: rotation by -pi/8
        movdqa xmm5, g_const_d1
        movdqa xmm6, g_const_d1

        pshufd xmm2, xmm2, 0xd8 // 7, 5, 6, 4
        movdqa xmm4, xmm1       // 75, 74, 73, 72
        punpckhqdq xmm1, xmm2   // 7, 5, 75, 74
        punpcklqdq xmm4, xmm2   // 6, 4, 73, 72

        paddd xmm5, xmm1
        psrad xmm5, 1
        psubd xmm4, xmm5

        paddd xmm6, xmm4
        psrad xmm6, 1
        paddd xmm1, xmm6

        movdqa xmm2, xmm4     // 6, 4, 73, 72
        punpckhqdq xmm4, xmm1 // 7, 5, 6, 4
        punpcklqdq xmm2, xmm1 // 75, 74, 73, 72
        pshufd xmm4, xmm4, 0xd8 // 7, 6, 5, 4

        //================
        // butterfly
        // a = xmm0, b = xmm2, c = xmm4, d = xmm3
        paddd xmm0, xmm3
        movdqa xmm1, xmm0 // a = xmm1
        psrad xmm0, 1
        psubd xmm0, xmm3 // d = xmm0

        movdqa xmm3, xmm0 // d = xmm3
        paddd xmm0, xmm0
        paddd xmm0, xmm3
        psrad xmm0, 3 // (d*3) >> 3
        paddd xmm1, xmm0

        movdqa xmm0, xmm1 // a = xmm0
        paddd xmm1, xmm1
        paddd xmm1, xmm0
        psrad xmm1, 4 // (a*3) >> 4
        paddd xmm3, xmm1

        movdqa xmm5, xmm0 // a
        psrad xmm5, 7
        paddd xmm3, xmm5 // d += (a >> 7)
        psrad xmm5, 3
        psubd xmm3, xmm5 // d -= (a >> 10)

        movdqa xmm5, [g_const_d4]
        movdqa xmm1, xmm3 // d = xmm1
        psubd xmm2, xmm4
        paddd xmm5, xmm3
        paddd xmm3, xmm3
        paddd xmm3, xmm5
        psrad xmm3, 3 // (d*3 + 4) >> 3
        paddd xmm0, xmm3

        movdqa xmm3, xmm2 // b = xmm3
        psrad xmm2, 1
        psubd xmm1, xmm2

        movdqa xmm2, xmm0 // a = xmm2
        psubd xmm0, xmm3
        psrad xmm0, 1
        psubd xmm0, xmm4 // c = xmm0

        paddd xmm3, xmm1
        psubd xmm2, xmm0

        //================
        // store results back through the current pointer pair
        movdqa [edi + 4 * 4], xmm1
        movdqa [edi + 4 * 64], xmm0
        movdqa [esi + 4 * 12], xmm2
        movdqa [esi + 4 * 72], xmm3

        // advance to next pointer pair; counter hits zero after the last pair
        add ebp, 8
        mov [esp], ebp
        jnz Loop0

        //================
        // drop pointer table + counter, restore regs, pop the two args
        add esp, (4 + 4) * 16 + 4
        pop edi
        pop esi
        pop ebx
        pop ebp
        ret 4 * 2
    }
}
1527
invTransformMacroblock_YUV444_Center5(CWMImageStrCodec * pSC)1528 Int invTransformMacroblock_YUV444_Center5(CWMImageStrCodec * pSC)
1529 {
1530 const OVERLAP olOverlap = pSC->WMISCP.olOverlap;
1531 int i = 0;
1532
1533 assert(0 < pSC->cRow && pSC->cRow < pSC->cmbHeight);
1534 assert(0 < pSC->cColumn && pSC->cColumn < pSC->cmbWidth);
1535
1536 assert(0 == pSC->WMII.cPostProcStrength);
1537
1538 assert(YUV_444 == pSC->m_param.cfColorFormat);
1539 assert(3 == pSC->m_param.cNumChannels);
1540
1541 assert(pSC->m_Dparam->bDecodeFullWidth);
1542 assert(1 == pSC->m_Dparam->cThumbnailScale);
1543
1544 for (i = 0; i < 3; ++i)
1545 {
1546 PixelI* const p0 = pSC->p0MBbuffer[i];
1547 PixelI* const p1 = pSC->p1MBbuffer[i];
1548
1549 //================================
1550 // second level inverse transform
1551 strIDCT4x4Stage2(p1);
1552 if (pSC->m_param.bScaledArith) {
1553 strNormalizeDec(p1, (i != 0));
1554 }
1555
1556 //================================
1557 // second level inverse overlap
1558 if (OL_TWO <= olOverlap)
1559 {
1560 strPost4x4Stage2Split_alternate(p0, p1);
1561 }
1562
1563 //================================
1564 // first level inverse transform
1565 strIDCT4x4Stage1_OPT5(p0, p1);
1566
1567 //================================
1568 // first level inverse overlap
1569 if (OL_ONE <= olOverlap)
1570 {
1571 strPost4x4Stage1_alternate_ASM5(p0, p1);
1572 }
1573 }
1574
1575 return ICERR_OK;
1576 }
1577 #endif
1578 #endif
1579
1580 //================================================================
// Install SSE2-optimized decoder hooks on capable CPUs.
//
// Initializes the module-level SSE2 constants, then — when the stream's
// format matches the constraints each fast path was written for — swaps in
// the optimized color-conversion (pSC->Load) and center-macroblock inverse
// transform (pSC->TransformCenter) routines. On non-SSE2 builds this is a
// no-op.
void StrDecOpt(CWMImageStrCodec* pSC)
{
#if defined(WMP_OPT_SSE2)
    if (IsProcessorFeaturePresent(PF_XMMI64_INSTRUCTIONS_AVAILABLE))
    {
        CWMImageInfo* pII = &pSC->WMII;
        // CWMIStrCodecParam* pSCP = &pSC->WMISCP;

        // broadcast constants used by the SSE2 kernels
        // (use _mm_set1_* uniformly; identical bit patterns to the previous
        // per-lane _mm_set_epi32/16/8 calls)
        g_const_d0 = _mm_setzero_si128();
        g_const_d3 = _mm_set1_epi32(3);
        g_const_d1 = _mm_set1_epi32(1);
        g_const_d4 = _mm_set1_epi32(4);

        g_const_d0x80 = _mm_set1_epi32(0x80);
        g_const_w0x80 = _mm_set1_epi16(0x80);
        g_const_b0x80 = _mm_set1_epi8((char)0x80);

        // fast RGB24 output path: padded 8-bit RGB, no rotation, YUV 4:4:4
        // with equally spaced channel buffers, full-frame decode
        if (pSC->WMII.fPaddedUserBuffer &&
            //pSC->m_Dparam->bDecodeFullFrame &&
            //((pII->cWidth & 0xf) == 0) &&
            //(((int) pSC->WMIBI.pv & 0xf) == 0) &&
            BD_8 == pII->bdBitDepth &&
            CF_RGB == pII->cfColorFormat &&
            24 == pII->cBitsPerUnit &&
            pII->bRGB &&
            O_NONE == pII->oOrientation &&
            YUV_444 == pSC->m_param.cfColorFormat &&
            pSC->p1MBbuffer[1] - pSC->p1MBbuffer[0] == pSC->p1MBbuffer[2] - pSC->p1MBbuffer[1] &&
            pSC->m_Dparam->bDecodeFullFrame &&
            1)
        {
#if defined(WMP_OPT_CC_DEC)
            if (pSC->m_param.bScaledArith || pSC->WMISCP.olOverlap != OL_NONE)
            {
                pSC->Load = outputMBRow_RGB24_Lossy_3;
            }
            else
            {
                pSC->Load = outputMBRow_RGB24_Lossless_1;
            }
#endif // WMP_OPT_CC_DEC
        }

        // fast center-macroblock inverse transform: YUV 4:4:4, equally spaced
        // channel buffers, full-width decode at full scale, current subversion
        if (YUV_444 == pSC->m_param.cfColorFormat &&
            pSC->p1MBbuffer[1] - pSC->p1MBbuffer[0] == pSC->p1MBbuffer[2] - pSC->p1MBbuffer[1] &&
            pSC->m_Dparam->bDecodeFullWidth &&
            pSC->m_param.cSubVersion == CODEC_SUBVERSION_NEWSCALING_SOFT_TILES &&
            1 == pSC->m_Dparam->cThumbnailScale)
        {
#if defined(WMP_OPT_TRFM_DEC)
            pSC->TransformCenter = invTransformMacroblock_YUV444_Center5;
#endif
        }

    }
#else
    UNREFERENCED_PARAMETER( pSC );
#endif
}
1640
1641