1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2 * Mupen64plus-rsp-hle - jpeg.c *
3 * Mupen64Plus homepage: http://code.google.com/p/mupen64plus/ *
4 * Copyright (C) 2012 Bobby Smiles *
5 * Copyright (C) 2009 Richard Goedeken *
6 * Copyright (C) 2002 Hacktarux *
7 * *
8 * This program is free software; you can redistribute it and/or modify *
9 * it under the terms of the GNU General Public License as published by *
10 * the Free Software Foundation; either version 2 of the License, or *
11 * (at your option) any later version. *
12 * *
13 * This program is distributed in the hope that it will be useful, *
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
16 * GNU General Public License for more details. *
17 * *
18 * You should have received a copy of the GNU General Public License *
19 * along with this program; if not, write to the *
20 * Free Software Foundation, Inc., *
21 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
22 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
23
24 #include <assert.h>
25 #include <stdint.h>
26 #include <stdlib.h>
27
28 #include "arithmetics.h"
29 #include "hle_external.h"
30 #include "hle_internal.h"
31 #include "memory.h"
32
/* number of 16-bit values in one subblock (handled as 8 rows of 8 values) */
#define SUBBLOCK_SIZE 64

/* callback writing one line of a tile to DRAM at `address`, from a pointer to
 * luma samples `y` and a pointer to chroma samples `u` */
typedef void (*tile_line_emitter_t)(struct hle_t* hle, const int16_t *y, const int16_t *u, uint32_t address);

/* callback applied to a whole subblock (SUBBLOCK_SIZE values) from src to dst */
typedef void (*subblock_transform_t)(int16_t *dst, const int16_t *src);
37
38 /* standard jpeg ucode decoder */
39 static void jpeg_decode_std(struct hle_t* hle,
40 const char *const version,
41 const subblock_transform_t transform_luma,
42 const subblock_transform_t transform_chroma,
43 const tile_line_emitter_t emit_line);
44
45 /* helper functions */
46 static uint8_t clamp_u8(int16_t x);
47 static int16_t clamp_s12(int16_t x);
48 static uint16_t clamp_RGBA_component(int16_t x);
49
50 /* pixel conversion & formatting */
51 static uint32_t GetUYVY(int16_t y1, int16_t y2, int16_t u, int16_t v);
52 static uint16_t GetRGBA(int16_t y, int16_t u, int16_t v);
53
54 /* tile line emitters */
55 static void EmitYUVTileLine(struct hle_t* hle, const int16_t *y, const int16_t *u, uint32_t address);
56 static void EmitRGBATileLine(struct hle_t* hle, const int16_t *y, const int16_t *u, uint32_t address);
57
58 /* macroblocks operations */
59 static void decode_macroblock_ob(int16_t *macroblock, int32_t *y_dc, int32_t *u_dc, int32_t *v_dc, const int16_t *qtable);
60 static void decode_macroblock_std(const subblock_transform_t transform_luma,
61 const subblock_transform_t transform_chroma,
62 int16_t *macroblock,
63 unsigned int subblock_count,
64 const int16_t qtables[3][SUBBLOCK_SIZE]);
65 static void EmitTilesMode0(struct hle_t* hle, const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address);
66 static void EmitTilesMode2(struct hle_t* hle, const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address);
67
68 /* subblocks operations */
69 static void TransposeSubBlock(int16_t *dst, const int16_t *src);
70 static void ZigZagSubBlock(int16_t *dst, const int16_t *src);
71 static void ReorderSubBlock(int16_t *dst, const int16_t *src, const unsigned int *table);
72 static void MultSubBlocks(int16_t *dst, const int16_t *src1, const int16_t *src2, unsigned int shift);
73 static void ScaleSubBlock(int16_t *dst, const int16_t *src, int16_t scale);
74 static void RShiftSubBlock(int16_t *dst, const int16_t *src, unsigned int shift);
75 static void InverseDCT1D(const float *const x, float *dst, unsigned int stride);
76 static void InverseDCTSubBlock(int16_t *dst, const int16_t *src);
77 static void RescaleYSubBlock(int16_t *dst, const int16_t *src);
78 static void RescaleUVSubBlock(int16_t *dst, const int16_t *src);
79
80 /* transposed dequantization table */
81 static const int16_t DEFAULT_QTABLE[SUBBLOCK_SIZE] = {
82 16, 12, 14, 14, 18, 24, 49, 72,
83 11, 12, 13, 17, 22, 35, 64, 92,
84 10, 14, 16, 22, 37, 55, 78, 95,
85 16, 19, 24, 29, 56, 64, 87, 98,
86 24, 26, 40, 51, 68, 81, 103, 112,
87 40, 58, 57, 87, 109, 104, 121, 100,
88 51, 60, 69, 80, 103, 113, 120, 103,
89 61, 55, 56, 62, 77, 92, 101, 99
90 };
91
92 /* zig-zag indices */
93 static const unsigned int ZIGZAG_TABLE[SUBBLOCK_SIZE] = {
94 0, 1, 5, 6, 14, 15, 27, 28,
95 2, 4, 7, 13, 16, 26, 29, 42,
96 3, 8, 12, 17, 25, 30, 41, 43,
97 9, 11, 18, 24, 31, 40, 44, 53,
98 10, 19, 23, 32, 39, 45, 52, 54,
99 20, 22, 33, 38, 46, 51, 55, 60,
100 21, 34, 37, 47, 50, 56, 59, 61,
101 35, 36, 48, 49, 57, 58, 62, 63
102 };
103
104 /* transposition indices */
105 static const unsigned int TRANSPOSE_TABLE[SUBBLOCK_SIZE] = {
106 0, 8, 16, 24, 32, 40, 48, 56,
107 1, 9, 17, 25, 33, 41, 49, 57,
108 2, 10, 18, 26, 34, 42, 50, 58,
109 3, 11, 19, 27, 35, 43, 51, 59,
110 4, 12, 20, 28, 36, 44, 52, 60,
111 5, 13, 21, 29, 37, 45, 53, 61,
112 6, 14, 22, 30, 38, 46, 54, 62,
113 7, 15, 23, 31, 39, 47, 55, 63
114 };
115
116
117
/* IDCT related constants.
 * Cn = alpha * cos(n * PI / 16), where alpha is chosen such that C4 = 1.
 * IDCT_K holds the pre-combined factors used by InverseDCT1D. */
static const float IDCT_C3 = 1.175875602f;
static const float IDCT_C6 = 0.541196100f;
static const float IDCT_K[10] = {
     0.765366865f,  /* [0]  C2 - C6            */
    -1.847759065f,  /* [1] -C2 - C6            */
    -0.390180644f,  /* [2]  C5 - C3            */
    -1.961570561f,  /* [3] -C5 - C3            */
     1.501321110f,  /* [4]  C1 + C3 - C5 - C7  */
     2.053119869f,  /* [5]  C1 + C3 - C5 + C7  */
     3.072711027f,  /* [6]  C1 + C3 + C5 - C7  */
     0.298631336f,  /* [7] -C1 + C3 + C5 - C7  */
    -0.899976223f,  /* [8]  C7 - C3            */
    -2.562915448f   /* [9] -C1 - C3            */
};
134
135
136 /* global functions */
137
138 /***************************************************************************
139 * JPEG decoding ucode found in Japanese exclusive version of Pokemon Stadium.
140 **************************************************************************/
jpeg_decode_PS0(struct hle_t * hle)141 void jpeg_decode_PS0(struct hle_t* hle)
142 {
143 jpeg_decode_std(hle, "PS0", RescaleYSubBlock, RescaleUVSubBlock, EmitYUVTileLine);
144 }
145
146 /***************************************************************************
147 * JPEG decoding ucode found in Ocarina of Time, Pokemon Stadium 1 and
148 * Pokemon Stadium 2.
149 **************************************************************************/
jpeg_decode_PS(struct hle_t * hle)150 void jpeg_decode_PS(struct hle_t* hle)
151 {
152 jpeg_decode_std(hle, "PS", NULL, NULL, EmitRGBATileLine);
153 }
154
155 /***************************************************************************
156 * JPEG decoding ucode found in Ogre Battle and Bottom of the 9th.
157 **************************************************************************/
jpeg_decode_OB(struct hle_t * hle)158 void jpeg_decode_OB(struct hle_t* hle)
159 {
160 int16_t qtable[SUBBLOCK_SIZE];
161 unsigned int mb;
162
163 int32_t y_dc = 0;
164 int32_t u_dc = 0;
165 int32_t v_dc = 0;
166
167 uint32_t address = *dmem_u32(hle, TASK_DATA_PTR);
168 const unsigned int macroblock_count = *dmem_u32(hle, TASK_DATA_SIZE);
169 const int qscale = *dmem_u32(hle, TASK_YIELD_DATA_SIZE);
170
171 HleVerboseMessage(hle->user_defined,
172 "jpeg_decode_OB: *buffer=%x, #MB=%d, qscale=%d",
173 address,
174 macroblock_count,
175 qscale);
176
177 if (qscale != 0) {
178 if (qscale > 0)
179 ScaleSubBlock(qtable, DEFAULT_QTABLE, qscale);
180 else
181 RShiftSubBlock(qtable, DEFAULT_QTABLE, -qscale);
182 }
183
184 for (mb = 0; mb < macroblock_count; ++mb) {
185 int16_t macroblock[6 * SUBBLOCK_SIZE];
186
187 dram_load_u16(hle, (uint16_t *)macroblock, address, 6 * SUBBLOCK_SIZE);
188 decode_macroblock_ob(macroblock, &y_dc, &u_dc, &v_dc, (qscale != 0) ? qtable : NULL);
189 EmitTilesMode2(hle, EmitYUVTileLine, macroblock, address);
190
191 address += (2 * 6 * SUBBLOCK_SIZE);
192 }
193 }
194
195
196 /* local functions */
jpeg_decode_std(struct hle_t * hle,const char * const version,const subblock_transform_t transform_luma,const subblock_transform_t transform_chroma,const tile_line_emitter_t emit_line)197 static void jpeg_decode_std(struct hle_t* hle,
198 const char *const version,
199 const subblock_transform_t transform_luma,
200 const subblock_transform_t transform_chroma,
201 const tile_line_emitter_t emit_line)
202 {
203 int16_t qtables[3][SUBBLOCK_SIZE];
204 unsigned int mb;
205 uint32_t address;
206 uint32_t macroblock_count;
207 uint32_t mode;
208 uint32_t qtableY_ptr;
209 uint32_t qtableU_ptr;
210 uint32_t qtableV_ptr;
211 unsigned int subblock_count;
212 unsigned int macroblock_size;
213 /* macroblock contains at most 6 subblocks */
214 int16_t macroblock[6 * SUBBLOCK_SIZE];
215 uint32_t data_ptr;
216
217 if (*dmem_u32(hle, TASK_FLAGS) & 0x1) {
218 HleWarnMessage(hle->user_defined,
219 "jpeg_decode_%s: task yielding not implemented", version);
220 return;
221 }
222
223 data_ptr = *dmem_u32(hle, TASK_DATA_PTR);
224 address = *dram_u32(hle, data_ptr);
225 macroblock_count = *dram_u32(hle, data_ptr + 4);
226 mode = *dram_u32(hle, data_ptr + 8);
227 qtableY_ptr = *dram_u32(hle, data_ptr + 12);
228 qtableU_ptr = *dram_u32(hle, data_ptr + 16);
229 qtableV_ptr = *dram_u32(hle, data_ptr + 20);
230
231 HleVerboseMessage(hle->user_defined,
232 "jpeg_decode_%s: *buffer=%x, #MB=%d, mode=%d, *Qy=%x, *Qu=%x, *Qv=%x",
233 version,
234 address,
235 macroblock_count,
236 mode,
237 qtableY_ptr,
238 qtableU_ptr,
239 qtableV_ptr);
240
241 if (mode != 0 && mode != 2) {
242 HleWarnMessage(hle->user_defined,
243 "jpeg_decode_%s: invalid mode %d", version, mode);
244 return;
245 }
246
247 subblock_count = mode + 4;
248 macroblock_size = subblock_count * SUBBLOCK_SIZE;
249
250 dram_load_u16(hle, (uint16_t *)qtables[0], qtableY_ptr, SUBBLOCK_SIZE);
251 dram_load_u16(hle, (uint16_t *)qtables[1], qtableU_ptr, SUBBLOCK_SIZE);
252 dram_load_u16(hle, (uint16_t *)qtables[2], qtableV_ptr, SUBBLOCK_SIZE);
253
254 for (mb = 0; mb < macroblock_count; ++mb) {
255 dram_load_u16(hle, (uint16_t *)macroblock, address, macroblock_size);
256 decode_macroblock_std(transform_luma, transform_chroma,
257 macroblock, subblock_count, (const int16_t (*)[SUBBLOCK_SIZE])qtables);
258
259 if (mode == 0)
260 EmitTilesMode0(hle, emit_line, macroblock, address);
261 else
262 EmitTilesMode2(hle, emit_line, macroblock, address);
263
264 address += 2 * macroblock_size;
265 }
266 }
267
/* Saturate a signed 16-bit value to the unsigned 8-bit range [0, 255].
 *
 * Written with explicit comparisons: the previous bit-trick returned 1 for
 * x == -32768 and depended on how the compiler right-shifts negative values.
 */
static uint8_t clamp_u8(int16_t x)
{
    if (x < 0)
        return 0;
    if (x > 0xff)
        return 0xff;
    return (uint8_t)x;
}
272
/* Saturate x to the interval [-0x800, 0x7f0].
 * Note that the upper bound is 0x7f0, not 0x7ff. */
static int16_t clamp_s12(int16_t x)
{
    const int16_t lower = -0x800;
    const int16_t upper = 0x7f0;

    return (x < lower) ? lower
         : (x > upper) ? upper
         : x;
}
281
/* Saturate x to [0, 0xff0] and keep only bits 7..11 of the result
 * (the five bits used by GetRGBA for one colour component). */
static uint16_t clamp_RGBA_component(int16_t x)
{
    int value = x;

    if (value < 0)
        value = 0;
    else if (value > 0xff0)
        value = 0xff0;

    return (uint16_t)(value & 0xf80);
}
290
/* Pack two luma samples and one chroma pair into a 32-bit word.
 * Each input is saturated to 8 bits; layout from most to least significant
 * byte is U, Y1, V, Y2. */
static uint32_t GetUYVY(int16_t y1, int16_t y2, int16_t u, int16_t v)
{
    const uint32_t byte_u  = clamp_u8(u);
    const uint32_t byte_y1 = clamp_u8(y1);
    const uint32_t byte_v  = clamp_u8(v);
    const uint32_t byte_y2 = clamp_u8(y2);

    return (byte_u << 24) | (byte_y1 << 16) | (byte_v << 8) | byte_y2;
}
298
/* Convert one YUV sample to a 16-bit RGBA pixel.
 *
 * y is biased by 2048 before conversion. Each colour component is saturated
 * by clamp_RGBA_component and reduced to 5 bits; the result is laid out as
 * R in bits 15..11, G in bits 10..6, B in bits 5..1, and bit 0 always set.
 *
 * The colour sums are saturated to the int16_t range before being narrowed:
 * converting an out-of-range floating-point value to an integer type is
 * undefined behaviour in C. Results for in-range sums are unchanged.
 */
static uint16_t GetRGBA(int16_t y, int16_t u, int16_t v)
{
    const float fY = (float)y + 2048.0f;
    const float fU = (float)u;
    const float fV = (float)v;

    double   sum[3];
    uint16_t component[3];
    unsigned int i;

    sum[0] = fY + 1.4025 * fV;                /* red   */
    sum[1] = fY - 0.3443 * fU - 0.7144 * fV;  /* green */
    sum[2] = fY + 1.7729 * fU;                /* blue  */

    for (i = 0; i < 3; ++i) {
        double s = sum[i];

        if (s > 32767.0)
            s = 32767.0;
        else if (s < -32768.0)
            s = -32768.0;

        component[i] = clamp_RGBA_component((int16_t)s);
    }

    return (component[0] << 4) | (component[1] >> 1) | (component[2] >> 6) | 1;
}
311
EmitYUVTileLine(struct hle_t * hle,const int16_t * y,const int16_t * u,uint32_t address)312 static void EmitYUVTileLine(struct hle_t* hle, const int16_t *y, const int16_t *u, uint32_t address)
313 {
314 uint32_t uyvy[8];
315
316 const int16_t *const v = u + SUBBLOCK_SIZE;
317 const int16_t *const y2 = y + SUBBLOCK_SIZE;
318
319 uyvy[0] = GetUYVY(y[0], y[1], u[0], v[0]);
320 uyvy[1] = GetUYVY(y[2], y[3], u[1], v[1]);
321 uyvy[2] = GetUYVY(y[4], y[5], u[2], v[2]);
322 uyvy[3] = GetUYVY(y[6], y[7], u[3], v[3]);
323 uyvy[4] = GetUYVY(y2[0], y2[1], u[4], v[4]);
324 uyvy[5] = GetUYVY(y2[2], y2[3], u[5], v[5]);
325 uyvy[6] = GetUYVY(y2[4], y2[5], u[6], v[6]);
326 uyvy[7] = GetUYVY(y2[6], y2[7], u[7], v[7]);
327
328 dram_store_u32(hle, uyvy, address, 8);
329 }
330
EmitRGBATileLine(struct hle_t * hle,const int16_t * y,const int16_t * u,uint32_t address)331 static void EmitRGBATileLine(struct hle_t* hle, const int16_t *y, const int16_t *u, uint32_t address)
332 {
333 uint16_t rgba[16];
334
335 const int16_t *const v = u + SUBBLOCK_SIZE;
336 const int16_t *const y2 = y + SUBBLOCK_SIZE;
337
338 rgba[0] = GetRGBA(y[0], u[0], v[0]);
339 rgba[1] = GetRGBA(y[1], u[0], v[0]);
340 rgba[2] = GetRGBA(y[2], u[1], v[1]);
341 rgba[3] = GetRGBA(y[3], u[1], v[1]);
342 rgba[4] = GetRGBA(y[4], u[2], v[2]);
343 rgba[5] = GetRGBA(y[5], u[2], v[2]);
344 rgba[6] = GetRGBA(y[6], u[3], v[3]);
345 rgba[7] = GetRGBA(y[7], u[3], v[3]);
346 rgba[8] = GetRGBA(y2[0], u[4], v[4]);
347 rgba[9] = GetRGBA(y2[1], u[4], v[4]);
348 rgba[10] = GetRGBA(y2[2], u[5], v[5]);
349 rgba[11] = GetRGBA(y2[3], u[5], v[5]);
350 rgba[12] = GetRGBA(y2[4], u[6], v[6]);
351 rgba[13] = GetRGBA(y2[5], u[6], v[6]);
352 rgba[14] = GetRGBA(y2[6], u[7], v[7]);
353 rgba[15] = GetRGBA(y2[7], u[7], v[7]);
354
355 dram_store_u16(hle, rgba, address, 16);
356 }
357
EmitTilesMode0(struct hle_t * hle,const tile_line_emitter_t emit_line,const int16_t * macroblock,uint32_t address)358 static void EmitTilesMode0(struct hle_t* hle, const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address)
359 {
360 unsigned int i;
361
362 unsigned int y_offset = 0;
363 unsigned int u_offset = 2 * SUBBLOCK_SIZE;
364
365 for (i = 0; i < 8; ++i) {
366 emit_line(hle, ¯oblock[y_offset], ¯oblock[u_offset], address);
367
368 y_offset += 8;
369 u_offset += 8;
370 address += 32;
371 }
372 }
373
EmitTilesMode2(struct hle_t * hle,const tile_line_emitter_t emit_line,const int16_t * macroblock,uint32_t address)374 static void EmitTilesMode2(struct hle_t* hle, const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address)
375 {
376 unsigned int i;
377
378 unsigned int y_offset = 0;
379 unsigned int u_offset = 4 * SUBBLOCK_SIZE;
380
381 for (i = 0; i < 8; ++i) {
382 emit_line(hle, ¯oblock[y_offset], ¯oblock[u_offset], address);
383 emit_line(hle, ¯oblock[y_offset + 8], ¯oblock[u_offset], address + 32);
384
385 y_offset += (i == 3) ? SUBBLOCK_SIZE + 16 : 16;
386 u_offset += 8;
387 address += 64;
388 }
389 }
390
decode_macroblock_ob(int16_t * macroblock,int32_t * y_dc,int32_t * u_dc,int32_t * v_dc,const int16_t * qtable)391 static void decode_macroblock_ob(int16_t *macroblock, int32_t *y_dc, int32_t *u_dc, int32_t *v_dc, const int16_t *qtable)
392 {
393 int sb;
394
395 for (sb = 0; sb < 6; ++sb) {
396 int16_t tmp_sb[SUBBLOCK_SIZE];
397
398 /* update DC */
399 int32_t dc = (int32_t)macroblock[0];
400 switch (sb) {
401 case 0:
402 case 1:
403 case 2:
404 case 3:
405 *y_dc += dc;
406 macroblock[0] = *y_dc & 0xffff;
407 break;
408 case 4:
409 *u_dc += dc;
410 macroblock[0] = *u_dc & 0xffff;
411 break;
412 case 5:
413 *v_dc += dc;
414 macroblock[0] = *v_dc & 0xffff;
415 break;
416 }
417
418 ZigZagSubBlock(tmp_sb, macroblock);
419 if (qtable != NULL)
420 MultSubBlocks(tmp_sb, tmp_sb, qtable, 0);
421 TransposeSubBlock(macroblock, tmp_sb);
422 InverseDCTSubBlock(macroblock, macroblock);
423
424 macroblock += SUBBLOCK_SIZE;
425 }
426 }
427
decode_macroblock_std(const subblock_transform_t transform_luma,const subblock_transform_t transform_chroma,int16_t * macroblock,unsigned int subblock_count,const int16_t qtables[3][SUBBLOCK_SIZE])428 static void decode_macroblock_std(const subblock_transform_t transform_luma,
429 const subblock_transform_t transform_chroma,
430 int16_t *macroblock,
431 unsigned int subblock_count,
432 const int16_t qtables[3][SUBBLOCK_SIZE])
433 {
434 unsigned int sb;
435 unsigned int q = 0;
436
437 for (sb = 0; sb < subblock_count; ++sb) {
438 int16_t tmp_sb[SUBBLOCK_SIZE];
439 const int isChromaSubBlock = (subblock_count - sb <= 2);
440
441 if (isChromaSubBlock)
442 ++q;
443
444 MultSubBlocks(macroblock, macroblock, qtables[q], 4);
445 ZigZagSubBlock(tmp_sb, macroblock);
446 InverseDCTSubBlock(macroblock, tmp_sb);
447
448 if (isChromaSubBlock) {
449 if (transform_chroma != NULL)
450 transform_chroma(macroblock, macroblock);
451 } else {
452 if (transform_luma != NULL)
453 transform_luma(macroblock, macroblock);
454 }
455
456 macroblock += SUBBLOCK_SIZE;
457 }
458 }
459
TransposeSubBlock(int16_t * dst,const int16_t * src)460 static void TransposeSubBlock(int16_t *dst, const int16_t *src)
461 {
462 ReorderSubBlock(dst, src, TRANSPOSE_TABLE);
463 }
464
ZigZagSubBlock(int16_t * dst,const int16_t * src)465 static void ZigZagSubBlock(int16_t *dst, const int16_t *src)
466 {
467 ReorderSubBlock(dst, src, ZIGZAG_TABLE);
468 }
469
ReorderSubBlock(int16_t * dst,const int16_t * src,const unsigned int * table)470 static void ReorderSubBlock(int16_t *dst, const int16_t *src, const unsigned int *table)
471 {
472 unsigned int i;
473
474 /* source and destination sublocks cannot overlap */
475 assert(abs(dst - src) > SUBBLOCK_SIZE);
476
477 for (i = 0; i < SUBBLOCK_SIZE; ++i)
478 dst[i] = src[table[i]];
479 }
480
MultSubBlocks(int16_t * dst,const int16_t * src1,const int16_t * src2,unsigned int shift)481 static void MultSubBlocks(int16_t *dst, const int16_t *src1, const int16_t *src2, unsigned int shift)
482 {
483 unsigned int i;
484
485 for (i = 0; i < SUBBLOCK_SIZE; ++i) {
486 int32_t v = src1[i] * src2[i];
487 dst[i] = clamp_s16(v) << shift;
488 }
489 }
490
ScaleSubBlock(int16_t * dst,const int16_t * src,int16_t scale)491 static void ScaleSubBlock(int16_t *dst, const int16_t *src, int16_t scale)
492 {
493 unsigned int i;
494
495 for (i = 0; i < SUBBLOCK_SIZE; ++i) {
496 int32_t v = src[i] * scale;
497 dst[i] = clamp_s16(v);
498 }
499 }
500
RShiftSubBlock(int16_t * dst,const int16_t * src,unsigned int shift)501 static void RShiftSubBlock(int16_t *dst, const int16_t *src, unsigned int shift)
502 {
503 unsigned int i;
504
505 for (i = 0; i < SUBBLOCK_SIZE; ++i)
506 dst[i] = src[i] >> shift;
507 }
508
509 /***************************************************************************
510 * Fast 2D IDCT using separable formulation and normalization
511 * Computations use single precision floats
512 * Implementation based on Wikipedia :
513 * http://fr.wikipedia.org/wiki/Transform%C3%A9e_en_cosinus_discr%C3%A8te
514 **************************************************************************/
InverseDCT1D(const float * const x,float * dst,unsigned int stride)515 static void InverseDCT1D(const float *const x, float *dst, unsigned int stride)
516 {
517 float e[4];
518 float f[4];
519 float x26, x1357, x15, x37, x17, x35;
520
521 x15 = IDCT_K[2] * (x[1] + x[5]);
522 x37 = IDCT_K[3] * (x[3] + x[7]);
523 x17 = IDCT_K[8] * (x[1] + x[7]);
524 x35 = IDCT_K[9] * (x[3] + x[5]);
525 x1357 = IDCT_C3 * (x[1] + x[3] + x[5] + x[7]);
526 x26 = IDCT_C6 * (x[2] + x[6]);
527
528 f[0] = x[0] + x[4];
529 f[1] = x[0] - x[4];
530 f[2] = x26 + IDCT_K[0] * x[2];
531 f[3] = x26 + IDCT_K[1] * x[6];
532
533 e[0] = x1357 + x15 + IDCT_K[4] * x[1] + x17;
534 e[1] = x1357 + x37 + IDCT_K[6] * x[3] + x35;
535 e[2] = x1357 + x15 + IDCT_K[5] * x[5] + x35;
536 e[3] = x1357 + x37 + IDCT_K[7] * x[7] + x17;
537
538 *dst = f[0] + f[2] + e[0];
539 dst += stride;
540 *dst = f[1] + f[3] + e[1];
541 dst += stride;
542 *dst = f[1] - f[3] + e[2];
543 dst += stride;
544 *dst = f[0] - f[2] + e[3];
545 dst += stride;
546 *dst = f[0] - f[2] - e[3];
547 dst += stride;
548 *dst = f[1] - f[3] - e[2];
549 dst += stride;
550 *dst = f[1] + f[3] - e[1];
551 dst += stride;
552 *dst = f[0] + f[2] - e[0];
553 }
554
InverseDCTSubBlock(int16_t * dst,const int16_t * src)555 static void InverseDCTSubBlock(int16_t *dst, const int16_t *src)
556 {
557 float x[8];
558 float block[SUBBLOCK_SIZE];
559 unsigned int i, j;
560
561 /* idct 1d on rows (+transposition) */
562 for (i = 0; i < 8; ++i) {
563 for (j = 0; j < 8; ++j)
564 x[j] = (float)src[i * 8 + j];
565
566 InverseDCT1D(x, &block[i], 8);
567 }
568
569 /* idct 1d on columns (thanks to previous transposition) */
570 for (i = 0; i < 8; ++i) {
571 InverseDCT1D(&block[i * 8], x, 1);
572
573 /* C4 = 1 normalization implies a division by 8 */
574 for (j = 0; j < 8; ++j)
575 dst[i + j * 8] = (int16_t)x[j] >> 3;
576 }
577 }
578
RescaleYSubBlock(int16_t * dst,const int16_t * src)579 static void RescaleYSubBlock(int16_t *dst, const int16_t *src)
580 {
581 unsigned int i;
582
583 for (i = 0; i < SUBBLOCK_SIZE; ++i)
584 dst[i] = (((uint32_t)(clamp_s12(src[i]) + 0x800) * 0xdb0) >> 16) + 0x10;
585 }
586
RescaleUVSubBlock(int16_t * dst,const int16_t * src)587 static void RescaleUVSubBlock(int16_t *dst, const int16_t *src)
588 {
589 unsigned int i;
590
591 for (i = 0; i < SUBBLOCK_SIZE; ++i)
592 dst[i] = (((int)clamp_s12(src[i]) * 0xe00) >> 16) + 0x80;
593 }
594
595