1 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
2  *   Mupen64plus-rsp-hle - jpeg.c                                          *
3  *   Mupen64Plus homepage: http://code.google.com/p/mupen64plus/           *
4  *   Copyright (C) 2012 Bobby Smiles                                       *
5  *   Copyright (C) 2009 Richard Goedeken                                   *
6  *   Copyright (C) 2002 Hacktarux                                          *
7  *                                                                         *
8  *   This program is free software; you can redistribute it and/or modify  *
9  *   it under the terms of the GNU General Public License as published by  *
10  *   the Free Software Foundation; either version 2 of the License, or     *
11  *   (at your option) any later version.                                   *
12  *                                                                         *
13  *   This program is distributed in the hope that it will be useful,       *
14  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
15  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
16  *   GNU General Public License for more details.                          *
17  *                                                                         *
18  *   You should have received a copy of the GNU General Public License     *
19  *   along with this program; if not, write to the                         *
20  *   Free Software Foundation, Inc.,                                       *
21  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
22  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
23 
24 #include <assert.h>
25 #include <stdint.h>
26 #include <stdlib.h>
27 
28 #include "arithmetics.h"
29 #include "hle_external.h"
30 #include "hle_internal.h"
31 #include "memory.h"
32 
33 #define SUBBLOCK_SIZE 64
34 
35 typedef void (*tile_line_emitter_t)(struct hle_t* hle, const int16_t *y, const int16_t *u, uint32_t address);
36 typedef void (*subblock_transform_t)(int16_t *dst, const int16_t *src);
37 
38 /* standard jpeg ucode decoder */
39 static void jpeg_decode_std(struct hle_t* hle,
40                             const char *const version,
41                             const subblock_transform_t transform_luma,
42                             const subblock_transform_t transform_chroma,
43                             const tile_line_emitter_t emit_line);
44 
45 /* helper functions */
46 static uint8_t clamp_u8(int16_t x);
47 static int16_t clamp_s12(int16_t x);
48 static uint16_t clamp_RGBA_component(int16_t x);
49 
50 /* pixel conversion & formatting */
51 static uint32_t GetUYVY(int16_t y1, int16_t y2, int16_t u, int16_t v);
52 static uint16_t GetRGBA(int16_t y, int16_t u, int16_t v);
53 
54 /* tile line emitters */
55 static void EmitYUVTileLine(struct hle_t* hle, const int16_t *y, const int16_t *u, uint32_t address);
56 static void EmitRGBATileLine(struct hle_t* hle, const int16_t *y, const int16_t *u, uint32_t address);
57 
58 /* macroblocks operations */
59 static void decode_macroblock_ob(int16_t *macroblock, int32_t *y_dc, int32_t *u_dc, int32_t *v_dc, const int16_t *qtable);
60 static void decode_macroblock_std(const subblock_transform_t transform_luma,
61                                   const subblock_transform_t transform_chroma,
62                                   int16_t *macroblock,
63                                   unsigned int subblock_count,
64                                   const int16_t qtables[3][SUBBLOCK_SIZE]);
65 static void EmitTilesMode0(struct hle_t* hle, const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address);
66 static void EmitTilesMode2(struct hle_t* hle, const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address);
67 
68 /* subblocks operations */
69 static void TransposeSubBlock(int16_t *dst, const int16_t *src);
70 static void ZigZagSubBlock(int16_t *dst, const int16_t *src);
71 static void ReorderSubBlock(int16_t *dst, const int16_t *src, const unsigned int *table);
72 static void MultSubBlocks(int16_t *dst, const int16_t *src1, const int16_t *src2, unsigned int shift);
73 static void ScaleSubBlock(int16_t *dst, const int16_t *src, int16_t scale);
74 static void RShiftSubBlock(int16_t *dst, const int16_t *src, unsigned int shift);
75 static void InverseDCT1D(const float *const x, float *dst, unsigned int stride);
76 static void InverseDCTSubBlock(int16_t *dst, const int16_t *src);
77 static void RescaleYSubBlock(int16_t *dst, const int16_t *src);
78 static void RescaleUVSubBlock(int16_t *dst, const int16_t *src);
79 
/* transposed dequantization table
 *
 * 8x8 table of dequantization factors, stored transposed.
 * jpeg_decode_OB derives its working table from it, either by multiplying
 * every entry by the task's qscale or by right-shifting every entry.
 * NOTE(review): the values look like the JPEG standard luminance
 * quantization table, transposed - confirm against the specification. */
static const int16_t DEFAULT_QTABLE[SUBBLOCK_SIZE] = {
    16, 12, 14, 14,  18,  24,  49,  72,
    11, 12, 13, 17,  22,  35,  64,  92,
    10, 14, 16, 22,  37,  55,  78,  95,
    16, 19, 24, 29,  56,  64,  87,  98,
    24, 26, 40, 51,  68,  81, 103, 112,
    40, 58, 57, 87, 109, 104, 121, 100,
    51, 60, 69, 80, 103, 113, 120, 103,
    61, 55, 56, 62,  77,  92, 101,  99
};
91 
/* zig-zag indices
 *
 * Lookup table consumed by ReorderSubBlock: entry i is the index, inside the
 * source subblock, of the coefficient that ends up at position i of the
 * destination subblock (dst[i] = src[ZIGZAG_TABLE[i]]). */
static const unsigned int ZIGZAG_TABLE[SUBBLOCK_SIZE] = {
     0,  1,  5,  6, 14, 15, 27, 28,
     2,  4,  7, 13, 16, 26, 29, 42,
     3,  8, 12, 17, 25, 30, 41, 43,
     9, 11, 18, 24, 31, 40, 44, 53,
    10, 19, 23, 32, 39, 45, 52, 54,
    20, 22, 33, 38, 46, 51, 55, 60,
    21, 34, 37, 47, 50, 56, 59, 61,
    35, 36, 48, 49, 57, 58, 62, 63
};
103 
/* transposition indices
 *
 * Lookup table consumed by ReorderSubBlock: the entry at row r, column c
 * (index 8*r + c) holds 8*c + r, so reordering a subblock through this table
 * swaps its rows and columns. */
static const unsigned int TRANSPOSE_TABLE[SUBBLOCK_SIZE] = {
    0,  8, 16, 24, 32, 40, 48, 56,
    1,  9, 17, 25, 33, 41, 49, 57,
    2, 10, 18, 26, 34, 42, 50, 58,
    3, 11, 19, 27, 35, 43, 51, 59,
    4, 12, 20, 28, 36, 44, 52, 60,
    5, 13, 21, 29, 37, 45, 53, 61,
    6, 14, 22, 30, 38, 46, 54, 62,
    7, 15, 23, 31, 39, 47, 55, 63
};
115 
116 
117 
/* IDCT related constants
 * Cn = alpha * cos(n * PI / 16) (alpha is chosen such that C4 = 1).
 * IDCT_K holds the precomputed sums and differences of these Cn factors
 * that InverseDCT1D multiplies its inputs by; the formula of each entry is
 * given next to its value. */
static const float IDCT_C3 = 1.175875602f;
static const float IDCT_C6 = 0.541196100f;
static const float IDCT_K[10] = {
     0.765366865f,   /*  C2-C6         */
    -1.847759065f,   /* -C2-C6         */
    -0.390180644f,   /*  C5-C3         */
    -1.961570561f,   /* -C5-C3         */
     1.501321110f,   /*  C1+C3-C5-C7   */
     2.053119869f,   /*  C1+C3-C5+C7   */
     3.072711027f,   /*  C1+C3+C5-C7   */
     0.298631336f,   /* -C1+C3+C5-C7   */
    -0.899976223f,   /*  C7-C3         */
    -2.562915448f    /* -C1-C3         */
};
134 
135 
136 /* global functions */
137 
138 /***************************************************************************
139  * JPEG decoding ucode found in Japanese exclusive version of Pokemon Stadium.
140  **************************************************************************/
jpeg_decode_PS0(struct hle_t * hle)141 void jpeg_decode_PS0(struct hle_t* hle)
142 {
143     jpeg_decode_std(hle, "PS0", RescaleYSubBlock, RescaleUVSubBlock, EmitYUVTileLine);
144 }
145 
146 /***************************************************************************
147  * JPEG decoding ucode found in Ocarina of Time, Pokemon Stadium 1 and
148  * Pokemon Stadium 2.
149  **************************************************************************/
jpeg_decode_PS(struct hle_t * hle)150 void jpeg_decode_PS(struct hle_t* hle)
151 {
152     jpeg_decode_std(hle, "PS", NULL, NULL, EmitRGBATileLine);
153 }
154 
155 /***************************************************************************
156  * JPEG decoding ucode found in Ogre Battle and Bottom of the 9th.
157  **************************************************************************/
jpeg_decode_OB(struct hle_t * hle)158 void jpeg_decode_OB(struct hle_t* hle)
159 {
160     int16_t qtable[SUBBLOCK_SIZE];
161     unsigned int mb;
162 
163     int32_t y_dc = 0;
164     int32_t u_dc = 0;
165     int32_t v_dc = 0;
166 
167     uint32_t           address          = *dmem_u32(hle, TASK_DATA_PTR);
168     const unsigned int macroblock_count = *dmem_u32(hle, TASK_DATA_SIZE);
169     const int          qscale           = *dmem_u32(hle, TASK_YIELD_DATA_SIZE);
170 
171     HleVerboseMessage(hle->user_defined,
172                       "jpeg_decode_OB: *buffer=%x, #MB=%d, qscale=%d",
173                       address,
174                       macroblock_count,
175                       qscale);
176 
177     if (qscale != 0) {
178         if (qscale > 0)
179             ScaleSubBlock(qtable, DEFAULT_QTABLE, qscale);
180         else
181             RShiftSubBlock(qtable, DEFAULT_QTABLE, -qscale);
182     }
183 
184     for (mb = 0; mb < macroblock_count; ++mb) {
185         int16_t macroblock[6 * SUBBLOCK_SIZE];
186 
187         dram_load_u16(hle, (uint16_t *)macroblock, address, 6 * SUBBLOCK_SIZE);
188         decode_macroblock_ob(macroblock, &y_dc, &u_dc, &v_dc, (qscale != 0) ? qtable : NULL);
189         EmitTilesMode2(hle, EmitYUVTileLine, macroblock, address);
190 
191         address += (2 * 6 * SUBBLOCK_SIZE);
192     }
193 }
194 
195 
196 /* local functions */
jpeg_decode_std(struct hle_t * hle,const char * const version,const subblock_transform_t transform_luma,const subblock_transform_t transform_chroma,const tile_line_emitter_t emit_line)197 static void jpeg_decode_std(struct hle_t* hle,
198                             const char *const version,
199                             const subblock_transform_t transform_luma,
200                             const subblock_transform_t transform_chroma,
201                             const tile_line_emitter_t emit_line)
202 {
203     int16_t qtables[3][SUBBLOCK_SIZE];
204     unsigned int mb;
205     uint32_t address;
206     uint32_t macroblock_count;
207     uint32_t mode;
208     uint32_t qtableY_ptr;
209     uint32_t qtableU_ptr;
210     uint32_t qtableV_ptr;
211     unsigned int subblock_count;
212     unsigned int macroblock_size;
213     /* macroblock contains at most 6 subblocks */
214     int16_t macroblock[6 * SUBBLOCK_SIZE];
215     uint32_t data_ptr;
216 
217     if (*dmem_u32(hle, TASK_FLAGS) & 0x1) {
218         HleWarnMessage(hle->user_defined,
219                        "jpeg_decode_%s: task yielding not implemented", version);
220         return;
221     }
222 
223     data_ptr = *dmem_u32(hle, TASK_DATA_PTR);
224     address          = *dram_u32(hle, data_ptr);
225     macroblock_count = *dram_u32(hle, data_ptr + 4);
226     mode             = *dram_u32(hle, data_ptr + 8);
227     qtableY_ptr      = *dram_u32(hle, data_ptr + 12);
228     qtableU_ptr      = *dram_u32(hle, data_ptr + 16);
229     qtableV_ptr      = *dram_u32(hle, data_ptr + 20);
230 
231     HleVerboseMessage(hle->user_defined,
232                       "jpeg_decode_%s: *buffer=%x, #MB=%d, mode=%d, *Qy=%x, *Qu=%x, *Qv=%x",
233                       version,
234                       address,
235                       macroblock_count,
236                       mode,
237                       qtableY_ptr,
238                       qtableU_ptr,
239                       qtableV_ptr);
240 
241     if (mode != 0 && mode != 2) {
242         HleWarnMessage(hle->user_defined,
243                        "jpeg_decode_%s: invalid mode %d", version, mode);
244         return;
245     }
246 
247     subblock_count = mode + 4;
248     macroblock_size = subblock_count * SUBBLOCK_SIZE;
249 
250     dram_load_u16(hle, (uint16_t *)qtables[0], qtableY_ptr, SUBBLOCK_SIZE);
251     dram_load_u16(hle, (uint16_t *)qtables[1], qtableU_ptr, SUBBLOCK_SIZE);
252     dram_load_u16(hle, (uint16_t *)qtables[2], qtableV_ptr, SUBBLOCK_SIZE);
253 
254     for (mb = 0; mb < macroblock_count; ++mb) {
255         dram_load_u16(hle, (uint16_t *)macroblock, address, macroblock_size);
256         decode_macroblock_std(transform_luma, transform_chroma,
257                               macroblock, subblock_count, (const int16_t (*)[SUBBLOCK_SIZE])qtables);
258 
259         if (mode == 0)
260             EmitTilesMode0(hle, emit_line, macroblock, address);
261         else
262             EmitTilesMode2(hle, emit_line, macroblock, address);
263 
264         address += 2 * macroblock_size;
265     }
266 }
267 
/* Saturates a signed 16-bit value to the unsigned 8-bit range [0, 255].
 *
 * Negative inputs (including INT16_MIN) yield 0 and inputs above 255 yield
 * 255. Plain comparisons are used so the result does not depend on how the
 * compiler right-shifts negative values. */
static uint8_t clamp_u8(int16_t x)
{
    if (x < 0)
        return 0;
    if (x > 0xff)
        return 0xff;
    return (uint8_t)x;
}
272 
/* Limits x to the interval [-0x800, 0x7f0].
 * Note that the upper bound is 0x7f0, not 0x7ff. */
static int16_t clamp_s12(int16_t x)
{
    const int16_t lower = -0x800;
    const int16_t upper = 0x7f0;

    return (x < lower) ? lower
         : (x > upper) ? upper
         : x;
}
281 
/* Limits x to [0, 0xff0] and keeps only bits 7..11 of the result
 * (mask 0xf80), i.e. the 5 most significant bits of a 12-bit component. */
static uint16_t clamp_RGBA_component(int16_t x)
{
    int value = x;

    if (value < 0)
        value = 0;
    if (value > 0xff0)
        value = 0xff0;

    return (uint16_t)(value & 0xf80);
}
290 
/* Packs two luma samples and one chroma pair into a 32-bit word laid out,
 * from most to least significant byte, as U, Y1, V, Y2.
 * Every component is first saturated to 8 bits by clamp_u8. */
static uint32_t GetUYVY(int16_t y1, int16_t y2, int16_t u, int16_t v)
{
    uint32_t word;

    word = clamp_u8(u);
    word = (word << 8) | clamp_u8(y1);
    word = (word << 8) | clamp_u8(v);
    word = (word << 8) | clamp_u8(y2);

    return word;
}
298 
/* Converts one YUV sample to a 16-bit pixel with 5 bits of red (bits 15..11),
 * 5 bits of green (bits 10..6), 5 bits of blue (bits 5..1) and bit 0 always
 * set.
 *
 * y is offset by 2048 before the conversion. Each colour component is
 * saturated to the int16_t range before being narrowed: converting an
 * out-of-range floating-point value to int16_t is undefined and would
 * otherwise let large values wrap around before clamp_RGBA_component gets a
 * chance to clamp them. */
static uint16_t GetRGBA(int16_t y, int16_t u, int16_t v)
{
    const float fY = (float)y + 2048.0f;
    const float fU = (float)u;
    const float fV = (float)v;

    double   value[3];
    uint16_t component[3];
    unsigned int i;

    value[0] = fY               + 1.4025 * fV;   /* red   */
    value[1] = fY - 0.3443 * fU - 0.7144 * fV;   /* green */
    value[2] = fY + 1.7729 * fU;                 /* blue  */

    for (i = 0; i < 3; ++i) {
        int16_t narrowed;

        if (value[i] < (double)INT16_MIN)
            narrowed = INT16_MIN;
        else if (value[i] > (double)INT16_MAX)
            narrowed = INT16_MAX;
        else
            narrowed = (int16_t)value[i];   /* in range: truncates toward zero */

        component[i] = clamp_RGBA_component(narrowed);
    }

    return (component[0] << 4) | (component[1] >> 1) | (component[2] >> 6) | 1;
}
311 
EmitYUVTileLine(struct hle_t * hle,const int16_t * y,const int16_t * u,uint32_t address)312 static void EmitYUVTileLine(struct hle_t* hle, const int16_t *y, const int16_t *u, uint32_t address)
313 {
314     uint32_t uyvy[8];
315 
316     const int16_t *const v  = u + SUBBLOCK_SIZE;
317     const int16_t *const y2 = y + SUBBLOCK_SIZE;
318 
319     uyvy[0] = GetUYVY(y[0],  y[1],  u[0], v[0]);
320     uyvy[1] = GetUYVY(y[2],  y[3],  u[1], v[1]);
321     uyvy[2] = GetUYVY(y[4],  y[5],  u[2], v[2]);
322     uyvy[3] = GetUYVY(y[6],  y[7],  u[3], v[3]);
323     uyvy[4] = GetUYVY(y2[0], y2[1], u[4], v[4]);
324     uyvy[5] = GetUYVY(y2[2], y2[3], u[5], v[5]);
325     uyvy[6] = GetUYVY(y2[4], y2[5], u[6], v[6]);
326     uyvy[7] = GetUYVY(y2[6], y2[7], u[7], v[7]);
327 
328     dram_store_u32(hle, uyvy, address, 8);
329 }
330 
EmitRGBATileLine(struct hle_t * hle,const int16_t * y,const int16_t * u,uint32_t address)331 static void EmitRGBATileLine(struct hle_t* hle, const int16_t *y, const int16_t *u, uint32_t address)
332 {
333     uint16_t rgba[16];
334 
335     const int16_t *const v  = u + SUBBLOCK_SIZE;
336     const int16_t *const y2 = y + SUBBLOCK_SIZE;
337 
338     rgba[0]  = GetRGBA(y[0],  u[0], v[0]);
339     rgba[1]  = GetRGBA(y[1],  u[0], v[0]);
340     rgba[2]  = GetRGBA(y[2],  u[1], v[1]);
341     rgba[3]  = GetRGBA(y[3],  u[1], v[1]);
342     rgba[4]  = GetRGBA(y[4],  u[2], v[2]);
343     rgba[5]  = GetRGBA(y[5],  u[2], v[2]);
344     rgba[6]  = GetRGBA(y[6],  u[3], v[3]);
345     rgba[7]  = GetRGBA(y[7],  u[3], v[3]);
346     rgba[8]  = GetRGBA(y2[0], u[4], v[4]);
347     rgba[9]  = GetRGBA(y2[1], u[4], v[4]);
348     rgba[10] = GetRGBA(y2[2], u[5], v[5]);
349     rgba[11] = GetRGBA(y2[3], u[5], v[5]);
350     rgba[12] = GetRGBA(y2[4], u[6], v[6]);
351     rgba[13] = GetRGBA(y2[5], u[6], v[6]);
352     rgba[14] = GetRGBA(y2[6], u[7], v[7]);
353     rgba[15] = GetRGBA(y2[7], u[7], v[7]);
354 
355     dram_store_u16(hle, rgba, address, 16);
356 }
357 
EmitTilesMode0(struct hle_t * hle,const tile_line_emitter_t emit_line,const int16_t * macroblock,uint32_t address)358 static void EmitTilesMode0(struct hle_t* hle, const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address)
359 {
360     unsigned int i;
361 
362     unsigned int y_offset = 0;
363     unsigned int u_offset = 2 * SUBBLOCK_SIZE;
364 
365     for (i = 0; i < 8; ++i) {
366         emit_line(hle, &macroblock[y_offset], &macroblock[u_offset], address);
367 
368         y_offset += 8;
369         u_offset += 8;
370         address += 32;
371     }
372 }
373 
EmitTilesMode2(struct hle_t * hle,const tile_line_emitter_t emit_line,const int16_t * macroblock,uint32_t address)374 static void EmitTilesMode2(struct hle_t* hle, const tile_line_emitter_t emit_line, const int16_t *macroblock, uint32_t address)
375 {
376     unsigned int i;
377 
378     unsigned int y_offset = 0;
379     unsigned int u_offset = 4 * SUBBLOCK_SIZE;
380 
381     for (i = 0; i < 8; ++i) {
382         emit_line(hle, &macroblock[y_offset],     &macroblock[u_offset], address);
383         emit_line(hle, &macroblock[y_offset + 8], &macroblock[u_offset], address + 32);
384 
385         y_offset += (i == 3) ? SUBBLOCK_SIZE + 16 : 16;
386         u_offset += 8;
387         address += 64;
388     }
389 }
390 
decode_macroblock_ob(int16_t * macroblock,int32_t * y_dc,int32_t * u_dc,int32_t * v_dc,const int16_t * qtable)391 static void decode_macroblock_ob(int16_t *macroblock, int32_t *y_dc, int32_t *u_dc, int32_t *v_dc, const int16_t *qtable)
392 {
393     int sb;
394 
395     for (sb = 0; sb < 6; ++sb) {
396         int16_t tmp_sb[SUBBLOCK_SIZE];
397 
398         /* update DC */
399         int32_t dc = (int32_t)macroblock[0];
400         switch (sb) {
401         case 0:
402         case 1:
403         case 2:
404         case 3:
405             *y_dc += dc;
406             macroblock[0] = *y_dc & 0xffff;
407             break;
408         case 4:
409             *u_dc += dc;
410             macroblock[0] = *u_dc & 0xffff;
411             break;
412         case 5:
413             *v_dc += dc;
414             macroblock[0] = *v_dc & 0xffff;
415             break;
416         }
417 
418         ZigZagSubBlock(tmp_sb, macroblock);
419         if (qtable != NULL)
420             MultSubBlocks(tmp_sb, tmp_sb, qtable, 0);
421         TransposeSubBlock(macroblock, tmp_sb);
422         InverseDCTSubBlock(macroblock, macroblock);
423 
424         macroblock += SUBBLOCK_SIZE;
425     }
426 }
427 
decode_macroblock_std(const subblock_transform_t transform_luma,const subblock_transform_t transform_chroma,int16_t * macroblock,unsigned int subblock_count,const int16_t qtables[3][SUBBLOCK_SIZE])428 static void decode_macroblock_std(const subblock_transform_t transform_luma,
429                                   const subblock_transform_t transform_chroma,
430                                   int16_t *macroblock,
431                                   unsigned int subblock_count,
432                                   const int16_t qtables[3][SUBBLOCK_SIZE])
433 {
434     unsigned int sb;
435     unsigned int q = 0;
436 
437     for (sb = 0; sb < subblock_count; ++sb) {
438         int16_t tmp_sb[SUBBLOCK_SIZE];
439         const int isChromaSubBlock = (subblock_count - sb <= 2);
440 
441         if (isChromaSubBlock)
442             ++q;
443 
444         MultSubBlocks(macroblock, macroblock, qtables[q], 4);
445         ZigZagSubBlock(tmp_sb, macroblock);
446         InverseDCTSubBlock(macroblock, tmp_sb);
447 
448         if (isChromaSubBlock) {
449             if (transform_chroma != NULL)
450                 transform_chroma(macroblock, macroblock);
451         } else {
452             if (transform_luma != NULL)
453                 transform_luma(macroblock, macroblock);
454         }
455 
456         macroblock += SUBBLOCK_SIZE;
457     }
458 }
459 
TransposeSubBlock(int16_t * dst,const int16_t * src)460 static void TransposeSubBlock(int16_t *dst, const int16_t *src)
461 {
462     ReorderSubBlock(dst, src, TRANSPOSE_TABLE);
463 }
464 
ZigZagSubBlock(int16_t * dst,const int16_t * src)465 static void ZigZagSubBlock(int16_t *dst, const int16_t *src)
466 {
467     ReorderSubBlock(dst, src, ZIGZAG_TABLE);
468 }
469 
ReorderSubBlock(int16_t * dst,const int16_t * src,const unsigned int * table)470 static void ReorderSubBlock(int16_t *dst, const int16_t *src, const unsigned int *table)
471 {
472     unsigned int i;
473 
474     /* source and destination sublocks cannot overlap */
475     assert(abs(dst - src) > SUBBLOCK_SIZE);
476 
477     for (i = 0; i < SUBBLOCK_SIZE; ++i)
478         dst[i] = src[table[i]];
479 }
480 
MultSubBlocks(int16_t * dst,const int16_t * src1,const int16_t * src2,unsigned int shift)481 static void MultSubBlocks(int16_t *dst, const int16_t *src1, const int16_t *src2, unsigned int shift)
482 {
483     unsigned int i;
484 
485     for (i = 0; i < SUBBLOCK_SIZE; ++i) {
486         int32_t v = src1[i] * src2[i];
487         dst[i] = clamp_s16(v) << shift;
488     }
489 }
490 
ScaleSubBlock(int16_t * dst,const int16_t * src,int16_t scale)491 static void ScaleSubBlock(int16_t *dst, const int16_t *src, int16_t scale)
492 {
493     unsigned int i;
494 
495     for (i = 0; i < SUBBLOCK_SIZE; ++i) {
496         int32_t v = src[i] * scale;
497         dst[i] = clamp_s16(v);
498     }
499 }
500 
RShiftSubBlock(int16_t * dst,const int16_t * src,unsigned int shift)501 static void RShiftSubBlock(int16_t *dst, const int16_t *src, unsigned int shift)
502 {
503     unsigned int i;
504 
505     for (i = 0; i < SUBBLOCK_SIZE; ++i)
506         dst[i] = src[i] >> shift;
507 }
508 
509 /***************************************************************************
510  * Fast 2D IDCT using separable formulation and normalization
511  * Computations use single precision floats
512  * Implementation based on Wikipedia :
513  * http://fr.wikipedia.org/wiki/Transform%C3%A9e_en_cosinus_discr%C3%A8te
514  **************************************************************************/
InverseDCT1D(const float * const x,float * dst,unsigned int stride)515 static void InverseDCT1D(const float *const x, float *dst, unsigned int stride)
516 {
517     float e[4];
518     float f[4];
519     float x26, x1357, x15, x37, x17, x35;
520 
521     x15   = IDCT_K[2] * (x[1] + x[5]);
522     x37   = IDCT_K[3] * (x[3] + x[7]);
523     x17   = IDCT_K[8] * (x[1] + x[7]);
524     x35   = IDCT_K[9] * (x[3] + x[5]);
525     x1357 = IDCT_C3   * (x[1] + x[3] + x[5] + x[7]);
526     x26   = IDCT_C6   * (x[2] + x[6]);
527 
528     f[0] = x[0] + x[4];
529     f[1] = x[0] - x[4];
530     f[2] = x26  + IDCT_K[0] * x[2];
531     f[3] = x26  + IDCT_K[1] * x[6];
532 
533     e[0] = x1357 + x15 + IDCT_K[4] * x[1] + x17;
534     e[1] = x1357 + x37 + IDCT_K[6] * x[3] + x35;
535     e[2] = x1357 + x15 + IDCT_K[5] * x[5] + x35;
536     e[3] = x1357 + x37 + IDCT_K[7] * x[7] + x17;
537 
538     *dst = f[0] + f[2] + e[0];
539     dst += stride;
540     *dst = f[1] + f[3] + e[1];
541     dst += stride;
542     *dst = f[1] - f[3] + e[2];
543     dst += stride;
544     *dst = f[0] - f[2] + e[3];
545     dst += stride;
546     *dst = f[0] - f[2] - e[3];
547     dst += stride;
548     *dst = f[1] - f[3] - e[2];
549     dst += stride;
550     *dst = f[1] + f[3] - e[1];
551     dst += stride;
552     *dst = f[0] + f[2] - e[0];
553 }
554 
InverseDCTSubBlock(int16_t * dst,const int16_t * src)555 static void InverseDCTSubBlock(int16_t *dst, const int16_t *src)
556 {
557     float x[8];
558     float block[SUBBLOCK_SIZE];
559     unsigned int i, j;
560 
561     /* idct 1d on rows (+transposition) */
562     for (i = 0; i < 8; ++i) {
563         for (j = 0; j < 8; ++j)
564             x[j] = (float)src[i * 8 + j];
565 
566         InverseDCT1D(x, &block[i], 8);
567     }
568 
569     /* idct 1d on columns (thanks to previous transposition) */
570     for (i = 0; i < 8; ++i) {
571         InverseDCT1D(&block[i * 8], x, 1);
572 
573         /* C4 = 1 normalization implies a division by 8 */
574         for (j = 0; j < 8; ++j)
575             dst[i + j * 8] = (int16_t)x[j] >> 3;
576     }
577 }
578 
RescaleYSubBlock(int16_t * dst,const int16_t * src)579 static void RescaleYSubBlock(int16_t *dst, const int16_t *src)
580 {
581     unsigned int i;
582 
583     for (i = 0; i < SUBBLOCK_SIZE; ++i)
584         dst[i] = (((uint32_t)(clamp_s12(src[i]) + 0x800) * 0xdb0) >> 16) + 0x10;
585 }
586 
RescaleUVSubBlock(int16_t * dst,const int16_t * src)587 static void RescaleUVSubBlock(int16_t *dst, const int16_t *src)
588 {
589     unsigned int i;
590 
591     for (i = 0; i < SUBBLOCK_SIZE; ++i)
592         dst[i] = (((int)clamp_s12(src[i]) * 0xe00) >> 16) + 0x80;
593 }
594 
595