1 /******************************************************************************/
2 /* Mednafen NEC PC-FX Emulation Module */
3 /******************************************************************************/
4 /* idct.cpp:
5 ** Copyright (C) 2019 Mednafen Team
6 **
7 ** This program is free software; you can redistribute it and/or
8 ** modify it under the terms of the GNU General Public License
9 ** as published by the Free Software Foundation; either version 2
10 ** of the License, or (at your option) any later version.
11 **
12 ** This program is distributed in the hope that it will be useful,
13 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 ** GNU General Public License for more details.
16 **
17 ** You should have received a copy of the GNU General Public License
18 ** along with this program; if not, write to the Free Software Foundation, Inc.,
19 ** 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 */
21
22 /* References:
23 'Practical Fast 1-D DCT Algorithms With 11 Multiplications', 1989, by
24 Christoph Loeffler, Adriaan Ligtenberg, and George S. Moschytz
25 */
26
27 #include <mednafen/types.h>
28 #include "idct.h"
29
30 namespace MDFN_IEN_PCFX
31 {
32
33 #if 0
34 static NO_INLINE void NF_IDCT_1D(float* c, float* o)
35 {
36 #define SNORP(a0, a1) { float tmp = (c[a0] + c[a1]); c[a1] = (c[a0] - c[a1]); c[a0] = tmp; }
37 float r[2];
38 SNORP(7, 1)
39 r[0] = c[5] * 1.4142135623730951;
40 r[1] = c[3] * 1.4142135623730951;
41 c[3] = r[0];
42 c[5] = r[1];
43
44 //
45 //
46 //
47 r[0] = c[2] * 0.5411961001461970 + c[6] * -1.3065629648763766;
48 r[1] = c[2] * 1.3065629648763766 + c[6] * 0.5411961001461970;
49 c[2] = r[0];
50 c[6] = r[1];
51 SNORP(7, 5)
52 SNORP(3, 1)
53 SNORP(0, 4)
54 //
55 //
56 //
57 r[0] = c[7] * -0.5555702330196022 + c[1] * 0.8314696123025452;
58 r[1] = c[7] * 0.8314696123025452 + c[1] * 0.5555702330196022;
59 c[7] = r[0];
60 c[1] = r[1];
61 r[0] = c[3] * 0.1950903220161282 + c[5] * 0.9807852804032304;
62 r[1] = c[3] * -0.9807852804032304 + c[5] * 0.1950903220161282;
63 c[3] = r[0];
64 c[5] = r[1];
65 SNORP(0, 6)
66 SNORP(4, 2)
67 //
68 //
69 //
70 o[0 * 8] = (c[0] + c[1]);
71 o[1 * 8] = (c[4] + c[5]);
72 o[2 * 8] = (c[2] + c[3]);
73 o[3 * 8] = (c[6] + c[7]);
74 o[4 * 8] = (c[6] - c[7]);
75 o[5 * 8] = (c[2] - c[3]);
76 o[6 * 8] = (c[4] - c[5]);
77 o[7 * 8] = (c[0] - c[1]);
78 #undef SNORP
79 }
80
81 static INLINE void NF_IDCT(float* c)
82 {
83 float buf[64];
84
85 for(unsigned i = 0; i < 8; i++)
86 NF_IDCT_1D(&c[i * 8], &buf[i]);
87
88 for(unsigned i = 0; i < 8; i++)
89 NF_IDCT_1D(&buf[i * 8], &c[i]);
90
91 for(unsigned i = 0; i < 64; i++)
92 c[i] /= 2;
93 }
94 #endif
95
96 //#define IDCT_ACCURACY_TEST 1
97
98 enum : unsigned { IDCT_PRESHIFT = 9 };
99 #define EFF_RSHIFT_1D_COEFF 2 // Must be >= 2
100 #define EFF_RSHIFT_1D_POST 6
101 #define EFF_RSHIFT_2D ((EFF_RSHIFT_1D_COEFF) * 2 + EFF_RSHIFT_1D_POST - 1) // -1 is from sqrt(2) * sqrt(2)
102 #define C_COEFF(m) ((int32)((1LL << (32 - EFF_RSHIFT_1D_COEFF)) * (m) + 0.5))
103
104 #define SNORP(a0, a1) { int32 tmp = (c[a0] + c[a1]); c[a1] = (c[a0] - c[a1]); c[a0] = tmp; }
105
106 #if 0
107 static INLINE int32 MUL_32x32_H32(int32 c, int32 v)
108 {
109 int32 ret;
110
111 asm("mulhw %0, %1, %2\n\t" : "=r"(ret) : "r"(c), "r"(v));
112
113 return ret;
114 //return ((int64)v * c) >> 32;
115 }
116 #else
MUL_32x32_H32(const int32 & c,int32 v)117 static INLINE int32 MUL_32x32_H32(const int32& c, int32 v)
118 {
119 #if 0
120 int32 ret, dummy;
121
122 asm("imull %3\n\t"
123 : "=d"(ret), "=a"(dummy)
124 : "a"(v), "m"(c)
125 : "cc");
126 return ret;
127 #else
128 return ((int64)v * c) >> 32;
129 #endif
130 }
131 #endif
132
133 template<unsigned psh>
IDCT_1D(int32 * __restrict__ c_in,int32 * __restrict__ o)134 static INLINE void IDCT_1D(int32* __restrict__ c_in, int32* __restrict__ o)
135 {
136 static const int32 coeffs[10] =
137 {
138 1779033704,
139
140 C_COEFF( 0.5411961001461970),
141 C_COEFF(-1.8477590650225736),
142 C_COEFF( 0.7653668647301796),
143
144 C_COEFF(-0.5555702330196022),
145 C_COEFF( 1.3870398453221474),
146 C_COEFF( 0.2758993792829430),
147
148 C_COEFF( 0.1950903220161282),
149 C_COEFF( 0.7856949583871022),
150 C_COEFF(-1.1758756024193586),
151 };
152 int32 c[8];
153 int32 r;
154 int32 m;
155
156 if(!psh)
157 {
158 c[0] = c_in[0] << (IDCT_PRESHIFT - EFF_RSHIFT_1D_COEFF);
159 c[4] = c_in[4] << (IDCT_PRESHIFT - EFF_RSHIFT_1D_COEFF);
160
161 c[7] = (c_in[7] + c_in[1]) << IDCT_PRESHIFT;
162 c[1] = (c_in[7] - c_in[1]) << IDCT_PRESHIFT;
163
164 c[3] = (46341 * c_in[5]) >> (15 - IDCT_PRESHIFT);
165 c[5] = (46341 * c_in[3]) >> (15 - IDCT_PRESHIFT);
166 //
167 //
168 //
169 //C_COEFF( 0.5411961001461970),
170 //C_COEFF(-1.8477590650225736),
171 //C_COEFF( 0.7653668647301796),
172 m = 35468 * (c_in[2] + c_in[6]);
173 c[2] = (-121095 * c_in[6] + m) >> (16 - IDCT_PRESHIFT + EFF_RSHIFT_1D_COEFF);
174 c[6] = ( 50159 * c_in[2] + m) >> (16 - IDCT_PRESHIFT + EFF_RSHIFT_1D_COEFF);
175 /*
176 c_in[2] <<= IDCT_PRESHIFT;
177 c_in[6] <<= IDCT_PRESHIFT;
178 m = MUL_32x32_H32(coeffs[1], (c_in[2] + c_in[6]));
179 c[2] = MUL_32x32_H32(coeffs[2], c_in[6]) + m;
180 c[6] = MUL_32x32_H32(coeffs[3], c_in[2]) + m;
181 */
182 }
183 else
184 {
185 c[0] = (c_in[0] >> EFF_RSHIFT_1D_COEFF) + ((1 << psh) >> 1);
186 c[4] = c_in[4] >> EFF_RSHIFT_1D_COEFF;
187
188 c[7] = c_in[7] + c_in[1];
189 c[1] = c_in[7] - c_in[1];
190
191 //c[3] = c_in[5] + MUL_32x32_H32(coeffs[0], c_in[5]);
192 //c[5] = c_in[3] + MUL_32x32_H32(coeffs[0], c_in[3]);
193 //c[3] = c_in[5] + ((13572 * c_in[5]) >> 15);
194 //c[5] = c_in[3] + ((13572 * c_in[3]) >> 15);
195 c[3] = (c_in[5] * 181) >> 7;
196 c[5] = (c_in[3] * 181) >> 7;
197 /*
198 //c[3] = (c_in[5] * 181 + 64) >> 7;
199 //c[5] = (c_in[3] * 181 + 64) >> 7;
200 //
201 //
202 //
203 */
204 m = MUL_32x32_H32(coeffs[1], (c_in[2] + c_in[6]));
205 c[2] = MUL_32x32_H32(coeffs[2], c_in[6]) + m;
206 c[6] = MUL_32x32_H32(coeffs[3], c_in[2]) + m;
207
208 //c[3] = (46341 * c_in[5]) >> (15 + EFF_RSHIFT_1D_COEFF);
209 //c[5] = (46341 * c_in[3]) >> (15 + EFF_RSHIFT_1D_COEFF);
210 //
211 //
212 //
213 //C_COEFF( 0.5411961001461970),
214 //C_COEFF(-1.8477590650225736),
215 //C_COEFF( 0.7653668647301796),
216 //m = 35468 * (c_in[2] + c_in[6]);
217 //c[2] = (-121095 * c_in[6] + m) >> (16 + EFF_RSHIFT_1D_COEFF);
218 //c[6] = ( 50159 * c_in[2] + m) >> (16 + EFF_RSHIFT_1D_COEFF);
219 }
220 SNORP(0, 4)
221 SNORP(7, 5)
222 SNORP(3, 1)
223 //
224 //
225 //
226 m = MUL_32x32_H32(coeffs[4], (c[7] + c[1]));
227 r = MUL_32x32_H32(coeffs[5], c[1]) + m;
228 c[1] = MUL_32x32_H32(coeffs[6], c[7]) - m;
229 //c[1] = ((565LL * c[7]) >> 13) - m;
230 c[7] = r;
231 //
232 //
233 //
234 //m = (799 * (c[3] + c[5])) >> (12 + EFF_RSHIFT_1D_COEFF);
235 m = MUL_32x32_H32(coeffs[7], (c[3] + c[5]));
236 r = MUL_32x32_H32(coeffs[8], c[5]) + m;
237 c[5] = MUL_32x32_H32(coeffs[9], c[3]) + m;
238 c[3] = r;
239
240 SNORP(0, 6)
241 SNORP(4, 2)
242 //
243 //
244 //
245 o[0 * 8] = (c[0] + c[1]) >> psh;
246 o[1 * 8] = (c[4] + c[5]) >> psh;
247 o[2 * 8] = (c[2] + c[3]) >> psh;
248 o[3 * 8] = (c[6] + c[7]) >> psh;
249 o[4 * 8] = (c[6] - c[7]) >> psh;
250 o[5 * 8] = (c[2] - c[3]) >> psh;
251 o[6 * 8] = (c[4] - c[5]) >> psh;
252 o[7 * 8] = (c[0] - c[1]) >> psh;
253 }
254
255 template<unsigned psh>
IDCT_1D_Multi(int32 * __restrict__ c,int32 * __restrict__ o)256 static INLINE void IDCT_1D_Multi(int32* __restrict__ c, int32* __restrict__ o)
257 {
258 for(unsigned i = 0; i < 8; i++)
259 IDCT_1D<psh>(&c[i * 8], &o[i]);
260 }
261
IDCT(int32 * c)262 void IDCT(int32* c)
263 {
264 int32 buf[64];
265
266 static_assert(IDCT_PRESHIFT == EFF_RSHIFT_2D, "IDCT_PRESHIFT != EFF_RSHIFT_2D");
267
268 IDCT_1D_Multi<0>(c, buf);
269 IDCT_1D_Multi<EFF_RSHIFT_1D_POST>(buf, c);
270 }
271
272 }
273