1 /******************************************************************************/
2 /* Mednafen NEC PC-FX Emulation Module                                        */
3 /******************************************************************************/
4 /* idct.cpp:
5 **  Copyright (C) 2019 Mednafen Team
6 **
7 ** This program is free software; you can redistribute it and/or
8 ** modify it under the terms of the GNU General Public License
9 ** as published by the Free Software Foundation; either version 2
10 ** of the License, or (at your option) any later version.
11 **
12 ** This program is distributed in the hope that it will be useful,
13 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 ** GNU General Public License for more details.
16 **
17 ** You should have received a copy of the GNU General Public License
18 ** along with this program; if not, write to the Free Software Foundation, Inc.,
19 ** 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
20 */
21 
22 /* References:
23 	'Practical Fast 1-D DCT Algorithms With 11 Multiplications', 1989, by
24 	Christoph Loeffler, Adriaan Ligtenberg, and George S. Moschytz
25 */
26 
27 #include <mednafen/types.h>
28 #include "idct.h"
29 
30 namespace MDFN_IEN_PCFX
31 {
32 
33 #if 0
34 static NO_INLINE void NF_IDCT_1D(float* c, float* o)
35 {
36  #define SNORP(a0, a1) { float tmp = (c[a0] + c[a1]); c[a1] = (c[a0] - c[a1]); c[a0] = tmp; }
37  float r[2];
38  SNORP(7, 1)
39  r[0] = c[5] * 1.4142135623730951;
40  r[1] = c[3] * 1.4142135623730951;
41  c[3] = r[0];
42  c[5] = r[1];
43 
44  //
45  //
46  //
47  r[0] = c[2] *  0.5411961001461970 + c[6] * -1.3065629648763766;
48  r[1] = c[2] *  1.3065629648763766 + c[6] *  0.5411961001461970;
49  c[2] = r[0];
50  c[6] = r[1];
51  SNORP(7, 5)
52  SNORP(3, 1)
53  SNORP(0, 4)
54  //
55  //
56  //
57  r[0] = c[7] * -0.5555702330196022 + c[1] *  0.8314696123025452;
58  r[1] = c[7] *  0.8314696123025452 + c[1] *  0.5555702330196022;
59  c[7] = r[0];
60  c[1] = r[1];
61  r[0] = c[3] *  0.1950903220161282 + c[5] *  0.9807852804032304;
62  r[1] = c[3] * -0.9807852804032304 + c[5] *  0.1950903220161282;
63  c[3] = r[0];
64  c[5] = r[1];
65  SNORP(0, 6)
66  SNORP(4, 2)
67  //
68  //
69  //
70  o[0 * 8] = (c[0] + c[1]);
71  o[1 * 8] = (c[4] + c[5]);
72  o[2 * 8] = (c[2] + c[3]);
73  o[3 * 8] = (c[6] + c[7]);
74  o[4 * 8] = (c[6] - c[7]);
75  o[5 * 8] = (c[2] - c[3]);
76  o[6 * 8] = (c[4] - c[5]);
77  o[7 * 8] = (c[0] - c[1]);
78  #undef SNORP
79 }
80 
81 static INLINE void NF_IDCT(float* c)
82 {
83  float buf[64];
84 
85  for(unsigned i = 0; i < 8; i++)
86   NF_IDCT_1D(&c[i * 8], &buf[i]);
87 
88  for(unsigned i = 0; i < 8; i++)
89   NF_IDCT_1D(&buf[i * 8], &c[i]);
90 
91  for(unsigned i = 0; i < 64; i++)
92   c[i] /= 2;
93 }
94 #endif
95 
96 //#define IDCT_ACCURACY_TEST 1
97 
98 enum : unsigned { IDCT_PRESHIFT = 9 };
99 #define EFF_RSHIFT_1D_COEFF	2	// Must be >= 2
100 #define EFF_RSHIFT_1D_POST	6
101 #define EFF_RSHIFT_2D ((EFF_RSHIFT_1D_COEFF) * 2 + EFF_RSHIFT_1D_POST - 1)	// -1 is from sqrt(2) * sqrt(2)
102 #define C_COEFF(m) ((int32)((1LL << (32 - EFF_RSHIFT_1D_COEFF)) * (m) + 0.5))
103 
104 #define SNORP(a0, a1) { int32 tmp = (c[a0] + c[a1]); c[a1] = (c[a0] - c[a1]); c[a0] = tmp; }
105 
106 #if 0
107 static INLINE int32 MUL_32x32_H32(int32 c, int32 v)
108 {
109  int32 ret;
110 
111  asm("mulhw %0, %1, %2\n\t" : "=r"(ret) : "r"(c), "r"(v));
112 
113  return ret;
114  //return ((int64)v * c) >> 32;
115 }
116 #else
MUL_32x32_H32(const int32 & c,int32 v)117 static INLINE int32 MUL_32x32_H32(const int32& c, int32 v)
118 {
119 #if 0
120  int32 ret, dummy;
121 
122  asm("imull %3\n\t"
123 	: "=d"(ret), "=a"(dummy)
124 	: "a"(v), "m"(c)
125 	: "cc");
126  return ret;
127 #else
128  return ((int64)v * c) >> 32;
129 #endif
130 }
131 #endif
132 
133 template<unsigned psh>
IDCT_1D(int32 * __restrict__ c_in,int32 * __restrict__ o)134 static INLINE void IDCT_1D(int32* __restrict__ c_in, int32* __restrict__ o)
135 {
136  static const int32 coeffs[10] =
137  {
138   1779033704,
139 
140   C_COEFF( 0.5411961001461970),
141   C_COEFF(-1.8477590650225736),
142   C_COEFF( 0.7653668647301796),
143 
144   C_COEFF(-0.5555702330196022),
145   C_COEFF( 1.3870398453221474),
146   C_COEFF( 0.2758993792829430),
147 
148   C_COEFF( 0.1950903220161282),
149   C_COEFF( 0.7856949583871022),
150   C_COEFF(-1.1758756024193586),
151  };
152  int32 c[8];
153  int32 r;
154  int32 m;
155 
156  if(!psh)
157  {
158   c[0] = c_in[0] << (IDCT_PRESHIFT - EFF_RSHIFT_1D_COEFF);
159   c[4] = c_in[4] << (IDCT_PRESHIFT - EFF_RSHIFT_1D_COEFF);
160 
161   c[7] = (c_in[7] + c_in[1]) << IDCT_PRESHIFT;
162   c[1] = (c_in[7] - c_in[1]) << IDCT_PRESHIFT;
163 
164   c[3] = (46341 * c_in[5]) >> (15 - IDCT_PRESHIFT);
165   c[5] = (46341 * c_in[3]) >> (15 - IDCT_PRESHIFT);
166   //
167   //
168   //
169   //C_COEFF( 0.5411961001461970),
170   //C_COEFF(-1.8477590650225736),
171   //C_COEFF( 0.7653668647301796),
172   m = 35468 * (c_in[2] + c_in[6]);
173   c[2] = (-121095 * c_in[6] + m) >> (16 - IDCT_PRESHIFT + EFF_RSHIFT_1D_COEFF);
174   c[6] = (  50159 * c_in[2] + m) >> (16 - IDCT_PRESHIFT + EFF_RSHIFT_1D_COEFF);
175 /*
176   c_in[2] <<= IDCT_PRESHIFT;
177   c_in[6] <<= IDCT_PRESHIFT;
178   m = MUL_32x32_H32(coeffs[1], (c_in[2] + c_in[6]));
179   c[2] = MUL_32x32_H32(coeffs[2], c_in[6]) + m;
180   c[6] = MUL_32x32_H32(coeffs[3], c_in[2]) + m;
181 */
182  }
183  else
184  {
185   c[0] = (c_in[0] >> EFF_RSHIFT_1D_COEFF) + ((1 << psh) >> 1);
186   c[4] = c_in[4] >> EFF_RSHIFT_1D_COEFF;
187 
188   c[7] = c_in[7] + c_in[1];
189   c[1] = c_in[7] - c_in[1];
190 
191   //c[3] = c_in[5] + MUL_32x32_H32(coeffs[0], c_in[5]);
192   //c[5] = c_in[3] + MUL_32x32_H32(coeffs[0], c_in[3]);
193   //c[3] = c_in[5] + ((13572 * c_in[5]) >> 15);
194   //c[5] = c_in[3] + ((13572 * c_in[3]) >> 15);
195   c[3] = (c_in[5] * 181) >> 7;
196   c[5] = (c_in[3] * 181) >> 7;
197 /*
198   //c[3] = (c_in[5] * 181 + 64) >> 7;
199   //c[5] = (c_in[3] * 181 + 64) >> 7;
200   //
201   //
202   //
203 */
204   m = MUL_32x32_H32(coeffs[1], (c_in[2] + c_in[6]));
205   c[2] = MUL_32x32_H32(coeffs[2], c_in[6]) + m;
206   c[6] = MUL_32x32_H32(coeffs[3], c_in[2]) + m;
207 
208   //c[3] = (46341 * c_in[5]) >> (15 + EFF_RSHIFT_1D_COEFF);
209   //c[5] = (46341 * c_in[3]) >> (15 + EFF_RSHIFT_1D_COEFF);
210   //
211   //
212   //
213   //C_COEFF( 0.5411961001461970),
214   //C_COEFF(-1.8477590650225736),
215   //C_COEFF( 0.7653668647301796),
216   //m = 35468 * (c_in[2] + c_in[6]);
217   //c[2] = (-121095 * c_in[6] + m) >> (16 + EFF_RSHIFT_1D_COEFF);
218   //c[6] = (  50159 * c_in[2] + m) >> (16 + EFF_RSHIFT_1D_COEFF);
219  }
220  SNORP(0, 4)
221  SNORP(7, 5)
222  SNORP(3, 1)
223  //
224  //
225  //
226  m = MUL_32x32_H32(coeffs[4], (c[7] + c[1]));
227  r    = MUL_32x32_H32(coeffs[5], c[1]) + m;
228  c[1] = MUL_32x32_H32(coeffs[6], c[7]) - m;
229  //c[1] = ((565LL * c[7]) >> 13) - m;
230  c[7] = r;
231  //
232  //
233  //
234  //m = (799 * (c[3] + c[5])) >> (12 + EFF_RSHIFT_1D_COEFF);
235  m = MUL_32x32_H32(coeffs[7], (c[3] + c[5]));
236  r    = MUL_32x32_H32(coeffs[8], c[5]) + m;
237  c[5] = MUL_32x32_H32(coeffs[9], c[3]) + m;
238  c[3] = r;
239 
240  SNORP(0, 6)
241  SNORP(4, 2)
242  //
243  //
244  //
245  o[0 * 8] = (c[0] + c[1]) >> psh;
246  o[1 * 8] = (c[4] + c[5]) >> psh;
247  o[2 * 8] = (c[2] + c[3]) >> psh;
248  o[3 * 8] = (c[6] + c[7]) >> psh;
249  o[4 * 8] = (c[6] - c[7]) >> psh;
250  o[5 * 8] = (c[2] - c[3]) >> psh;
251  o[6 * 8] = (c[4] - c[5]) >> psh;
252  o[7 * 8] = (c[0] - c[1]) >> psh;
253 }
254 
255 template<unsigned psh>
IDCT_1D_Multi(int32 * __restrict__ c,int32 * __restrict__ o)256 static INLINE void IDCT_1D_Multi(int32* __restrict__ c, int32* __restrict__ o)
257 {
258  for(unsigned i = 0; i < 8; i++)
259   IDCT_1D<psh>(&c[i * 8], &o[i]);
260 }
261 
IDCT(int32 * c)262 void IDCT(int32* c)
263 {
264  int32 buf[64];
265 
266  static_assert(IDCT_PRESHIFT == EFF_RSHIFT_2D, "IDCT_PRESHIFT != EFF_RSHIFT_2D");
267 
268  IDCT_1D_Multi<0>(c, buf);
269  IDCT_1D_Multi<EFF_RSHIFT_1D_POST>(buf, c);
270 }
271 
272 }
273