1 /********************************************************************
2 * *
3 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
4 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
5 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
7 * *
8 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
9 * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
10 * *
11 ********************************************************************
12
13 function:
14 last mod: $Id$
15
16 ********************************************************************/
17
18 #include <string.h>
19 #include "internal.h"
20 #include "dct.h"
21
22 /*Performs an inverse 8 point Type-II DCT transform.
23 The output is scaled by a factor of 2 relative to the orthonormal version of
24 the transform.
25 _y: The buffer to store the result in.
26 Data will be placed in every 8th entry (e.g., in a column of an 8x8
27 block).
28 _x: The input coefficients.
29 The first 8 entries are used (e.g., from a row of an 8x8 block).*/
idct8(ogg_int16_t * _y,const ogg_int16_t _x[8])30 static void idct8(ogg_int16_t *_y,const ogg_int16_t _x[8]){
31 ogg_int32_t t[8];
32 ogg_int32_t r;
33 /*Stage 1:*/
34 /*0-1 butterfly.*/
35 t[0]=OC_C4S4*(ogg_int16_t)(_x[0]+_x[4])>>16;
36 t[1]=OC_C4S4*(ogg_int16_t)(_x[0]-_x[4])>>16;
37 /*2-3 rotation by 6pi/16.*/
38 t[2]=(OC_C6S2*_x[2]>>16)-(OC_C2S6*_x[6]>>16);
39 t[3]=(OC_C2S6*_x[2]>>16)+(OC_C6S2*_x[6]>>16);
40 /*4-7 rotation by 7pi/16.*/
41 t[4]=(OC_C7S1*_x[1]>>16)-(OC_C1S7*_x[7]>>16);
42 /*5-6 rotation by 3pi/16.*/
43 t[5]=(OC_C3S5*_x[5]>>16)-(OC_C5S3*_x[3]>>16);
44 t[6]=(OC_C5S3*_x[5]>>16)+(OC_C3S5*_x[3]>>16);
45 t[7]=(OC_C1S7*_x[1]>>16)+(OC_C7S1*_x[7]>>16);
46 /*Stage 2:*/
47 /*4-5 butterfly.*/
48 r=t[4]+t[5];
49 t[5]=OC_C4S4*(ogg_int16_t)(t[4]-t[5])>>16;
50 t[4]=r;
51 /*7-6 butterfly.*/
52 r=t[7]+t[6];
53 t[6]=OC_C4S4*(ogg_int16_t)(t[7]-t[6])>>16;
54 t[7]=r;
55 /*Stage 3:*/
56 /*0-3 butterfly.*/
57 r=t[0]+t[3];
58 t[3]=t[0]-t[3];
59 t[0]=r;
60 /*1-2 butterfly.*/
61 r=t[1]+t[2];
62 t[2]=t[1]-t[2];
63 t[1]=r;
64 /*6-5 butterfly.*/
65 r=t[6]+t[5];
66 t[5]=t[6]-t[5];
67 t[6]=r;
68 /*Stage 4:*/
69 /*0-7 butterfly.*/
70 _y[0<<3]=(ogg_int16_t)(t[0]+t[7]);
71 /*1-6 butterfly.*/
72 _y[1<<3]=(ogg_int16_t)(t[1]+t[6]);
73 /*2-5 butterfly.*/
74 _y[2<<3]=(ogg_int16_t)(t[2]+t[5]);
75 /*3-4 butterfly.*/
76 _y[3<<3]=(ogg_int16_t)(t[3]+t[4]);
77 _y[4<<3]=(ogg_int16_t)(t[3]-t[4]);
78 _y[5<<3]=(ogg_int16_t)(t[2]-t[5]);
79 _y[6<<3]=(ogg_int16_t)(t[1]-t[6]);
80 _y[7<<3]=(ogg_int16_t)(t[0]-t[7]);
81 }
82
83 /*Performs an inverse 8 point Type-II DCT transform.
84 The output is scaled by a factor of 2 relative to the orthonormal version of
85 the transform.
86 _y: The buffer to store the result in.
87 Data will be placed in every 8th entry (e.g., in a column of an 8x8
88 block).
89 _x: The input coefficients.
90 Only the first 4 entries are used.
91 The other 4 are assumed to be 0.*/
idct8_4(ogg_int16_t * _y,const ogg_int16_t _x[8])92 static void idct8_4(ogg_int16_t *_y,const ogg_int16_t _x[8]){
93 ogg_int32_t t[8];
94 ogg_int32_t r;
95 /*Stage 1:*/
96 t[0]=OC_C4S4*_x[0]>>16;
97 t[2]=OC_C6S2*_x[2]>>16;
98 t[3]=OC_C2S6*_x[2]>>16;
99 t[4]=OC_C7S1*_x[1]>>16;
100 t[5]=-(OC_C5S3*_x[3]>>16);
101 t[6]=OC_C3S5*_x[3]>>16;
102 t[7]=OC_C1S7*_x[1]>>16;
103 /*Stage 2:*/
104 r=t[4]+t[5];
105 t[5]=OC_C4S4*(ogg_int16_t)(t[4]-t[5])>>16;
106 t[4]=r;
107 r=t[7]+t[6];
108 t[6]=OC_C4S4*(ogg_int16_t)(t[7]-t[6])>>16;
109 t[7]=r;
110 /*Stage 3:*/
111 t[1]=t[0]+t[2];
112 t[2]=t[0]-t[2];
113 r=t[0]+t[3];
114 t[3]=t[0]-t[3];
115 t[0]=r;
116 r=t[6]+t[5];
117 t[5]=t[6]-t[5];
118 t[6]=r;
119 /*Stage 4:*/
120 _y[0<<3]=(ogg_int16_t)(t[0]+t[7]);
121 _y[1<<3]=(ogg_int16_t)(t[1]+t[6]);
122 _y[2<<3]=(ogg_int16_t)(t[2]+t[5]);
123 _y[3<<3]=(ogg_int16_t)(t[3]+t[4]);
124 _y[4<<3]=(ogg_int16_t)(t[3]-t[4]);
125 _y[5<<3]=(ogg_int16_t)(t[2]-t[5]);
126 _y[6<<3]=(ogg_int16_t)(t[1]-t[6]);
127 _y[7<<3]=(ogg_int16_t)(t[0]-t[7]);
128 }
129
130 /*Performs an inverse 8 point Type-II DCT transform.
131 The output is scaled by a factor of 2 relative to the orthonormal version of
132 the transform.
133 _y: The buffer to store the result in.
134 Data will be placed in every 8th entry (e.g., in a column of an 8x8
135 block).
136 _x: The input coefficients.
137 Only the first 3 entries are used.
138 The other 5 are assumed to be 0.*/
idct8_3(ogg_int16_t * _y,const ogg_int16_t _x[8])139 static void idct8_3(ogg_int16_t *_y,const ogg_int16_t _x[8]){
140 ogg_int32_t t[8];
141 ogg_int32_t r;
142 /*Stage 1:*/
143 t[0]=OC_C4S4*_x[0]>>16;
144 t[2]=OC_C6S2*_x[2]>>16;
145 t[3]=OC_C2S6*_x[2]>>16;
146 t[4]=OC_C7S1*_x[1]>>16;
147 t[7]=OC_C1S7*_x[1]>>16;
148 /*Stage 2:*/
149 t[5]=OC_C4S4*t[4]>>16;
150 t[6]=OC_C4S4*t[7]>>16;
151 /*Stage 3:*/
152 t[1]=t[0]+t[2];
153 t[2]=t[0]-t[2];
154 r=t[0]+t[3];
155 t[3]=t[0]-t[3];
156 t[0]=r;
157 r=t[6]+t[5];
158 t[5]=t[6]-t[5];
159 t[6]=r;
160 /*Stage 4:*/
161 _y[0<<3]=(ogg_int16_t)(t[0]+t[7]);
162 _y[1<<3]=(ogg_int16_t)(t[1]+t[6]);
163 _y[2<<3]=(ogg_int16_t)(t[2]+t[5]);
164 _y[3<<3]=(ogg_int16_t)(t[3]+t[4]);
165 _y[4<<3]=(ogg_int16_t)(t[3]-t[4]);
166 _y[5<<3]=(ogg_int16_t)(t[2]-t[5]);
167 _y[6<<3]=(ogg_int16_t)(t[1]-t[6]);
168 _y[7<<3]=(ogg_int16_t)(t[0]-t[7]);
169 }
170
171 /*Performs an inverse 8 point Type-II DCT transform.
172 The output is scaled by a factor of 2 relative to the orthonormal version of
173 the transform.
174 _y: The buffer to store the result in.
175 Data will be placed in every 8th entry (e.g., in a column of an 8x8
176 block).
177 _x: The input coefficients.
178 Only the first 2 entries are used.
179 The other 6 are assumed to be 0.*/
idct8_2(ogg_int16_t * _y,const ogg_int16_t _x[8])180 static void idct8_2(ogg_int16_t *_y,const ogg_int16_t _x[8]){
181 ogg_int32_t t[8];
182 ogg_int32_t r;
183 /*Stage 1:*/
184 t[0]=OC_C4S4*_x[0]>>16;
185 t[4]=OC_C7S1*_x[1]>>16;
186 t[7]=OC_C1S7*_x[1]>>16;
187 /*Stage 2:*/
188 t[5]=OC_C4S4*t[4]>>16;
189 t[6]=OC_C4S4*t[7]>>16;
190 /*Stage 3:*/
191 r=t[6]+t[5];
192 t[5]=t[6]-t[5];
193 t[6]=r;
194 /*Stage 4:*/
195 _y[0<<3]=(ogg_int16_t)(t[0]+t[7]);
196 _y[1<<3]=(ogg_int16_t)(t[0]+t[6]);
197 _y[2<<3]=(ogg_int16_t)(t[0]+t[5]);
198 _y[3<<3]=(ogg_int16_t)(t[0]+t[4]);
199 _y[4<<3]=(ogg_int16_t)(t[0]-t[4]);
200 _y[5<<3]=(ogg_int16_t)(t[0]-t[5]);
201 _y[6<<3]=(ogg_int16_t)(t[0]-t[6]);
202 _y[7<<3]=(ogg_int16_t)(t[0]-t[7]);
203 }
204
205 /*Performs an inverse 8 point Type-II DCT transform.
206 The output is scaled by a factor of 2 relative to the orthonormal version of
207 the transform.
208 _y: The buffer to store the result in.
209 Data will be placed in every 8th entry (e.g., in a column of an 8x8
210 block).
211 _x: The input coefficients.
212 Only the first entry is used.
213 The other 7 are assumed to be 0.*/
idct8_1(ogg_int16_t * _y,const ogg_int16_t _x[1])214 static void idct8_1(ogg_int16_t *_y,const ogg_int16_t _x[1]){
215 _y[0<<3]=_y[1<<3]=_y[2<<3]=_y[3<<3]=
216 _y[4<<3]=_y[5<<3]=_y[6<<3]=_y[7<<3]=(ogg_int16_t)(OC_C4S4*_x[0]>>16);
217 }
218
219 /*Performs an inverse 8x8 Type-II DCT transform.
220 The input is assumed to be scaled by a factor of 4 relative to orthonormal
221 version of the transform.
222 All coefficients but the first 3 in zig-zag scan order are assumed to be 0:
223 x x 0 0 0 0 0 0
224 x 0 0 0 0 0 0 0
225 0 0 0 0 0 0 0 0
226 0 0 0 0 0 0 0 0
227 0 0 0 0 0 0 0 0
228 0 0 0 0 0 0 0 0
229 0 0 0 0 0 0 0 0
230 0 0 0 0 0 0 0 0
231 _y: The buffer to store the result in.
232 This may be the same as _x.
233 _x: The input coefficients.*/
oc_idct8x8_3(ogg_int16_t _y[64],const ogg_int16_t _x[64])234 static void oc_idct8x8_3(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
235 const ogg_int16_t *in;
236 ogg_int16_t *end;
237 ogg_int16_t *out;
238 ogg_int16_t w[64];
239 /*Transform rows of x into columns of w.*/
240 idct8_2(w,_x);
241 idct8_1(w+1,_x+8);
242 /*Transform rows of w into columns of y.*/
243 for(in=w,out=_y,end=out+8;out<end;in+=8,out++)idct8_2(out,in);
244 /*Adjust for the scale factor.*/
245 for(out=_y,end=out+64;out<end;out++)*out=(ogg_int16_t)(*out+8>>4);
246 }
247
248 /*Performs an inverse 8x8 Type-II DCT transform.
249 The input is assumed to be scaled by a factor of 4 relative to orthonormal
250 version of the transform.
251 All coefficients but the first 10 in zig-zag scan order are assumed to be 0:
252 x x x x 0 0 0 0
253 x x x 0 0 0 0 0
254 x x 0 0 0 0 0 0
255 x 0 0 0 0 0 0 0
256 0 0 0 0 0 0 0 0
257 0 0 0 0 0 0 0 0
258 0 0 0 0 0 0 0 0
259 0 0 0 0 0 0 0 0
260 _y: The buffer to store the result in.
261 This may be the same as _x.
262 _x: The input coefficients.*/
oc_idct8x8_10(ogg_int16_t _y[64],const ogg_int16_t _x[64])263 static void oc_idct8x8_10(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
264 const ogg_int16_t *in;
265 ogg_int16_t *end;
266 ogg_int16_t *out;
267 ogg_int16_t w[64];
268 /*Transform rows of x into columns of w.*/
269 idct8_4(w,_x);
270 idct8_3(w+1,_x+8);
271 idct8_2(w+2,_x+16);
272 idct8_1(w+3,_x+24);
273 /*Transform rows of w into columns of y.*/
274 for(in=w,out=_y,end=out+8;out<end;in+=8,out++)idct8_4(out,in);
275 /*Adjust for the scale factor.*/
276 for(out=_y,end=out+64;out<end;out++)*out=(ogg_int16_t)(*out+8>>4);
277 }
278
279 /*Performs an inverse 8x8 Type-II DCT transform.
280 The input is assumed to be scaled by a factor of 4 relative to orthonormal
281 version of the transform.
282 _y: The buffer to store the result in.
283 This may be the same as _x.
284 _x: The input coefficients.*/
oc_idct8x8_slow(ogg_int16_t _y[64],const ogg_int16_t _x[64])285 static void oc_idct8x8_slow(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
286 const ogg_int16_t *in;
287 ogg_int16_t *end;
288 ogg_int16_t *out;
289 ogg_int16_t w[64];
290 /*Transform rows of x into columns of w.*/
291 for(in=_x,out=w,end=out+8;out<end;in+=8,out++)idct8(out,in);
292 /*Transform rows of w into columns of y.*/
293 for(in=w,out=_y,end=out+8;out<end;in+=8,out++)idct8(out,in);
294 /*Adjust for the scale factor.*/
295 for(out=_y,end=out+64;out<end;out++)*out=(ogg_int16_t)(*out+8>>4);
296 }
297
/*Performs an inverse 8x8 DCT in place by calling the idct8x8 entry of the
   state's opt_vtable function table.
  _state:    The state whose opt_vtable.idct8x8 function pointer is called.
  _y:        The 64 coefficients; passed through to the selected routine
              unchanged.
  _last_zzi: Passed through to the selected routine unchanged.*/
void oc_idct8x8(const oc_theora_state *_state,ogg_int16_t _y[64],
 int _last_zzi){
  _state->opt_vtable.idct8x8(_y,_last_zzi);
}
302
303 /*Performs an inverse 8x8 Type-II DCT transform.
304 The input is assumed to be scaled by a factor of 4 relative to orthonormal
305 version of the transform.*/
oc_idct8x8_c(ogg_int16_t _y[64],int _last_zzi)306 void oc_idct8x8_c(ogg_int16_t _y[64],int _last_zzi){
307 /*_last_zzi is subtly different from an actual count of the number of
308 coefficients we decoded for this block.
309 It contains the value of zzi BEFORE the final token in the block was
310 decoded.
311 In most cases this is an EOB token (the continuation of an EOB run from a
312 previous block counts), and so this is the same as the coefficient count.
313 However, in the case that the last token was NOT an EOB token, but filled
314 the block up with exactly 64 coefficients, _last_zzi will be less than 64.
315 Provided the last token was not a pure zero run, the minimum value it can
316 be is 46, and so that doesn't affect any of the cases in this routine.
317 However, if the last token WAS a pure zero run of length 63, then _last_zzi
318 will be 1 while the number of coefficients decoded is 64.
319 Thus, we will trigger the following special case, where the real
320 coefficient count would not.
321 Note also that a zero run of length 64 will give _last_zzi a value of 0,
322 but we still process the DC coefficient, which might have a non-zero value
323 due to DC prediction.
324 Although convoluted, this is arguably the correct behavior: it allows us to
325 use a smaller transform when the block ends with a long zero run instead
326 of a normal EOB token.
327 It could be smarter... multiple separate zero runs at the end of a block
328 will fool it, but an encoder that generates these really deserves what it
329 gets.
330 Needless to say we inherited this approach from VP3.*/
331 /*Then perform the iDCT.*/
332 if(_last_zzi<3)oc_idct8x8_3(_y,_y);
333 else if(_last_zzi<10)oc_idct8x8_10(_y,_y);
334 else oc_idct8x8_slow(_y,_y);
335 }
336