1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <math.h>
12 #include <stdlib.h>
13 #include <string.h>
14
15 #include "./vpx_dsp_rtcd.h"
16 #include "vpx_dsp/inv_txfm.h"
17
void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     0.5 shifts per pixel. */
  int i;
  tran_low_t output[16];  // intermediate 4x4 result after the row pass
  tran_high_t a1, b1, c1, d1, e1;
  const tran_low_t *ip = input;
  tran_low_t *op = output;

  // Pass 1: inverse WHT on each of the 4 rows. The UNIT_QUANT_SHIFT
  // right-shift undoes the scaling applied on the forward side.
  for (i = 0; i < 4; i++) {
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;  // shared intermediate for the two middle outputs
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    op[0] = WRAPLOW(a1);
    op[1] = WRAPLOW(b1);
    op[2] = WRAPLOW(c1);
    op[3] = WRAPLOW(d1);
    ip += 4;
    op += 4;
  }

  // Pass 2: inverse WHT on each column of the row-pass output, then add the
  // reconstructed residual into the destination pixels with clipping.
  ip = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[4 * 0];
    c1 = ip[4 * 1];
    d1 = ip[4 * 2];
    b1 = ip[4 * 3];
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1));
    dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1));
    dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1));
    dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1));

    ip++;
    dest++;
  }
}
69
// DC-only inverse WHT: only in[0] is non-zero. The DC value is split into
// a1/e1 for row 0 and rows 1..3, then the same split is applied per column.
void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int stride) {
  int col;
  tran_high_t a1, e1;
  tran_low_t tmp[4];

  a1 = in[0] >> UNIT_QUANT_SHIFT;
  e1 = a1 >> 1;
  a1 -= e1;
  tmp[0] = WRAPLOW(a1);
  tmp[1] = tmp[2] = tmp[3] = WRAPLOW(e1);

  // Column pass: first output row gets the larger half, the rest get e1.
  for (col = 0; col < 4; ++col) {
    const tran_high_t half = tmp[col] >> 1;
    const tran_high_t rest = tmp[col] - half;
    dest[col + stride * 0] = clip_pixel_add(dest[col + stride * 0], rest);
    dest[col + stride * 1] = clip_pixel_add(dest[col + stride * 1], half);
    dest[col + stride * 2] = clip_pixel_add(dest[col + stride * 2], half);
    dest[col + stride * 3] = clip_pixel_add(dest[col + stride * 3], half);
  }
}
95
// 4-point inverse DCT (one dimension). Intermediate products are kept in
// tran_high_t and narrowed back via dct_const_round_shift()/WRAPLOW so the
// result stays bit-exact with the reference transform.
void idct4_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step[4];
  tran_high_t temp1, temp2;

  // stage 1: even half (inputs 0,2) and odd half (inputs 1,3) butterflies
  temp1 = (input[0] + input[2]) * cospi_16_64;
  temp2 = (input[0] - input[2]) * cospi_16_64;
  step[0] = WRAPLOW(dct_const_round_shift(temp1));
  step[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  step[2] = WRAPLOW(dct_const_round_shift(temp1));
  step[3] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2: combine the two halves into the 4 outputs
  output[0] = WRAPLOW(step[0] + step[3]);
  output[1] = WRAPLOW(step[1] + step[2]);
  output[2] = WRAPLOW(step[1] - step[2]);
  output[3] = WRAPLOW(step[0] - step[3]);
}
116
// Full 4x4 inverse DCT: 1-D idct4 over rows, then over columns, followed by
// a final ROUND_POWER_OF_TWO(., 4) and a clipped add into dest.
void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t out[4 * 4];
  tran_low_t col_in[4], col_out[4];
  int r, c;

  // Pass 1: rows.
  for (r = 0; r < 4; ++r) idct4_c(input + 4 * r, out + 4 * r);

  // Pass 2: columns, with rounding and reconstruction.
  for (c = 0; c < 4; ++c) {
    for (r = 0; r < 4; ++r) col_in[r] = out[r * 4 + c];
    idct4_c(col_in, col_out);
    for (r = 0; r < 4; ++r) {
      uint8_t *const p = dest + r * stride + c;
      *p = clip_pixel_add(*p, ROUND_POWER_OF_TWO(col_out[r], 4));
    }
  }
}
140
// DC-only 4x4 inverse DCT: the single coefficient is scaled twice by
// cospi_16_64 (row pass + column pass), rounded, and added uniformly.
void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int r, c;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));

  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  a1 = ROUND_POWER_OF_TWO(out, 4);

  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) dest[c] = clip_pixel_add(dest[c], a1);
    dest += stride;
  }
}
157
// 8-point inverse DCT (one dimension). Even-indexed inputs feed the embedded
// 4-point IDCT (stages 2-3); odd-indexed inputs form the odd half.
void idct8_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;

  // stage 1: load even half and rotate the odd inputs (1,7) and (5,3)
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2: 4-point IDCT butterflies on the even half; add/sub on the odd
  temp1 = (step1[0] + step1[2]) * cospi_16_64;
  temp2 = (step1[0] - step1[2]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);

  // stage 3: finish the even half; rotate (5,6) by cospi_16_64
  step1[0] = WRAPLOW(step2[0] + step2[3]);
  step1[1] = WRAPLOW(step2[1] + step2[2]);
  step1[2] = WRAPLOW(step2[1] - step2[2]);
  step1[3] = WRAPLOW(step2[0] - step2[3]);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];

  // stage 4: final cross between even and odd halves
  output[0] = WRAPLOW(step1[0] + step1[7]);
  output[1] = WRAPLOW(step1[1] + step1[6]);
  output[2] = WRAPLOW(step1[2] + step1[5]);
  output[3] = WRAPLOW(step1[3] + step1[4]);
  output[4] = WRAPLOW(step1[3] - step1[4]);
  output[5] = WRAPLOW(step1[2] - step1[5]);
  output[6] = WRAPLOW(step1[1] - step1[6]);
  output[7] = WRAPLOW(step1[0] - step1[7]);
}
212
// Full 8x8 inverse DCT (all 64 coefficients): 1-D idct8 over rows, then over
// columns, then ROUND_POWER_OF_TWO(., 5) and a clipped add into dest.
void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t out[8 * 8];
  tran_low_t col_in[8], col_out[8];
  int r, c;

  // Pass 1: rows.
  for (r = 0; r < 8; ++r) idct8_c(input + 8 * r, out + 8 * r);

  // Pass 2: columns, with rounding and reconstruction.
  for (c = 0; c < 8; ++c) {
    for (r = 0; r < 8; ++r) col_in[r] = out[r * 8 + c];
    idct8_c(col_in, col_out);
    for (r = 0; r < 8; ++r) {
      uint8_t *const p = dest + r * stride + c;
      *p = clip_pixel_add(*p, ROUND_POWER_OF_TWO(col_out[r], 5));
    }
  }
}
236
// DC-only 8x8 inverse DCT: scale input[0] twice by cospi_16_64 (row pass +
// column pass), round by 1/32, and add the constant to every pixel.
void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int r, c;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));

  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  a1 = ROUND_POWER_OF_TWO(out, 5);
  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c) dest[c] = clip_pixel_add(dest[c], a1);
    dest += stride;
  }
}
249
// 4-point inverse ADST (one dimension), computed from sinpi_{1..4}_9
// products.
void iadst4_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
  tran_low_t x0 = input[0];
  tran_low_t x1 = input[1];
  tran_low_t x2 = input[2];
  tran_low_t x3 = input[3];

  // All-zero input yields all-zero output; skip the arithmetic.
  if (!(x0 | x1 | x2 | x3)) {
    memset(output, 0, 4 * sizeof(*output));
    return;
  }

  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = WRAPLOW(x0 - x2 + x3);

  s0 = s0 + s3 + s5;
  s1 = s1 - s4 - s6;
  s3 = s2;
  s2 = sinpi_3_9 * s7;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3));
  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3));
  output[2] = WRAPLOW(dct_const_round_shift(s2));
  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
}
285
// 8-point inverse ADST (one dimension). Inputs are consumed in the
// permuted order x0=input[7], x1=input[0], ... shown below; intermediate
// products are narrowed to int (the 29b dynamic range noted in iadst4_c
// fits a 32-bit int, which this code assumes).
void iadst8_c(const tran_low_t *input, tran_low_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7;
  tran_high_t x0 = input[7];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[5];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[3];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[1];
  tran_high_t x7 = input[6];

  // All-zero input yields all-zero output; skip the arithmetic.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    memset(output, 0, 8 * sizeof(*output));
    return;
  }

  // stage 1
  s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
  s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
  s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
  s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
  s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
  s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
  s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
  s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);

  x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
  x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
  x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
  x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
  x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
  x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
  x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s3 - s7));

  // stage 2
  s0 = (int)x0;
  s1 = (int)x1;
  s2 = (int)x2;
  s3 = (int)x3;
  s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
  s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
  s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
  s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);

  x0 = WRAPLOW(s0 + s2);
  x1 = WRAPLOW(s1 + s3);
  x2 = WRAPLOW(s0 - s2);
  x3 = WRAPLOW(s1 - s3);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));

  // stage 3: final cospi_16_64 rotations
  s2 = (int)(cospi_16_64 * (x2 + x3));
  s3 = (int)(cospi_16_64 * (x2 - x3));
  s6 = (int)(cospi_16_64 * (x6 + x7));
  s7 = (int)(cospi_16_64 * (x6 - x7));

  x2 = WRAPLOW(dct_const_round_shift(s2));
  x3 = WRAPLOW(dct_const_round_shift(s3));
  x6 = WRAPLOW(dct_const_round_shift(s6));
  x7 = WRAPLOW(dct_const_round_shift(s7));

  // Output permutation with sign flips on odd positions.
  output[0] = WRAPLOW(x0);
  output[1] = WRAPLOW(-x4);
  output[2] = WRAPLOW(x6);
  output[3] = WRAPLOW(-x2);
  output[4] = WRAPLOW(x3);
  output[5] = WRAPLOW(-x7);
  output[6] = WRAPLOW(x5);
  output[7] = WRAPLOW(-x1);
}
360
// 8x8 inverse DCT for the sparse case: only the first 4 rows of input hold
// non-zero coefficients, so the row pass covers rows 0..3 and the rest of
// the intermediate buffer stays zero-initialized.
void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t out[8 * 8] = { 0 };
  tran_low_t col_in[8], col_out[8];
  int r, c;

  // Pass 1: rows 0..3 only.
  for (r = 0; r < 4; ++r) idct8_c(input + 8 * r, out + 8 * r);

  // Pass 2: all 8 columns, with rounding and reconstruction.
  for (c = 0; c < 8; ++c) {
    for (r = 0; r < 8; ++r) col_in[r] = out[r * 8 + c];
    idct8_c(col_in, col_out);
    for (r = 0; r < 8; ++r) {
      uint8_t *const p = dest + r * stride + c;
      *p = clip_pixel_add(*p, ROUND_POWER_OF_TWO(col_out[r], 5));
    }
  }
}
385
// 16-point inverse DCT (one dimension). Seven butterfly stages; products are
// computed in tran_high_t and narrowed by dct_const_round_shift()/WRAPLOW.
void idct16_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[16], step2[16];
  tran_high_t temp1, temp2;

  // stage 1: reorder inputs (even/odd deinterleave; indices written as n/2)
  step1[0] = input[0 / 2];
  step1[1] = input[16 / 2];
  step1[2] = input[8 / 2];
  step1[3] = input[24 / 2];
  step1[4] = input[4 / 2];
  step1[5] = input[20 / 2];
  step1[6] = input[12 / 2];
  step1[7] = input[28 / 2];
  step1[8] = input[2 / 2];
  step1[9] = input[18 / 2];
  step1[10] = input[10 / 2];
  step1[11] = input[26 / 2];
  step1[12] = input[6 / 2];
  step1[13] = input[22 / 2];
  step1[14] = input[14 / 2];
  step1[15] = input[30 / 2];

  // stage 2: pass the low half through; rotate pairs in the high half
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
  step2[15] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 3: rotate (4,7) and (5,6); add/sub pairs in the high half
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  step1[8] = WRAPLOW(step2[8] + step2[9]);
  step1[9] = WRAPLOW(step2[8] - step2[9]);
  step1[10] = WRAPLOW(-step2[10] + step2[11]);
  step1[11] = WRAPLOW(step2[10] + step2[11]);
  step1[12] = WRAPLOW(step2[12] + step2[13]);
  step1[13] = WRAPLOW(step2[12] - step2[13]);
  step1[14] = WRAPLOW(-step2[14] + step2[15]);
  step1[15] = WRAPLOW(step2[14] + step2[15]);

  // stage 4: 4-point IDCT butterflies on 0..3; combine 4..7; rotate 9/14
  // and 10/13
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5: finish the 8-point part; add/sub quads in the high half
  step1[0] = WRAPLOW(step2[0] + step2[3]);
  step1[1] = WRAPLOW(step2[1] + step2[2]);
  step1[2] = WRAPLOW(step2[1] - step2[2]);
  step1[3] = WRAPLOW(step2[0] - step2[3]);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11]);
  step1[9] = WRAPLOW(step2[9] + step2[10]);
  step1[10] = WRAPLOW(step2[9] - step2[10]);
  step1[11] = WRAPLOW(step2[8] - step2[11]);
  step1[12] = WRAPLOW(-step2[12] + step2[15]);
  step1[13] = WRAPLOW(-step2[13] + step2[14]);
  step1[14] = WRAPLOW(step2[13] + step2[14]);
  step1[15] = WRAPLOW(step2[12] + step2[15]);

  // stage 6: final even-half combine; cospi_16_64 rotations on 10..13
  step2[0] = WRAPLOW(step1[0] + step1[7]);
  step2[1] = WRAPLOW(step1[1] + step1[6]);
  step2[2] = WRAPLOW(step1[2] + step1[5]);
  step2[3] = WRAPLOW(step1[3] + step1[4]);
  step2[4] = WRAPLOW(step1[3] - step1[4]);
  step2[5] = WRAPLOW(step1[2] - step1[5]);
  step2[6] = WRAPLOW(step1[1] - step1[6]);
  step2[7] = WRAPLOW(step1[0] - step1[7]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7: final cross between the low and (reversed) high halves
  output[0] = WRAPLOW(step2[0] + step2[15]);
  output[1] = WRAPLOW(step2[1] + step2[14]);
  output[2] = WRAPLOW(step2[2] + step2[13]);
  output[3] = WRAPLOW(step2[3] + step2[12]);
  output[4] = WRAPLOW(step2[4] + step2[11]);
  output[5] = WRAPLOW(step2[5] + step2[10]);
  output[6] = WRAPLOW(step2[6] + step2[9]);
  output[7] = WRAPLOW(step2[7] + step2[8]);
  output[8] = WRAPLOW(step2[7] - step2[8]);
  output[9] = WRAPLOW(step2[6] - step2[9]);
  output[10] = WRAPLOW(step2[5] - step2[10]);
  output[11] = WRAPLOW(step2[4] - step2[11]);
  output[12] = WRAPLOW(step2[3] - step2[12]);
  output[13] = WRAPLOW(step2[2] - step2[13]);
  output[14] = WRAPLOW(step2[1] - step2[14]);
  output[15] = WRAPLOW(step2[0] - step2[15]);
}
550
// Full 16x16 inverse DCT (all 256 coefficients): 1-D idct16 over rows, then
// over columns, then ROUND_POWER_OF_TWO(., 6) and a clipped add into dest.
void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  tran_low_t out[16 * 16];
  tran_low_t col_in[16], col_out[16];
  int r, c;

  // Pass 1: rows.
  for (r = 0; r < 16; ++r) idct16_c(input + 16 * r, out + 16 * r);

  // Pass 2: columns, with rounding and reconstruction.
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r) col_in[r] = out[r * 16 + c];
    idct16_c(col_in, col_out);
    for (r = 0; r < 16; ++r) {
      uint8_t *const p = dest + r * stride + c;
      *p = clip_pixel_add(*p, ROUND_POWER_OF_TWO(col_out[r], 6));
    }
  }
}
575
// 16-point inverse ADST (one dimension). Inputs are consumed in the
// permuted order shown below; four butterfly stages followed by an output
// permutation with sign flips.
void iadst16_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;
  tran_high_t x0 = input[15];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[13];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[11];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[9];
  tran_high_t x7 = input[6];
  tran_high_t x8 = input[7];
  tran_high_t x9 = input[8];
  tran_high_t x10 = input[5];
  tran_high_t x11 = input[10];
  tran_high_t x12 = input[3];
  tran_high_t x13 = input[12];
  tran_high_t x14 = input[1];
  tran_high_t x15 = input[14];

  // All-zero input yields all-zero output; skip the arithmetic.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
        x13 | x14 | x15)) {
    memset(output, 0, 16 * sizeof(*output));
    return;
  }

  // stage 1: rotate input pairs by the odd cospi constants
  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;

  x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
  x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
  x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
  x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
  x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
  x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
  x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
  x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
  x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
  x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
  x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
  x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
  x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
  x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s7 - s15));

  // stage 2: pass 0..7 through; rotate 8..15 by cospi_4/12/20/28
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

  x0 = WRAPLOW(s0 + s4);
  x1 = WRAPLOW(s1 + s5);
  x2 = WRAPLOW(s2 + s6);
  x3 = WRAPLOW(s3 + s7);
  x4 = WRAPLOW(s0 - s4);
  x5 = WRAPLOW(s1 - s5);
  x6 = WRAPLOW(s2 - s6);
  x7 = WRAPLOW(s3 - s7);
  x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
  x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
  x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
  x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
  x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
  x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
  x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s11 - s15));

  // stage 3: cospi_8/24 rotations on quads 4..7 and 12..15
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

  x0 = WRAPLOW(s0 + s2);
  x1 = WRAPLOW(s1 + s3);
  x2 = WRAPLOW(s0 - s2);
  x3 = WRAPLOW(s1 - s3);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
  x8 = WRAPLOW(s8 + s10);
  x9 = WRAPLOW(s9 + s11);
  x10 = WRAPLOW(s8 - s10);
  x11 = WRAPLOW(s9 - s11);
  x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
  x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
  x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s13 - s15));

  // stage 4: final cospi_16_64 rotations
  s2 = (-cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (-x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (-x10 + x11);
  s14 = (-cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = WRAPLOW(dct_const_round_shift(s2));
  x3 = WRAPLOW(dct_const_round_shift(s3));
  x6 = WRAPLOW(dct_const_round_shift(s6));
  x7 = WRAPLOW(dct_const_round_shift(s7));
  x10 = WRAPLOW(dct_const_round_shift(s10));
  x11 = WRAPLOW(dct_const_round_shift(s11));
  x14 = WRAPLOW(dct_const_round_shift(s14));
  x15 = WRAPLOW(dct_const_round_shift(s15));

  // Output permutation with sign flips.
  output[0] = WRAPLOW(x0);
  output[1] = WRAPLOW(-x8);
  output[2] = WRAPLOW(x12);
  output[3] = WRAPLOW(-x4);
  output[4] = WRAPLOW(x6);
  output[5] = WRAPLOW(x14);
  output[6] = WRAPLOW(x10);
  output[7] = WRAPLOW(x2);
  output[8] = WRAPLOW(x3);
  output[9] = WRAPLOW(x11);
  output[10] = WRAPLOW(x15);
  output[11] = WRAPLOW(x7);
  output[12] = WRAPLOW(x5);
  output[13] = WRAPLOW(-x13);
  output[14] = WRAPLOW(x9);
  output[15] = WRAPLOW(-x1);
}
743
// 16x16 inverse DCT for the sparse case: all non-zero coefficients lie in
// the upper-left 4x4 area, so the row pass covers rows 0..3 only and the
// rest of the intermediate buffer stays zero-initialized.
void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  tran_low_t out[16 * 16] = { 0 };
  tran_low_t col_in[16], col_out[16];
  int r, c;

  // Pass 1: rows 0..3 only.
  for (r = 0; r < 4; ++r) idct16_c(input + 16 * r, out + 16 * r);

  // Pass 2: all 16 columns, with rounding and reconstruction.
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r) col_in[r] = out[r * 16 + c];
    idct16_c(col_in, col_out);
    for (r = 0; r < 16; ++r) {
      uint8_t *const p = dest + r * stride + c;
      *p = clip_pixel_add(*p, ROUND_POWER_OF_TWO(col_out[r], 6));
    }
  }
}
769
// DC-only 16x16 inverse DCT: scale input[0] twice by cospi_16_64 (row pass +
// column pass), round by 1/64, and add the constant to every pixel.
void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int r, c;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));

  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  a1 = ROUND_POWER_OF_TWO(out, 6);
  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) dest[c] = clip_pixel_add(dest[c], a1);
    dest += stride;
  }
}
782
idct32_c(const tran_low_t * input,tran_low_t * output)783 void idct32_c(const tran_low_t *input, tran_low_t *output) {
784 tran_low_t step1[32], step2[32];
785 tran_high_t temp1, temp2;
786
787 // stage 1
788 step1[0] = input[0];
789 step1[1] = input[16];
790 step1[2] = input[8];
791 step1[3] = input[24];
792 step1[4] = input[4];
793 step1[5] = input[20];
794 step1[6] = input[12];
795 step1[7] = input[28];
796 step1[8] = input[2];
797 step1[9] = input[18];
798 step1[10] = input[10];
799 step1[11] = input[26];
800 step1[12] = input[6];
801 step1[13] = input[22];
802 step1[14] = input[14];
803 step1[15] = input[30];
804
805 temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
806 temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
807 step1[16] = WRAPLOW(dct_const_round_shift(temp1));
808 step1[31] = WRAPLOW(dct_const_round_shift(temp2));
809
810 temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
811 temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
812 step1[17] = WRAPLOW(dct_const_round_shift(temp1));
813 step1[30] = WRAPLOW(dct_const_round_shift(temp2));
814
815 temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
816 temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
817 step1[18] = WRAPLOW(dct_const_round_shift(temp1));
818 step1[29] = WRAPLOW(dct_const_round_shift(temp2));
819
820 temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
821 temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
822 step1[19] = WRAPLOW(dct_const_round_shift(temp1));
823 step1[28] = WRAPLOW(dct_const_round_shift(temp2));
824
825 temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
826 temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
827 step1[20] = WRAPLOW(dct_const_round_shift(temp1));
828 step1[27] = WRAPLOW(dct_const_round_shift(temp2));
829
830 temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
831 temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
832 step1[21] = WRAPLOW(dct_const_round_shift(temp1));
833 step1[26] = WRAPLOW(dct_const_round_shift(temp2));
834
835 temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
836 temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
837 step1[22] = WRAPLOW(dct_const_round_shift(temp1));
838 step1[25] = WRAPLOW(dct_const_round_shift(temp2));
839
840 temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
841 temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
842 step1[23] = WRAPLOW(dct_const_round_shift(temp1));
843 step1[24] = WRAPLOW(dct_const_round_shift(temp2));
844
845 // stage 2
846 step2[0] = step1[0];
847 step2[1] = step1[1];
848 step2[2] = step1[2];
849 step2[3] = step1[3];
850 step2[4] = step1[4];
851 step2[5] = step1[5];
852 step2[6] = step1[6];
853 step2[7] = step1[7];
854
855 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
856 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
857 step2[8] = WRAPLOW(dct_const_round_shift(temp1));
858 step2[15] = WRAPLOW(dct_const_round_shift(temp2));
859
860 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
861 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
862 step2[9] = WRAPLOW(dct_const_round_shift(temp1));
863 step2[14] = WRAPLOW(dct_const_round_shift(temp2));
864
865 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
866 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
867 step2[10] = WRAPLOW(dct_const_round_shift(temp1));
868 step2[13] = WRAPLOW(dct_const_round_shift(temp2));
869
870 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
871 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
872 step2[11] = WRAPLOW(dct_const_round_shift(temp1));
873 step2[12] = WRAPLOW(dct_const_round_shift(temp2));
874
875 step2[16] = WRAPLOW(step1[16] + step1[17]);
876 step2[17] = WRAPLOW(step1[16] - step1[17]);
877 step2[18] = WRAPLOW(-step1[18] + step1[19]);
878 step2[19] = WRAPLOW(step1[18] + step1[19]);
879 step2[20] = WRAPLOW(step1[20] + step1[21]);
880 step2[21] = WRAPLOW(step1[20] - step1[21]);
881 step2[22] = WRAPLOW(-step1[22] + step1[23]);
882 step2[23] = WRAPLOW(step1[22] + step1[23]);
883 step2[24] = WRAPLOW(step1[24] + step1[25]);
884 step2[25] = WRAPLOW(step1[24] - step1[25]);
885 step2[26] = WRAPLOW(-step1[26] + step1[27]);
886 step2[27] = WRAPLOW(step1[26] + step1[27]);
887 step2[28] = WRAPLOW(step1[28] + step1[29]);
888 step2[29] = WRAPLOW(step1[28] - step1[29]);
889 step2[30] = WRAPLOW(-step1[30] + step1[31]);
890 step2[31] = WRAPLOW(step1[30] + step1[31]);
891
892 // stage 3
893 step1[0] = step2[0];
894 step1[1] = step2[1];
895 step1[2] = step2[2];
896 step1[3] = step2[3];
897
898 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
899 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
900 step1[4] = WRAPLOW(dct_const_round_shift(temp1));
901 step1[7] = WRAPLOW(dct_const_round_shift(temp2));
902 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
903 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
904 step1[5] = WRAPLOW(dct_const_round_shift(temp1));
905 step1[6] = WRAPLOW(dct_const_round_shift(temp2));
906
907 step1[8] = WRAPLOW(step2[8] + step2[9]);
908 step1[9] = WRAPLOW(step2[8] - step2[9]);
909 step1[10] = WRAPLOW(-step2[10] + step2[11]);
910 step1[11] = WRAPLOW(step2[10] + step2[11]);
911 step1[12] = WRAPLOW(step2[12] + step2[13]);
912 step1[13] = WRAPLOW(step2[12] - step2[13]);
913 step1[14] = WRAPLOW(-step2[14] + step2[15]);
914 step1[15] = WRAPLOW(step2[14] + step2[15]);
915
916 step1[16] = step2[16];
917 step1[31] = step2[31];
918 temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
919 temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
920 step1[17] = WRAPLOW(dct_const_round_shift(temp1));
921 step1[30] = WRAPLOW(dct_const_round_shift(temp2));
922 temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
923 temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
924 step1[18] = WRAPLOW(dct_const_round_shift(temp1));
925 step1[29] = WRAPLOW(dct_const_round_shift(temp2));
926 step1[19] = step2[19];
927 step1[20] = step2[20];
928 temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
929 temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
930 step1[21] = WRAPLOW(dct_const_round_shift(temp1));
931 step1[26] = WRAPLOW(dct_const_round_shift(temp2));
932 temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
933 temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
934 step1[22] = WRAPLOW(dct_const_round_shift(temp1));
935 step1[25] = WRAPLOW(dct_const_round_shift(temp2));
936 step1[23] = step2[23];
937 step1[24] = step2[24];
938 step1[27] = step2[27];
939 step1[28] = step2[28];
940
941 // stage 4
942 temp1 = (step1[0] + step1[1]) * cospi_16_64;
943 temp2 = (step1[0] - step1[1]) * cospi_16_64;
944 step2[0] = WRAPLOW(dct_const_round_shift(temp1));
945 step2[1] = WRAPLOW(dct_const_round_shift(temp2));
946 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
947 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
948 step2[2] = WRAPLOW(dct_const_round_shift(temp1));
949 step2[3] = WRAPLOW(dct_const_round_shift(temp2));
950 step2[4] = WRAPLOW(step1[4] + step1[5]);
951 step2[5] = WRAPLOW(step1[4] - step1[5]);
952 step2[6] = WRAPLOW(-step1[6] + step1[7]);
953 step2[7] = WRAPLOW(step1[6] + step1[7]);
954
955 step2[8] = step1[8];
956 step2[15] = step1[15];
957 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
958 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
959 step2[9] = WRAPLOW(dct_const_round_shift(temp1));
960 step2[14] = WRAPLOW(dct_const_round_shift(temp2));
961 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
962 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
963 step2[10] = WRAPLOW(dct_const_round_shift(temp1));
964 step2[13] = WRAPLOW(dct_const_round_shift(temp2));
965 step2[11] = step1[11];
966 step2[12] = step1[12];
967
968 step2[16] = WRAPLOW(step1[16] + step1[19]);
969 step2[17] = WRAPLOW(step1[17] + step1[18]);
970 step2[18] = WRAPLOW(step1[17] - step1[18]);
971 step2[19] = WRAPLOW(step1[16] - step1[19]);
972 step2[20] = WRAPLOW(-step1[20] + step1[23]);
973 step2[21] = WRAPLOW(-step1[21] + step1[22]);
974 step2[22] = WRAPLOW(step1[21] + step1[22]);
975 step2[23] = WRAPLOW(step1[20] + step1[23]);
976
977 step2[24] = WRAPLOW(step1[24] + step1[27]);
978 step2[25] = WRAPLOW(step1[25] + step1[26]);
979 step2[26] = WRAPLOW(step1[25] - step1[26]);
980 step2[27] = WRAPLOW(step1[24] - step1[27]);
981 step2[28] = WRAPLOW(-step1[28] + step1[31]);
982 step2[29] = WRAPLOW(-step1[29] + step1[30]);
983 step2[30] = WRAPLOW(step1[29] + step1[30]);
984 step2[31] = WRAPLOW(step1[28] + step1[31]);
985
986 // stage 5
987 step1[0] = WRAPLOW(step2[0] + step2[3]);
988 step1[1] = WRAPLOW(step2[1] + step2[2]);
989 step1[2] = WRAPLOW(step2[1] - step2[2]);
990 step1[3] = WRAPLOW(step2[0] - step2[3]);
991 step1[4] = step2[4];
992 temp1 = (step2[6] - step2[5]) * cospi_16_64;
993 temp2 = (step2[5] + step2[6]) * cospi_16_64;
994 step1[5] = WRAPLOW(dct_const_round_shift(temp1));
995 step1[6] = WRAPLOW(dct_const_round_shift(temp2));
996 step1[7] = step2[7];
997
998 step1[8] = WRAPLOW(step2[8] + step2[11]);
999 step1[9] = WRAPLOW(step2[9] + step2[10]);
1000 step1[10] = WRAPLOW(step2[9] - step2[10]);
1001 step1[11] = WRAPLOW(step2[8] - step2[11]);
1002 step1[12] = WRAPLOW(-step2[12] + step2[15]);
1003 step1[13] = WRAPLOW(-step2[13] + step2[14]);
1004 step1[14] = WRAPLOW(step2[13] + step2[14]);
1005 step1[15] = WRAPLOW(step2[12] + step2[15]);
1006
1007 step1[16] = step2[16];
1008 step1[17] = step2[17];
1009 temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
1010 temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
1011 step1[18] = WRAPLOW(dct_const_round_shift(temp1));
1012 step1[29] = WRAPLOW(dct_const_round_shift(temp2));
1013 temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
1014 temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
1015 step1[19] = WRAPLOW(dct_const_round_shift(temp1));
1016 step1[28] = WRAPLOW(dct_const_round_shift(temp2));
1017 temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
1018 temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
1019 step1[20] = WRAPLOW(dct_const_round_shift(temp1));
1020 step1[27] = WRAPLOW(dct_const_round_shift(temp2));
1021 temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
1022 temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
1023 step1[21] = WRAPLOW(dct_const_round_shift(temp1));
1024 step1[26] = WRAPLOW(dct_const_round_shift(temp2));
1025 step1[22] = step2[22];
1026 step1[23] = step2[23];
1027 step1[24] = step2[24];
1028 step1[25] = step2[25];
1029 step1[30] = step2[30];
1030 step1[31] = step2[31];
1031
1032 // stage 6
1033 step2[0] = WRAPLOW(step1[0] + step1[7]);
1034 step2[1] = WRAPLOW(step1[1] + step1[6]);
1035 step2[2] = WRAPLOW(step1[2] + step1[5]);
1036 step2[3] = WRAPLOW(step1[3] + step1[4]);
1037 step2[4] = WRAPLOW(step1[3] - step1[4]);
1038 step2[5] = WRAPLOW(step1[2] - step1[5]);
1039 step2[6] = WRAPLOW(step1[1] - step1[6]);
1040 step2[7] = WRAPLOW(step1[0] - step1[7]);
1041 step2[8] = step1[8];
1042 step2[9] = step1[9];
1043 temp1 = (-step1[10] + step1[13]) * cospi_16_64;
1044 temp2 = (step1[10] + step1[13]) * cospi_16_64;
1045 step2[10] = WRAPLOW(dct_const_round_shift(temp1));
1046 step2[13] = WRAPLOW(dct_const_round_shift(temp2));
1047 temp1 = (-step1[11] + step1[12]) * cospi_16_64;
1048 temp2 = (step1[11] + step1[12]) * cospi_16_64;
1049 step2[11] = WRAPLOW(dct_const_round_shift(temp1));
1050 step2[12] = WRAPLOW(dct_const_round_shift(temp2));
1051 step2[14] = step1[14];
1052 step2[15] = step1[15];
1053
1054 step2[16] = WRAPLOW(step1[16] + step1[23]);
1055 step2[17] = WRAPLOW(step1[17] + step1[22]);
1056 step2[18] = WRAPLOW(step1[18] + step1[21]);
1057 step2[19] = WRAPLOW(step1[19] + step1[20]);
1058 step2[20] = WRAPLOW(step1[19] - step1[20]);
1059 step2[21] = WRAPLOW(step1[18] - step1[21]);
1060 step2[22] = WRAPLOW(step1[17] - step1[22]);
1061 step2[23] = WRAPLOW(step1[16] - step1[23]);
1062
1063 step2[24] = WRAPLOW(-step1[24] + step1[31]);
1064 step2[25] = WRAPLOW(-step1[25] + step1[30]);
1065 step2[26] = WRAPLOW(-step1[26] + step1[29]);
1066 step2[27] = WRAPLOW(-step1[27] + step1[28]);
1067 step2[28] = WRAPLOW(step1[27] + step1[28]);
1068 step2[29] = WRAPLOW(step1[26] + step1[29]);
1069 step2[30] = WRAPLOW(step1[25] + step1[30]);
1070 step2[31] = WRAPLOW(step1[24] + step1[31]);
1071
1072 // stage 7
1073 step1[0] = WRAPLOW(step2[0] + step2[15]);
1074 step1[1] = WRAPLOW(step2[1] + step2[14]);
1075 step1[2] = WRAPLOW(step2[2] + step2[13]);
1076 step1[3] = WRAPLOW(step2[3] + step2[12]);
1077 step1[4] = WRAPLOW(step2[4] + step2[11]);
1078 step1[5] = WRAPLOW(step2[5] + step2[10]);
1079 step1[6] = WRAPLOW(step2[6] + step2[9]);
1080 step1[7] = WRAPLOW(step2[7] + step2[8]);
1081 step1[8] = WRAPLOW(step2[7] - step2[8]);
1082 step1[9] = WRAPLOW(step2[6] - step2[9]);
1083 step1[10] = WRAPLOW(step2[5] - step2[10]);
1084 step1[11] = WRAPLOW(step2[4] - step2[11]);
1085 step1[12] = WRAPLOW(step2[3] - step2[12]);
1086 step1[13] = WRAPLOW(step2[2] - step2[13]);
1087 step1[14] = WRAPLOW(step2[1] - step2[14]);
1088 step1[15] = WRAPLOW(step2[0] - step2[15]);
1089
1090 step1[16] = step2[16];
1091 step1[17] = step2[17];
1092 step1[18] = step2[18];
1093 step1[19] = step2[19];
1094 temp1 = (-step2[20] + step2[27]) * cospi_16_64;
1095 temp2 = (step2[20] + step2[27]) * cospi_16_64;
1096 step1[20] = WRAPLOW(dct_const_round_shift(temp1));
1097 step1[27] = WRAPLOW(dct_const_round_shift(temp2));
1098 temp1 = (-step2[21] + step2[26]) * cospi_16_64;
1099 temp2 = (step2[21] + step2[26]) * cospi_16_64;
1100 step1[21] = WRAPLOW(dct_const_round_shift(temp1));
1101 step1[26] = WRAPLOW(dct_const_round_shift(temp2));
1102 temp1 = (-step2[22] + step2[25]) * cospi_16_64;
1103 temp2 = (step2[22] + step2[25]) * cospi_16_64;
1104 step1[22] = WRAPLOW(dct_const_round_shift(temp1));
1105 step1[25] = WRAPLOW(dct_const_round_shift(temp2));
1106 temp1 = (-step2[23] + step2[24]) * cospi_16_64;
1107 temp2 = (step2[23] + step2[24]) * cospi_16_64;
1108 step1[23] = WRAPLOW(dct_const_round_shift(temp1));
1109 step1[24] = WRAPLOW(dct_const_round_shift(temp2));
1110 step1[28] = step2[28];
1111 step1[29] = step2[29];
1112 step1[30] = step2[30];
1113 step1[31] = step2[31];
1114
1115 // final stage
1116 output[0] = WRAPLOW(step1[0] + step1[31]);
1117 output[1] = WRAPLOW(step1[1] + step1[30]);
1118 output[2] = WRAPLOW(step1[2] + step1[29]);
1119 output[3] = WRAPLOW(step1[3] + step1[28]);
1120 output[4] = WRAPLOW(step1[4] + step1[27]);
1121 output[5] = WRAPLOW(step1[5] + step1[26]);
1122 output[6] = WRAPLOW(step1[6] + step1[25]);
1123 output[7] = WRAPLOW(step1[7] + step1[24]);
1124 output[8] = WRAPLOW(step1[8] + step1[23]);
1125 output[9] = WRAPLOW(step1[9] + step1[22]);
1126 output[10] = WRAPLOW(step1[10] + step1[21]);
1127 output[11] = WRAPLOW(step1[11] + step1[20]);
1128 output[12] = WRAPLOW(step1[12] + step1[19]);
1129 output[13] = WRAPLOW(step1[13] + step1[18]);
1130 output[14] = WRAPLOW(step1[14] + step1[17]);
1131 output[15] = WRAPLOW(step1[15] + step1[16]);
1132 output[16] = WRAPLOW(step1[15] - step1[16]);
1133 output[17] = WRAPLOW(step1[14] - step1[17]);
1134 output[18] = WRAPLOW(step1[13] - step1[18]);
1135 output[19] = WRAPLOW(step1[12] - step1[19]);
1136 output[20] = WRAPLOW(step1[11] - step1[20]);
1137 output[21] = WRAPLOW(step1[10] - step1[21]);
1138 output[22] = WRAPLOW(step1[9] - step1[22]);
1139 output[23] = WRAPLOW(step1[8] - step1[23]);
1140 output[24] = WRAPLOW(step1[7] - step1[24]);
1141 output[25] = WRAPLOW(step1[6] - step1[25]);
1142 output[26] = WRAPLOW(step1[5] - step1[26]);
1143 output[27] = WRAPLOW(step1[4] - step1[27]);
1144 output[28] = WRAPLOW(step1[3] - step1[28]);
1145 output[29] = WRAPLOW(step1[2] - step1[29]);
1146 output[30] = WRAPLOW(step1[1] - step1[30]);
1147 output[31] = WRAPLOW(step1[0] - step1[31]);
1148 }
1149
void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  int i, j;
  tran_low_t out[32 * 32];
  tran_low_t *outptr = out;
  tran_low_t temp_in[32], temp_out[32];

  // Rows
  for (i = 0; i < 32; ++i) {
    // OR all 32 coefficients together: an all-zero row can skip the
    // transform, since the idct of the zero vector is zero.
    // Accumulate in tran_low_t (not int16_t, as before) so no high bits
    // are silently dropped when tran_low_t is 32 bits wide; previously a
    // coefficient with only bits >= 2^16 set could be misdetected as zero.
    tran_low_t row_or = 0;
    for (j = 0; j < 32; ++j) row_or |= input[j];

    if (row_or)
      idct32_c(input, outptr);
    else
      memset(outptr, 0, sizeof(tran_low_t) * 32);
    input += 32;
    outptr += 32;
  }

  // Columns: transform each column of the intermediate, then round by 6
  // bits and accumulate into the destination with pixel clipping.
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
    idct32_c(temp_in, temp_out);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}
1186
void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  int r, c;
  tran_low_t buf[32 * 32] = { 0 };
  tran_low_t *row_out = buf;
  tran_low_t col_in[32], col_out[32];

  // Row pass. Only the upper-left 16x16 region carries non-zero
  // coefficients, so just the first 16 rows need the transform; the rest
  // of |buf| stays zero from the initializer.
  for (r = 0; r < 16; ++r) {
    idct32_c(input, row_out);
    input += 32;
    row_out += 32;
  }

  // Column pass: transform every column, round by 6 bits, and accumulate
  // into the destination with pixel clipping.
  for (c = 0; c < 32; ++c) {
    for (r = 0; r < 32; ++r) col_in[r] = buf[r * 32 + c];
    idct32_c(col_in, col_out);
    for (r = 0; r < 32; ++r) {
      dest[r * stride + c] = clip_pixel_add(
          dest[r * stride + c], ROUND_POWER_OF_TWO(col_out[r], 6));
    }
  }
}
1212
void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  int r, c;
  tran_low_t buf[32 * 32] = { 0 };
  tran_low_t *row_out = buf;
  tran_low_t col_in[32], col_out[32];

  // Row pass. Only the upper-left 8x8 region carries non-zero
  // coefficients, so just the first 8 rows need the transform; the rest
  // of |buf| stays zero from the initializer.
  for (r = 0; r < 8; ++r) {
    idct32_c(input, row_out);
    input += 32;
    row_out += 32;
  }

  // Column pass: transform every column, round by 6 bits, and accumulate
  // into the destination with pixel clipping.
  for (c = 0; c < 32; ++c) {
    for (r = 0; r < 32; ++r) col_in[r] = buf[r * 32 + c];
    idct32_c(col_in, col_out);
    for (r = 0; r < 32; ++r) {
      dest[r * stride + c] = clip_pixel_add(
          dest[r * stride + c], ROUND_POWER_OF_TWO(col_out[r], 6));
    }
  }
}
1238
void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  // DC-only 32x32 inverse transform: the whole output block is a single
  // constant derived from input[0].
  int r, c;
  tran_high_t dc_value;
  tran_low_t dc = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));

  // Apply the second cospi_16_64 scaling (row + column passes of the full
  // transform), then the final 6-bit rounding.
  dc = WRAPLOW(dct_const_round_shift(dc * cospi_16_64));
  dc_value = ROUND_POWER_OF_TWO(dc, 6);

  for (r = 0; r < 32; ++r) {
    for (c = 0; c < 32; ++c) dest[c] = clip_pixel_add(dest[c], dc_value);
    dest += stride;
  }
}
1252
1253 #if CONFIG_VP9_HIGHBITDEPTH
1254
1255 // 12 signal input bits + 7 2D forward transform amplify bits + 5 1D inverse
1256 // transform amplify bits + 1 bit for contingency in rounding and quantizing
1257 #define HIGHBD_VALID_TXFM_MAGNITUDE_RANGE (1 << 25)
1258
detect_invalid_highbd_input(const tran_low_t * input,int size)1259 static INLINE int detect_invalid_highbd_input(const tran_low_t *input,
1260 int size) {
1261 int i;
1262 for (i = 0; i < size; ++i)
1263 if (abs(input[i]) >= HIGHBD_VALID_TXFM_MAGNITUDE_RANGE) return 1;
1264 return 0;
1265 }
1266
void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
  /* 4x4 reversible, orthonormal inverse Walsh-Hadamard transform
     (3.5 adds, 0.5 shifts per pixel), high bit-depth variant.
     Rows are transformed into a scratch buffer, then columns are
     transformed and accumulated into the destination. */
  int k;
  tran_low_t buf[16];
  tran_high_t a, b, c, d, e;
  const tran_low_t *src = input;
  tran_low_t *dst = buf;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  /* Horizontal pass over the four rows. */
  for (k = 0; k < 4; k++) {
    a = src[0] >> UNIT_QUANT_SHIFT;
    c = src[1] >> UNIT_QUANT_SHIFT;
    d = src[2] >> UNIT_QUANT_SHIFT;
    b = src[3] >> UNIT_QUANT_SHIFT;
    a += c;
    d -= b;
    e = (a - d) >> 1;
    b = e - b;
    c = e - c;
    a -= b;
    d += c;
    dst[0] = HIGHBD_WRAPLOW(a, bd);
    dst[1] = HIGHBD_WRAPLOW(b, bd);
    dst[2] = HIGHBD_WRAPLOW(c, bd);
    dst[3] = HIGHBD_WRAPLOW(d, bd);
    src += 4;
    dst += 4;
  }

  /* Vertical pass over the four columns, adding into |dest|. */
  src = buf;
  for (k = 0; k < 4; k++) {
    a = src[4 * 0];
    c = src[4 * 1];
    d = src[4 * 2];
    b = src[4 * 3];
    a += c;
    d -= b;
    e = (a - d) >> 1;
    b = e - b;
    c = e - c;
    a -= b;
    d += c;
    dest[stride * 0] =
        highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a, bd), bd);
    dest[stride * 1] =
        highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b, bd), bd);
    dest[stride * 2] =
        highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c, bd), bd);
    dest[stride * 3] =
        highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d, bd), bd);
    src++;
    dest++;
  }
}
1324
void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
                                int stride, int bd) {
  /* DC-only inverse Walsh-Hadamard: expand the single coefficient into a
     4x4 block and accumulate it into the destination. */
  int col;
  tran_high_t a, e;
  tran_low_t row[4];
  const tran_low_t *src = in;
  tran_low_t *dst = row;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  (void)bd;

  /* Row pass collapses to one split of the DC value. */
  a = src[0] >> UNIT_QUANT_SHIFT;
  e = a >> 1;
  a -= e;
  dst[0] = HIGHBD_WRAPLOW(a, bd);
  dst[1] = dst[2] = dst[3] = HIGHBD_WRAPLOW(e, bd);

  /* Column pass: split each row value the same way down each column. */
  src = row;
  for (col = 0; col < 4; col++) {
    e = src[0] >> 1;
    a = src[0] - e;
    dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a, bd);
    dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], e, bd);
    dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], e, bd);
    dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], e, bd);
    src++;
    dest++;
  }
}
1353
void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
  // 4-point 1-D inverse DCT, high bit-depth variant.
  tran_low_t s[4];
  tran_high_t t0, t1;
  (void)bd;

  // Coefficients outside the supported magnitude range would overflow the
  // fixed-point math below; emit zeros instead.
  if (detect_invalid_highbd_input(input, 4)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 4);
    return;
  }

  // Stage 1: even pair (0,2) and odd pair (1,3) rotations.
  t0 = (input[0] + input[2]) * cospi_16_64;
  t1 = (input[0] - input[2]) * cospi_16_64;
  s[0] = HIGHBD_WRAPLOW(dct_const_round_shift(t0), bd);
  s[1] = HIGHBD_WRAPLOW(dct_const_round_shift(t1), bd);
  t0 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  t1 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  s[2] = HIGHBD_WRAPLOW(dct_const_round_shift(t0), bd);
  s[3] = HIGHBD_WRAPLOW(dct_const_round_shift(t1), bd);

  // Stage 2: final butterfly.
  output[0] = HIGHBD_WRAPLOW(s[0] + s[3], bd);
  output[1] = HIGHBD_WRAPLOW(s[1] + s[2], bd);
  output[2] = HIGHBD_WRAPLOW(s[1] - s[2], bd);
  output[3] = HIGHBD_WRAPLOW(s[0] - s[3], bd);
}
1383
void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
  // Full 4x4 inverse DCT: rows first, then columns, then 4-bit rounding
  // and accumulation into the destination with bit-depth clipping.
  int r, c;
  tran_low_t intermediate[4 * 4];
  tran_low_t *row_ptr = intermediate;
  tran_low_t col_in[4], col_out[4];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Row pass.
  for (r = 0; r < 4; ++r) {
    vpx_highbd_idct4_c(input, row_ptr, bd);
    input += 4;
    row_ptr += 4;
  }

  // Column pass.
  for (c = 0; c < 4; ++c) {
    for (r = 0; r < 4; ++r) col_in[r] = intermediate[r * 4 + c];
    vpx_highbd_idct4_c(col_in, col_out, bd);
    for (r = 0; r < 4; ++r) {
      dest[r * stride + c] = highbd_clip_pixel_add(
          dest[r * stride + c], ROUND_POWER_OF_TWO(col_out[r], 4), bd);
    }
  }
}
1409
void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                int stride, int bd) {
  // DC-only 4x4 inverse DCT: the output block is one constant value.
  int r, c;
  tran_high_t dc_value;
  tran_low_t dc =
      HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Second cospi_16_64 scaling (column pass), then final 4-bit rounding.
  dc = HIGHBD_WRAPLOW(dct_const_round_shift(dc * cospi_16_64), bd);
  dc_value = ROUND_POWER_OF_TWO(dc, 4);

  for (r = 0; r < 4; r++) {
    for (c = 0; c < 4; c++)
      dest[c] = highbd_clip_pixel_add(dest[c], dc_value, bd);
    dest += stride;
  }
}
1429
// 8-point 1-D inverse DCT, high bit-depth variant. Reads 8 coefficients
// from |input| and writes 8 reconstructed values to |output|; |bd| is the
// bit depth forwarded to the wrap/clamp helpers.
void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;

  // Out-of-range coefficients (possible with a corrupt bitstream) would
  // overflow the fixed-point arithmetic below; emit zeros instead.
  if (detect_invalid_highbd_input(input, 8)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 8);
    return;
  }

  // stage 1 -- deinterleave: even coefficients (0, 4, 2, 6) are gathered
  // into step1[0..3] for the 4-point even half; odd coefficients (1, 7)
  // and (5, 3) are rotated into step1[4..7].
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  // stage 2 & stage 3 - even half: delegate to the 4-point transform,
  // reusing step1[0..3] in place.
  vpx_highbd_idct4_c(step1, step1, bd);

  // stage 2 - odd half: butterflies on the rotated odd coefficients.
  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);

  // stage 3 - odd half: middle pair gets a final cospi_16_64 rotation.
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step1[7] = step2[7];

  // stage 4 -- combine even half (step1[0..3]) and odd half (step1[4..7]).
  output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
  output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
  output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
  output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
  output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
  output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
  output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
  output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
}
1483
void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
  // Full 8x8 inverse DCT: rows first, then columns, then 5-bit rounding
  // and accumulation into the destination with bit-depth clipping.
  int r, c;
  tran_low_t intermediate[8 * 8];
  tran_low_t *row_ptr = intermediate;
  tran_low_t col_in[8], col_out[8];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Row pass.
  for (r = 0; r < 8; ++r) {
    vpx_highbd_idct8_c(input, row_ptr, bd);
    input += 8;
    row_ptr += 8;
  }

  // Column pass.
  for (c = 0; c < 8; ++c) {
    for (r = 0; r < 8; ++r) col_in[r] = intermediate[r * 8 + c];
    vpx_highbd_idct8_c(col_in, col_out, bd);
    for (r = 0; r < 8; ++r) {
      dest[r * stride + c] = highbd_clip_pixel_add(
          dest[r * stride + c], ROUND_POWER_OF_TWO(col_out[r], 5), bd);
    }
  }
}
1509
void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                int stride, int bd) {
  // DC-only 8x8 inverse DCT: the output block is one constant value.
  int r, c;
  tran_high_t dc_value;
  tran_low_t dc =
      HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Second cospi_16_64 scaling (column pass), then final 5-bit rounding.
  dc = HIGHBD_WRAPLOW(dct_const_round_shift(dc * cospi_16_64), bd);
  dc_value = ROUND_POWER_OF_TWO(dc, 5);
  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c)
      dest[c] = highbd_clip_pixel_add(dest[c], dc_value, bd);
    dest += stride;
  }
}
1525
// 4-point 1-D inverse ADST (asymmetric discrete sine transform), high
// bit-depth variant. Reads 4 coefficients from |input|, writes 4 values
// to |output|; |bd| is forwarded to the wrap/clamp helper.
void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
  tran_low_t x0 = input[0];
  tran_low_t x1 = input[1];
  tran_low_t x2 = input[2];
  tran_low_t x3 = input[3];
  (void)bd;

  // Out-of-range coefficients would overflow the fixed-point arithmetic
  // below; emit zeros instead.
  if (detect_invalid_highbd_input(input, 4)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 4);
    return;
  }

  // All-zero input: skip the arithmetic entirely.
  if (!(x0 | x1 | x2 | x3)) {
    memset(output, 0, 4 * sizeof(*output));
    return;
  }

  // Products of the inputs with the sinpi constants of the 4-point ADST.
  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd);

  s0 = s0 + s3 + s5;
  s1 = s1 - s4 - s6;
  s3 = s2;
  s2 = sinpi_3_9 * s7;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s3), bd);
  output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s3), bd);
  output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
  output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd);
}
1570
// 8-point 1-D inverse ADST, high bit-depth variant. Reads 8 coefficients
// from |input| (in the permuted order below), writes 8 values to |output|;
// |bd| is forwarded to the wrap/clamp helper.
void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
  // Input permutation required by the ADST butterfly network.
  tran_low_t x0 = input[7];
  tran_low_t x1 = input[0];
  tran_low_t x2 = input[5];
  tran_low_t x3 = input[2];
  tran_low_t x4 = input[3];
  tran_low_t x5 = input[4];
  tran_low_t x6 = input[1];
  tran_low_t x7 = input[6];
  (void)bd;

  // Out-of-range coefficients would overflow the fixed-point arithmetic
  // below; emit zeros instead.
  if (detect_invalid_highbd_input(input, 8)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 8);
    return;
  }

  // All-zero input: skip the arithmetic entirely.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    memset(output, 0, 8 * sizeof(*output));
    return;
  }

  // stage 1 -- pairwise rotations by the odd cospi constants.
  s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
  s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
  s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
  s7 = cospi_6_64 * x6 - cospi_26_64 * x7;

  x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd);
  x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd);
  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s6), bd);
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s7), bd);
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s4), bd);
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s5), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s7), bd);

  // stage 2 -- x0..x3 pass through; x4..x7 get cospi_8/24 rotations.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
  s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
  s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
  s7 = cospi_8_64 * x6 + cospi_24_64 * x7;

  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);

  // stage 3 -- final cospi_16_64 rotations on the middle pairs.
  s2 = cospi_16_64 * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (x6 - x7);

  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);

  // Output permutation with alternating sign flips.
  output[0] = HIGHBD_WRAPLOW(x0, bd);
  output[1] = HIGHBD_WRAPLOW(-x4, bd);
  output[2] = HIGHBD_WRAPLOW(x6, bd);
  output[3] = HIGHBD_WRAPLOW(-x2, bd);
  output[4] = HIGHBD_WRAPLOW(x3, bd);
  output[5] = HIGHBD_WRAPLOW(-x7, bd);
  output[6] = HIGHBD_WRAPLOW(x5, bd);
  output[7] = HIGHBD_WRAPLOW(-x1, bd);
}
1654
void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
  // 8x8 inverse DCT for sparse blocks: only the first four rows carry
  // non-zero coefficients, so only they are transformed; the rest of the
  // intermediate buffer stays zero from the initializer.
  int r, c;
  tran_low_t intermediate[8 * 8] = { 0 };
  tran_low_t *row_ptr = intermediate;
  tran_low_t col_in[8], col_out[8];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Row pass over the first four rows only.
  for (r = 0; r < 4; ++r) {
    vpx_highbd_idct8_c(input, row_ptr, bd);
    input += 8;
    row_ptr += 8;
  }

  // Column pass over all eight columns, then 5-bit rounding and
  // accumulation into the destination with bit-depth clipping.
  for (c = 0; c < 8; ++c) {
    for (r = 0; r < 8; ++r) col_in[r] = intermediate[r * 8 + c];
    vpx_highbd_idct8_c(col_in, col_out, bd);
    for (r = 0; r < 8; ++r) {
      dest[r * stride + c] = highbd_clip_pixel_add(
          dest[r * stride + c], ROUND_POWER_OF_TWO(col_out[r], 5), bd);
    }
  }
}
1681
vpx_highbd_idct16_c(const tran_low_t * input,tran_low_t * output,int bd)1682 void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
1683 tran_low_t step1[16], step2[16];
1684 tran_high_t temp1, temp2;
1685 (void)bd;
1686
1687 if (detect_invalid_highbd_input(input, 16)) {
1688 #if CONFIG_COEFFICIENT_RANGE_CHECKING
1689 assert(0 && "invalid highbd txfm input");
1690 #endif // CONFIG_COEFFICIENT_RANGE_CHECKING
1691 memset(output, 0, sizeof(*output) * 16);
1692 return;
1693 }
1694
1695 // stage 1
1696 step1[0] = input[0 / 2];
1697 step1[1] = input[16 / 2];
1698 step1[2] = input[8 / 2];
1699 step1[3] = input[24 / 2];
1700 step1[4] = input[4 / 2];
1701 step1[5] = input[20 / 2];
1702 step1[6] = input[12 / 2];
1703 step1[7] = input[28 / 2];
1704 step1[8] = input[2 / 2];
1705 step1[9] = input[18 / 2];
1706 step1[10] = input[10 / 2];
1707 step1[11] = input[26 / 2];
1708 step1[12] = input[6 / 2];
1709 step1[13] = input[22 / 2];
1710 step1[14] = input[14 / 2];
1711 step1[15] = input[30 / 2];
1712
1713 // stage 2
1714 step2[0] = step1[0];
1715 step2[1] = step1[1];
1716 step2[2] = step1[2];
1717 step2[3] = step1[3];
1718 step2[4] = step1[4];
1719 step2[5] = step1[5];
1720 step2[6] = step1[6];
1721 step2[7] = step1[7];
1722
1723 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
1724 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
1725 step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1726 step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1727
1728 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
1729 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
1730 step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1731 step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1732
1733 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
1734 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
1735 step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1736 step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1737
1738 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
1739 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
1740 step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1741 step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1742
1743 // stage 3
1744 step1[0] = step2[0];
1745 step1[1] = step2[1];
1746 step1[2] = step2[2];
1747 step1[3] = step2[3];
1748
1749 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
1750 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
1751 step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1752 step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1753 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
1754 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
1755 step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1756 step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1757
1758 step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
1759 step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
1760 step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
1761 step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
1762 step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
1763 step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
1764 step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
1765 step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
1766
1767 // stage 4
1768 temp1 = (step1[0] + step1[1]) * cospi_16_64;
1769 temp2 = (step1[0] - step1[1]) * cospi_16_64;
1770 step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1771 step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1772 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
1773 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
1774 step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1775 step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1776 step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
1777 step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
1778 step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
1779 step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
1780
1781 step2[8] = step1[8];
1782 step2[15] = step1[15];
1783 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
1784 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
1785 step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1786 step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1787 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
1788 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
1789 step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1790 step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1791 step2[11] = step1[11];
1792 step2[12] = step1[12];
1793
1794 // stage 5
1795 step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
1796 step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
1797 step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
1798 step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
1799 step1[4] = step2[4];
1800 temp1 = (step2[6] - step2[5]) * cospi_16_64;
1801 temp2 = (step2[5] + step2[6]) * cospi_16_64;
1802 step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1803 step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1804 step1[7] = step2[7];
1805
1806 step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
1807 step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
1808 step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
1809 step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
1810 step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
1811 step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
1812 step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
1813 step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
1814
1815 // stage 6
1816 step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
1817 step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
1818 step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
1819 step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
1820 step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
1821 step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
1822 step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
1823 step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
1824 step2[8] = step1[8];
1825 step2[9] = step1[9];
1826 temp1 = (-step1[10] + step1[13]) * cospi_16_64;
1827 temp2 = (step1[10] + step1[13]) * cospi_16_64;
1828 step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1829 step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1830 temp1 = (-step1[11] + step1[12]) * cospi_16_64;
1831 temp2 = (step1[11] + step1[12]) * cospi_16_64;
1832 step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1833 step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1834 step2[14] = step1[14];
1835 step2[15] = step1[15];
1836
1837 // stage 7
1838 output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
1839 output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
1840 output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
1841 output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
1842 output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
1843 output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
1844 output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
1845 output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
1846 output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
1847 output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
1848 output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
1849 output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
1850 output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
1851 output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
1852 output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
1853 output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
1854 }
1855
// Full 16x16 high-bitdepth inverse DCT. Applies the 1-D idct16 to every
// row of the coefficient block, then to every column of the intermediate
// result; the column output is rounded (ROUND_POWER_OF_TWO by 6, i.e.
// divide by 64 with rounding), clipped to the bit depth and added into
// the destination buffer.
void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
                                    int stride, int bd) {
  int row, col;
  tran_low_t intermediate[16 * 16];
  tran_low_t col_in[16], col_out[16];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Pass 1: 1-D idct16 on each of the 16 rows.
  for (row = 0; row < 16; ++row) {
    vpx_highbd_idct16_c(input + row * 16, intermediate + row * 16, bd);
  }

  // Pass 2: 1-D idct16 on each column; round, clip and accumulate.
  for (col = 0; col < 16; ++col) {
    for (row = 0; row < 16; ++row) col_in[row] = intermediate[row * 16 + col];
    vpx_highbd_idct16_c(col_in, col_out, bd);
    for (row = 0; row < 16; ++row) {
      dest[row * stride + col] = highbd_clip_pixel_add(
          dest[row * stride + col], ROUND_POWER_OF_TWO(col_out[row], 6), bd);
    }
  }
}
1881
// High-bitdepth 16-point inverse ADST (1-D). Reads the 16 coefficients in
// the interleaved order below, runs four butterfly stages of fixed-point
// math, and writes 16 output samples. Products of cospi constants are
// normalized with dct_const_round_shift; every intermediate sum is wrapped
// to the valid range for the bit depth via HIGHBD_WRAPLOW.
vpx_highbd_iadst16_c(const tran_low_t * input,tran_low_t * output,int bd)1882 void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
1883 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
1884 tran_high_t s9, s10, s11, s12, s13, s14, s15;
// Inputs are consumed in the interleaved order required by the ADST.
1885 tran_low_t x0 = input[15];
1886 tran_low_t x1 = input[0];
1887 tran_low_t x2 = input[13];
1888 tran_low_t x3 = input[2];
1889 tran_low_t x4 = input[11];
1890 tran_low_t x5 = input[4];
1891 tran_low_t x6 = input[9];
1892 tran_low_t x7 = input[6];
1893 tran_low_t x8 = input[7];
1894 tran_low_t x9 = input[8];
1895 tran_low_t x10 = input[5];
1896 tran_low_t x11 = input[10];
1897 tran_low_t x12 = input[3];
1898 tran_low_t x13 = input[12];
1899 tran_low_t x14 = input[1];
1900 tran_low_t x15 = input[14];
1901 (void)bd;
1902
// Out-of-range coefficients could overflow the fixed-point arithmetic
// below; treat the block as invalid and emit all zeros.
1903 if (detect_invalid_highbd_input(input, 16)) {
1904 #if CONFIG_COEFFICIENT_RANGE_CHECKING
1905 assert(0 && "invalid highbd txfm input");
1906 #endif // CONFIG_COEFFICIENT_RANGE_CHECKING
1907 memset(output, 0, sizeof(*output) * 16);
1908 return;
1909 }
1910
// Fast path: an all-zero input transforms to an all-zero output.
1911 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
1912 x13 | x14 | x15)) {
1913 memset(output, 0, 16 * sizeof(*output));
1914 return;
1915 }
1916
1917 // stage 1
1918 s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
1919 s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
1920 s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
1921 s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
1922 s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
1923 s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
1924 s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
1925 s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
1926 s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
1927 s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
1928 s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
1929 s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
1930 s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
1931 s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
1932 s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
1933 s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
1934
1935 x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd);
1936 x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd);
1937 x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd);
1938 x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd);
1939 x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd);
1940 x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd);
1941 x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd);
1942 x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd);
1943 x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd);
1944 x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd);
1945 x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd);
1946 x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd);
1947 x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd);
1948 x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd);
1949 x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd);
1950 x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd);
1951
1952 // stage 2
// The first half passes through; the second half is rotated by
// cospi_4/28 and cospi_20/12 pairs.
1953 s0 = x0;
1954 s1 = x1;
1955 s2 = x2;
1956 s3 = x3;
1957 s4 = x4;
1958 s5 = x5;
1959 s6 = x6;
1960 s7 = x7;
1961 s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
1962 s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
1963 s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
1964 s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
1965 s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
1966 s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
1967 s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
1968 s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
1969
1970 x0 = HIGHBD_WRAPLOW(s0 + s4, bd);
1971 x1 = HIGHBD_WRAPLOW(s1 + s5, bd);
1972 x2 = HIGHBD_WRAPLOW(s2 + s6, bd);
1973 x3 = HIGHBD_WRAPLOW(s3 + s7, bd);
1974 x4 = HIGHBD_WRAPLOW(s0 - s4, bd);
1975 x5 = HIGHBD_WRAPLOW(s1 - s5, bd);
1976 x6 = HIGHBD_WRAPLOW(s2 - s6, bd);
1977 x7 = HIGHBD_WRAPLOW(s3 - s7, bd);
1978 x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd);
1979 x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd);
1980 x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd);
1981 x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd);
1982 x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd);
1983 x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd);
1984 x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd);
1985 x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd);
1986
1987 // stage 3
1988 s0 = x0;
1989 s1 = x1;
1990 s2 = x2;
1991 s3 = x3;
1992 s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
1993 s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
1994 s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
1995 s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
1996 s8 = x8;
1997 s9 = x9;
1998 s10 = x10;
1999 s11 = x11;
2000 s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
2001 s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
2002 s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
2003 s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
2004
2005 x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
2006 x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
2007 x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
2008 x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
2009 x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
2010 x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
2011 x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
2012 x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
2013 x8 = HIGHBD_WRAPLOW(s8 + s10, bd);
2014 x9 = HIGHBD_WRAPLOW(s9 + s11, bd);
2015 x10 = HIGHBD_WRAPLOW(s8 - s10, bd);
2016 x11 = HIGHBD_WRAPLOW(s9 - s11, bd);
2017 x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd);
2018 x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd);
2019 x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd);
2020 x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd);
2021
2022 // stage 4
2023 s2 = (-cospi_16_64) * (x2 + x3);
2024 s3 = cospi_16_64 * (x2 - x3);
2025 s6 = cospi_16_64 * (x6 + x7);
2026 s7 = cospi_16_64 * (-x6 + x7);
2027 s10 = cospi_16_64 * (x10 + x11);
2028 s11 = cospi_16_64 * (-x10 + x11);
2029 s14 = (-cospi_16_64) * (x14 + x15);
2030 s15 = cospi_16_64 * (x14 - x15);
2031
2032 x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
2033 x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
2034 x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
2035 x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
2036 x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd);
2037 x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd);
2038 x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd);
2039 x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd);
2040
// Final permutation, with sign flips on x8, x4, x13 and x1.
2041 output[0] = HIGHBD_WRAPLOW(x0, bd);
2042 output[1] = HIGHBD_WRAPLOW(-x8, bd);
2043 output[2] = HIGHBD_WRAPLOW(x12, bd);
2044 output[3] = HIGHBD_WRAPLOW(-x4, bd);
2045 output[4] = HIGHBD_WRAPLOW(x6, bd);
2046 output[5] = HIGHBD_WRAPLOW(x14, bd);
2047 output[6] = HIGHBD_WRAPLOW(x10, bd);
2048 output[7] = HIGHBD_WRAPLOW(x2, bd);
2049 output[8] = HIGHBD_WRAPLOW(x3, bd);
2050 output[9] = HIGHBD_WRAPLOW(x11, bd);
2051 output[10] = HIGHBD_WRAPLOW(x15, bd);
2052 output[11] = HIGHBD_WRAPLOW(x7, bd);
2053 output[12] = HIGHBD_WRAPLOW(x5, bd);
2054 output[13] = HIGHBD_WRAPLOW(-x13, bd);
2055 output[14] = HIGHBD_WRAPLOW(x9, bd);
2056 output[15] = HIGHBD_WRAPLOW(-x1, bd);
2057 }
2058
// 16x16 high-bitdepth inverse DCT for sparse blocks: all non-zero dct
// coefficients are assumed to lie in the upper-left 4x4 area, so only
// the first 4 rows need a row transform; the remaining rows of the
// intermediate buffer stay zero. All 16 columns are then transformed,
// rounded, clipped to the bit depth and added into the destination.
void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
                                   int stride, int bd) {
  int row, col;
  tran_low_t intermediate[16 * 16] = { 0 };
  tran_low_t col_in[16], col_out[16];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Pass 1: rows 0-3 only; rows 4-15 remain zero-initialized.
  for (row = 0; row < 4; ++row) {
    vpx_highbd_idct16_c(input + row * 16, intermediate + row * 16, bd);
  }

  // Pass 2: 1-D idct16 on each column; round, clip and accumulate.
  for (col = 0; col < 16; ++col) {
    for (row = 0; row < 16; ++row) col_in[row] = intermediate[row * 16 + col];
    vpx_highbd_idct16_c(col_in, col_out, bd);
    for (row = 0; row < 16; ++row) {
      dest[row * stride + col] = highbd_clip_pixel_add(
          dest[row * stride + col], ROUND_POWER_OF_TWO(col_out[row], 6), bd);
    }
  }
}
2085
// DC-only 16x16 high-bitdepth inverse DCT: when only the DC coefficient
// is non-zero, every pixel of the reconstructed block receives the same
// value, so it is computed once and added across the whole 16x16 block.
void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                  int stride, int bd) {
  int row, col;
  tran_high_t dc;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  // The cospi_16_64 scaling is applied once per transform dimension.
  tran_low_t out =
      HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);

  out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
  dc = ROUND_POWER_OF_TWO(out, 6);
  for (row = 0; row < 16; ++row) {
    for (col = 0; col < 16; ++col) {
      dest[col] = highbd_clip_pixel_add(dest[col], dc, bd);
    }
    dest += stride;
  }
}
2101
// High-bitdepth 32-point inverse DCT (1-D). Standard butterfly
// decomposition in seven stages plus a final add/subtract stage.
// Products of cospi constants are normalized with dct_const_round_shift
// and every intermediate sum is wrapped to the valid range for the bit
// depth via HIGHBD_WRAPLOW.
highbd_idct32_c(const tran_low_t * input,tran_low_t * output,int bd)2102 static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
2103 int bd) {
2104 tran_low_t step1[32], step2[32];
2105 tran_high_t temp1, temp2;
2106 (void)bd;
2107
// Out-of-range coefficients could overflow the fixed-point arithmetic
// below; treat the block as invalid and emit all zeros.
2108 if (detect_invalid_highbd_input(input, 32)) {
2109 #if CONFIG_COEFFICIENT_RANGE_CHECKING
2110 assert(0 && "invalid highbd txfm input");
2111 #endif // CONFIG_COEFFICIENT_RANGE_CHECKING
2112 memset(output, 0, sizeof(*output) * 32);
2113 return;
2114 }
2115
2116 // stage 1
// Even-indexed inputs are bit-reverse-reordered into step1[0..15];
// odd-indexed inputs feed the rotation butterflies for step1[16..31].
2117 step1[0] = input[0];
2118 step1[1] = input[16];
2119 step1[2] = input[8];
2120 step1[3] = input[24];
2121 step1[4] = input[4];
2122 step1[5] = input[20];
2123 step1[6] = input[12];
2124 step1[7] = input[28];
2125 step1[8] = input[2];
2126 step1[9] = input[18];
2127 step1[10] = input[10];
2128 step1[11] = input[26];
2129 step1[12] = input[6];
2130 step1[13] = input[22];
2131 step1[14] = input[14];
2132 step1[15] = input[30];
2133
2134 temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
2135 temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
2136 step1[16] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2137 step1[31] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2138
2139 temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
2140 temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
2141 step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2142 step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2143
2144 temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
2145 temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
2146 step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2147 step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2148
2149 temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
2150 temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
2151 step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2152 step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2153
2154 temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
2155 temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
2156 step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2157 step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2158
2159 temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
2160 temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
2161 step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2162 step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2163
2164 temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
2165 temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
2166 step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2167 step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2168
2169 temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
2170 temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
2171 step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2172 step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2173
2174 // stage 2
2175 step2[0] = step1[0];
2176 step2[1] = step1[1];
2177 step2[2] = step1[2];
2178 step2[3] = step1[3];
2179 step2[4] = step1[4];
2180 step2[5] = step1[5];
2181 step2[6] = step1[6];
2182 step2[7] = step1[7];
2183
2184 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
2185 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
2186 step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2187 step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2188
2189 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
2190 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
2191 step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2192 step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2193
2194 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
2195 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
2196 step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2197 step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2198
2199 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
2200 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
2201 step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2202 step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2203
2204 step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd);
2205 step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd);
2206 step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd);
2207 step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd);
2208 step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd);
2209 step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd);
2210 step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd);
2211 step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd);
2212 step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd);
2213 step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd);
2214 step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd);
2215 step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd);
2216 step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd);
2217 step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd);
2218 step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd);
2219 step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd);
2220
2221 // stage 3
2222 step1[0] = step2[0];
2223 step1[1] = step2[1];
2224 step1[2] = step2[2];
2225 step1[3] = step2[3];
2226
2227 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
2228 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
2229 step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2230 step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2231 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
2232 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
2233 step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2234 step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2235
2236 step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
2237 step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
2238 step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
2239 step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
2240 step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
2241 step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
2242 step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
2243 step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
2244
2245 step1[16] = step2[16];
2246 step1[31] = step2[31];
2247 temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
2248 temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
2249 step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2250 step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2251 temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
2252 temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
2253 step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2254 step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2255 step1[19] = step2[19];
2256 step1[20] = step2[20];
2257 temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
2258 temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
2259 step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2260 step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2261 temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
2262 temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
2263 step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2264 step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2265 step1[23] = step2[23];
2266 step1[24] = step2[24];
2267 step1[27] = step2[27];
2268 step1[28] = step2[28];
2269
2270 // stage 4
2271 temp1 = (step1[0] + step1[1]) * cospi_16_64;
2272 temp2 = (step1[0] - step1[1]) * cospi_16_64;
2273 step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2274 step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2275 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
2276 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
2277 step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2278 step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2279 step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
2280 step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
2281 step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
2282 step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
2283
2284 step2[8] = step1[8];
2285 step2[15] = step1[15];
2286 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
2287 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
2288 step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2289 step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2290 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
2291 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
2292 step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2293 step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2294 step2[11] = step1[11];
2295 step2[12] = step1[12];
2296
2297 step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd);
2298 step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd);
2299 step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd);
2300 step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd);
2301 step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd);
2302 step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd);
2303 step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd);
2304 step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd);
2305
2306 step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd);
2307 step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd);
2308 step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd);
2309 step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd);
2310 step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd);
2311 step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd);
2312 step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd);
2313 step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd);
2314
2315 // stage 5
2316 step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
2317 step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
2318 step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
2319 step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
2320 step1[4] = step2[4];
2321 temp1 = (step2[6] - step2[5]) * cospi_16_64;
2322 temp2 = (step2[5] + step2[6]) * cospi_16_64;
2323 step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2324 step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2325 step1[7] = step2[7];
2326
2327 step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
2328 step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
2329 step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
2330 step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
2331 step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
2332 step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
2333 step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
2334 step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
2335
2336 step1[16] = step2[16];
2337 step1[17] = step2[17];
2338 temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
2339 temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
2340 step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2341 step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2342 temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
2343 temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
2344 step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2345 step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2346 temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
2347 temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
2348 step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2349 step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2350 temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
2351 temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
2352 step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2353 step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2354 step1[22] = step2[22];
2355 step1[23] = step2[23];
2356 step1[24] = step2[24];
2357 step1[25] = step2[25];
2358 step1[30] = step2[30];
2359 step1[31] = step2[31];
2360
2361 // stage 6
2362 step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
2363 step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
2364 step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
2365 step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
2366 step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
2367 step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
2368 step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
2369 step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
2370 step2[8] = step1[8];
2371 step2[9] = step1[9];
2372 temp1 = (-step1[10] + step1[13]) * cospi_16_64;
2373 temp2 = (step1[10] + step1[13]) * cospi_16_64;
2374 step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2375 step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2376 temp1 = (-step1[11] + step1[12]) * cospi_16_64;
2377 temp2 = (step1[11] + step1[12]) * cospi_16_64;
2378 step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2379 step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2380 step2[14] = step1[14];
2381 step2[15] = step1[15];
2382
2383 step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd);
2384 step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd);
2385 step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd);
2386 step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd);
2387 step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd);
2388 step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd);
2389 step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd);
2390 step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd);
2391
2392 step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd);
2393 step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd);
2394 step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd);
2395 step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd);
2396 step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd);
2397 step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd);
2398 step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd);
2399 step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd);
2400
2401 // stage 7
2402 step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
2403 step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
2404 step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
2405 step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
2406 step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
2407 step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
2408 step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
2409 step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
2410 step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
2411 step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
2412 step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
2413 step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
2414 step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
2415 step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
2416 step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
2417 step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
2418
2419 step1[16] = step2[16];
2420 step1[17] = step2[17];
2421 step1[18] = step2[18];
2422 step1[19] = step2[19];
2423 temp1 = (-step2[20] + step2[27]) * cospi_16_64;
2424 temp2 = (step2[20] + step2[27]) * cospi_16_64;
2425 step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2426 step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2427 temp1 = (-step2[21] + step2[26]) * cospi_16_64;
2428 temp2 = (step2[21] + step2[26]) * cospi_16_64;
2429 step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2430 step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2431 temp1 = (-step2[22] + step2[25]) * cospi_16_64;
2432 temp2 = (step2[22] + step2[25]) * cospi_16_64;
2433 step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2434 step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2435 temp1 = (-step2[23] + step2[24]) * cospi_16_64;
2436 temp2 = (step2[23] + step2[24]) * cospi_16_64;
2437 step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2438 step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2439 step1[28] = step2[28];
2440 step1[29] = step2[29];
2441 step1[30] = step2[30];
2442 step1[31] = step2[31];
2443
2444 // final stage
// Mirror-image butterfly: output[i] and output[31-i] are the sum and
// difference of step1[i] and step1[31-i].
2445 output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd);
2446 output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd);
2447 output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd);
2448 output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd);
2449 output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd);
2450 output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd);
2451 output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd);
2452 output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd);
2453 output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd);
2454 output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd);
2455 output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd);
2456 output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd);
2457 output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd);
2458 output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd);
2459 output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd);
2460 output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd);
2461 output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd);
2462 output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd);
2463 output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd);
2464 output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd);
2465 output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd);
2466 output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd);
2467 output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd);
2468 output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd);
2469 output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd);
2470 output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd);
2471 output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd);
2472 output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd);
2473 output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd);
2474 output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd);
2475 output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd);
2476 output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd);
2477 }
2478
vpx_highbd_idct32x32_1024_add_c(const tran_low_t * input,uint8_t * dest8,int stride,int bd)2479 void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
2480 int stride, int bd) {
2481 int i, j;
2482 tran_low_t out[32 * 32];
2483 tran_low_t *outptr = out;
2484 tran_low_t temp_in[32], temp_out[32];
2485 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2486
2487 // Rows
2488 for (i = 0; i < 32; ++i) {
2489 tran_low_t zero_coeff[16];
2490 for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
2491 for (j = 0; j < 8; ++j)
2492 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
2493 for (j = 0; j < 4; ++j)
2494 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
2495 for (j = 0; j < 2; ++j)
2496 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
2497
2498 if (zero_coeff[0] | zero_coeff[1])
2499 highbd_idct32_c(input, outptr, bd);
2500 else
2501 memset(outptr, 0, sizeof(tran_low_t) * 32);
2502 input += 32;
2503 outptr += 32;
2504 }
2505
2506 // Columns
2507 for (i = 0; i < 32; ++i) {
2508 for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
2509 highbd_idct32_c(temp_in, temp_out, bd);
2510 for (j = 0; j < 32; ++j) {
2511 dest[j * stride + i] = highbd_clip_pixel_add(
2512 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2513 }
2514 }
2515 }
2516
void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
                                   int stride, int bd) {
  int row, col;
  tran_low_t out[32 * 32] = { 0 };
  tran_low_t *row_out = out;
  tran_low_t col_in[32], col_out[32];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Rows: with at most 34 non-zero coefficients, only the upper-left 8x8
  // block of the input can be populated, so only the first 8 rows need the
  // 1-D transform. Rows 8..31 stay zero from the aggregate initializer.
  for (row = 0; row < 8; ++row, input += 32, row_out += 32)
    highbd_idct32_c(input, row_out, bd);

  // Columns: transform every column, then round by 1/64 (shift of 6) and
  // accumulate into the destination, clipping to the pixel range for bd.
  for (col = 0; col < 32; ++col) {
    for (row = 0; row < 32; ++row) col_in[row] = out[row * 32 + col];
    highbd_idct32_c(col_in, col_out, bd);
    for (row = 0; row < 32; ++row) {
      dest[row * stride + col] = highbd_clip_pixel_add(
          dest[row * stride + col], ROUND_POWER_OF_TWO(col_out[row], 6), bd);
    }
  }
}
2543
vpx_highbd_idct32x32_1_add_c(const tran_low_t * input,uint8_t * dest8,int stride,int bd)2544 void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
2545 int stride, int bd) {
2546 int i, j;
2547 int a1;
2548 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2549 tran_low_t out =
2550 HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
2551
2552 out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
2553 a1 = ROUND_POWER_OF_TWO(out, 6);
2554
2555 for (j = 0; j < 32; ++j) {
2556 for (i = 0; i < 32; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
2557 dest += stride;
2558 }
2559 }
2560
2561 #endif // CONFIG_VP9_HIGHBITDEPTH
2562