1 /*
2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <math.h>
12 #include <stdlib.h>
13 #include <string.h>
14 
15 #include "./vpx_dsp_rtcd.h"
16 #include "vpx_dsp/inv_txfm.h"
17 
// Full 4x4 inverse Walsh-Hadamard transform: two 1-D passes (rows, then
// columns) over the 16 coefficients, with the column-pass result added to
// the destination pixels through clip_pixel_add(). The butterfly ordering
// below must be kept exactly as written to remain bit-exact with the
// forward transform's reversible integer lifting steps.
void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     0.5 shifts per pixel. */
  int i;
  tran_low_t output[16];
  tran_high_t a1, b1, c1, d1, e1;
  const tran_low_t *ip = input;
  tran_low_t *op = output;

  // Pass 1: transform each row of the input into the intermediate buffer.
  for (i = 0; i < 4; i++) {
    // Coefficients are pre-scaled by UNIT_QUANT_SHIFT; undo that here.
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
    // Reversible lifting steps (inverse of the encoder's forward WHT).
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    op[0] = WRAPLOW(a1);
    op[1] = WRAPLOW(b1);
    op[2] = WRAPLOW(c1);
    op[3] = WRAPLOW(d1);
    ip += 4;
    op += 4;
  }

  // Pass 2: transform each column of the intermediate buffer and add the
  // result to the destination (clipped to valid pixel range).
  ip = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[4 * 0];
    c1 = ip[4 * 1];
    d1 = ip[4 * 2];
    b1 = ip[4 * 3];
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1));
    dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1));
    dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1));
    dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1));

    ip++;
    dest++;
  }
}
69 
// 4x4 inverse WHT when only the DC coefficient is non-zero. The row pass
// reduces to computing two values (a1 for column 0, e1 for columns 1..3);
// the column pass then repeats the same split per column and accumulates
// into the destination with pixel clipping.
void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int stride) {
  int i;
  tran_high_t a1, e1;
  tran_low_t tmp[4];
  const tran_low_t *ip = in;
  tran_low_t *op = tmp;

  // Row pass for a DC-only input: first row becomes (a1, e1, e1, e1).
  a1 = ip[0] >> UNIT_QUANT_SHIFT;
  e1 = a1 >> 1;
  a1 -= e1;
  op[0] = WRAPLOW(a1);
  op[1] = op[2] = op[3] = WRAPLOW(e1);

  // Column pass: each column splits its single non-zero value the same way
  // and adds (a1, e1, e1, e1) down the column.
  ip = tmp;
  for (i = 0; i < 4; i++) {
    e1 = ip[0] >> 1;
    a1 = ip[0] - e1;
    dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
    dest[stride * 1] = clip_pixel_add(dest[stride * 1], e1);
    dest[stride * 2] = clip_pixel_add(dest[stride * 2], e1);
    dest[stride * 3] = clip_pixel_add(dest[stride * 3], e1);
    ip++;
    dest++;
  }
}
95 
// 1-D 4-point inverse DCT. Products with the cospi_* constants are held in
// tran_high_t and rounded back to coefficient range via
// dct_const_round_shift() + WRAPLOW(), matching the bit-exact reference.
void idct4_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step[4];
  tran_high_t temp1, temp2;

  // stage 1: butterfly on the even pair (0,2) and rotation on the odd
  // pair (1,3).
  temp1 = (input[0] + input[2]) * cospi_16_64;
  temp2 = (input[0] - input[2]) * cospi_16_64;
  step[0] = WRAPLOW(dct_const_round_shift(temp1));
  step[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  step[2] = WRAPLOW(dct_const_round_shift(temp1));
  step[3] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2: combine even and odd halves.
  output[0] = WRAPLOW(step[0] + step[3]);
  output[1] = WRAPLOW(step[1] + step[2]);
  output[2] = WRAPLOW(step[1] - step[2]);
  output[3] = WRAPLOW(step[0] - step[3]);
}
116 
// Full 4x4 inverse DCT (all 16 coefficients): a row pass into a scratch
// buffer, then a column pass whose rounded result is accumulated into the
// destination with pixel clipping (final scaling: >> 4).
void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t buffer[4 * 4];
  tran_low_t col_in[4], col_out[4];
  int row, col;

  // Row pass: each input row is transformed into the scratch buffer.
  for (row = 0; row < 4; ++row) {
    idct4_c(input + 4 * row, buffer + 4 * row);
  }

  // Column pass: gather a column, transform it, then round and add.
  for (col = 0; col < 4; ++col) {
    for (row = 0; row < 4; ++row) {
      col_in[row] = buffer[4 * row + col];
    }
    idct4_c(col_in, col_out);
    for (row = 0; row < 4; ++row) {
      dest[row * stride + col] = clip_pixel_add(
          dest[row * stride + col], ROUND_POWER_OF_TWO(col_out[row], 4));
    }
  }
}
140 
// 4x4 inverse DCT when only the DC coefficient is present: every pixel in
// the 4x4 block receives the same rounded DC contribution.
void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int r, c;
  tran_high_t dc_value;
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));

  // Apply the cospi_16_64 scaling once per 1-D pass, then the final >> 4.
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  dc_value = ROUND_POWER_OF_TWO(out, 4);

  for (r = 0; r < 4; r++) {
    for (c = 0; c < 4; c++) {
      dest[c] = clip_pixel_add(dest[c], dc_value);
    }
    dest += stride;
  }
}
157 
// 1-D 8-point inverse DCT. Stages 2-3 on the even coefficients reproduce
// the 4-point butterfly of idct4_c; stages 1-3 on the odd coefficients
// build the odd half, and stage 4 combines the two halves.
void idct8_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;

  // stage 1: load even coefficients in permuted order and rotate the odd
  // coefficients (1,7) and (5,3).
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2: 4-point butterflies on the even half; add/sub on the odd half.
  temp1 = (step1[0] + step1[2]) * cospi_16_64;
  temp2 = (step1[0] - step1[2]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);

  // stage 3: finish the even half; rotate the middle odd pair by pi/4.
  step1[0] = WRAPLOW(step2[0] + step2[3]);
  step1[1] = WRAPLOW(step2[1] + step2[2]);
  step1[2] = WRAPLOW(step2[1] - step2[2]);
  step1[3] = WRAPLOW(step2[0] - step2[3]);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];

  // stage 4: combine even and odd halves into the final outputs.
  output[0] = WRAPLOW(step1[0] + step1[7]);
  output[1] = WRAPLOW(step1[1] + step1[6]);
  output[2] = WRAPLOW(step1[2] + step1[5]);
  output[3] = WRAPLOW(step1[3] + step1[4]);
  output[4] = WRAPLOW(step1[3] - step1[4]);
  output[5] = WRAPLOW(step1[2] - step1[5]);
  output[6] = WRAPLOW(step1[1] - step1[6]);
  output[7] = WRAPLOW(step1[0] - step1[7]);
}
212 
// Full 8x8 inverse DCT (all 64 coefficients): row pass into a scratch
// buffer, column pass accumulated into the destination (final scaling:
// >> 5, i.e. rows + columns share a total shift of 5).
void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t buffer[8 * 8];
  tran_low_t col_in[8], col_out[8];
  int row, col;

  // Row pass.
  for (row = 0; row < 8; ++row) {
    idct8_c(input + 8 * row, buffer + 8 * row);
  }

  // Column pass with rounding and clipped accumulation.
  for (col = 0; col < 8; ++col) {
    for (row = 0; row < 8; ++row) {
      col_in[row] = buffer[8 * row + col];
    }
    idct8_c(col_in, col_out);
    for (row = 0; row < 8; ++row) {
      dest[row * stride + col] = clip_pixel_add(
          dest[row * stride + col], ROUND_POWER_OF_TWO(col_out[row], 5));
    }
  }
}
236 
// 8x8 inverse DCT when only the DC coefficient is present: add one rounded
// DC contribution to every pixel of the 8x8 block.
void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int r, c;
  tran_high_t dc_value;
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));

  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  dc_value = ROUND_POWER_OF_TWO(out, 5);
  for (r = 0; r < 8; ++r) {
    uint8_t *row = dest + r * stride;
    for (c = 0; c < 8; ++c) {
      row[c] = clip_pixel_add(row[c], dc_value);
    }
  }
}
249 
// 1-D 4-point inverse ADST (asymmetric DST) using the sinpi_* basis.
// An all-zero input short-circuits to an all-zero output.
void iadst4_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
  tran_low_t x0 = input[0];
  tran_low_t x1 = input[1];
  tran_low_t x2 = input[2];
  tran_low_t x3 = input[3];

  // Fast path: nothing to transform.
  if (!(x0 | x1 | x2 | x3)) {
    memset(output, 0, 4 * sizeof(*output));
    return;
  }

  // Partial products against the sinpi constants.
  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = WRAPLOW(x0 - x2 + x3);

  s0 = s0 + s3 + s5;
  s1 = s1 - s4 - s6;
  s3 = s2;
  s2 = sinpi_3_9 * s7;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3));
  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3));
  output[2] = WRAPLOW(dct_const_round_shift(s2));
  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
}
285 
// 1-D 8-point inverse ADST. Inputs are read in a fixed permuted order and
// intermediates are deliberately truncated to 32-bit int (the explicit
// (int) casts) to stay bit-exact with the reference implementation.
// An all-zero input short-circuits to an all-zero output.
void iadst8_c(const tran_low_t *input, tran_low_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7;
  tran_high_t x0 = input[7];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[5];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[3];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[1];
  tran_high_t x7 = input[6];

  // Fast path: nothing to transform.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    memset(output, 0, 8 * sizeof(*output));
    return;
  }

  // stage 1: rotations pairing (x0,x1), (x2,x3), (x4,x5), (x6,x7).
  s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
  s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
  s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
  s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
  s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
  s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
  s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
  s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);

  x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
  x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
  x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
  x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
  x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
  x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
  x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s3 - s7));

  // stage 2: pass-through for the first half, rotations for the second.
  s0 = (int)x0;
  s1 = (int)x1;
  s2 = (int)x2;
  s3 = (int)x3;
  s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
  s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
  s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
  s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);

  x0 = WRAPLOW(s0 + s2);
  x1 = WRAPLOW(s1 + s3);
  x2 = WRAPLOW(s0 - s2);
  x3 = WRAPLOW(s1 - s3);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));

  // stage 3: pi/4 rotations on the remaining pairs.
  s2 = (int)(cospi_16_64 * (x2 + x3));
  s3 = (int)(cospi_16_64 * (x2 - x3));
  s6 = (int)(cospi_16_64 * (x6 + x7));
  s7 = (int)(cospi_16_64 * (x6 - x7));

  x2 = WRAPLOW(dct_const_round_shift(s2));
  x3 = WRAPLOW(dct_const_round_shift(s3));
  x6 = WRAPLOW(dct_const_round_shift(s6));
  x7 = WRAPLOW(dct_const_round_shift(s7));

  // Final output permutation with sign flips on odd positions.
  output[0] = WRAPLOW(x0);
  output[1] = WRAPLOW(-x4);
  output[2] = WRAPLOW(x6);
  output[3] = WRAPLOW(-x2);
  output[4] = WRAPLOW(x3);
  output[5] = WRAPLOW(-x7);
  output[6] = WRAPLOW(x5);
  output[7] = WRAPLOW(-x1);
}
360 
// 8x8 inverse DCT when at most 12 coefficients are non-zero (all within
// the first 4 rows): only those rows are transformed; the scratch buffer
// is zero-initialized so the column pass sees zeros for rows 4..7.
void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t buffer[8 * 8] = { 0 };
  tran_low_t col_in[8], col_out[8];
  int row, col;

  // Row pass: only the first 4 rows carry non-zero coefficients.
  for (row = 0; row < 4; ++row) {
    idct8_c(input + 8 * row, buffer + 8 * row);
  }

  // Column pass over all 8 columns, accumulated into the destination.
  for (col = 0; col < 8; ++col) {
    for (row = 0; row < 8; ++row) {
      col_in[row] = buffer[8 * row + col];
    }
    idct8_c(col_in, col_out);
    for (row = 0; row < 8; ++row) {
      dest[row * stride + col] = clip_pixel_add(
          dest[row * stride + col], ROUND_POWER_OF_TWO(col_out[row], 5));
    }
  }
}
385 
// 1-D 16-point inverse DCT. Stage 1 loads the inputs in bit-reversed order
// (indices written as 2n/2 to mirror the 32-point layout); subsequent
// stages perform the butterfly/rotation network, with every rotation
// rounded via dct_const_round_shift() + WRAPLOW() to stay bit-exact.
void idct16_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[16], step2[16];
  tran_high_t temp1, temp2;

  // stage 1: bit-reversed input permutation.
  step1[0] = input[0 / 2];
  step1[1] = input[16 / 2];
  step1[2] = input[8 / 2];
  step1[3] = input[24 / 2];
  step1[4] = input[4 / 2];
  step1[5] = input[20 / 2];
  step1[6] = input[12 / 2];
  step1[7] = input[28 / 2];
  step1[8] = input[2 / 2];
  step1[9] = input[18 / 2];
  step1[10] = input[10 / 2];
  step1[11] = input[26 / 2];
  step1[12] = input[6 / 2];
  step1[13] = input[22 / 2];
  step1[14] = input[14 / 2];
  step1[15] = input[30 / 2];

  // stage 2: pass-through for 0..7; rotations for the odd half 8..15.
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
  step2[15] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 3: rotations for 4..7; add/sub butterflies for 8..15.
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  step1[8] = WRAPLOW(step2[8] + step2[9]);
  step1[9] = WRAPLOW(step2[8] - step2[9]);
  step1[10] = WRAPLOW(-step2[10] + step2[11]);
  step1[11] = WRAPLOW(step2[10] + step2[11]);
  step1[12] = WRAPLOW(step2[12] + step2[13]);
  step1[13] = WRAPLOW(step2[12] - step2[13]);
  step1[14] = WRAPLOW(-step2[14] + step2[15]);
  step1[15] = WRAPLOW(step2[14] + step2[15]);

  // stage 4: 4-point butterflies for 0..3; add/sub for 4..7; rotations for
  // the inner odd pairs (9,14) and (10,13).
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5: finish the even 8-point half; add/sub across the odd half.
  step1[0] = WRAPLOW(step2[0] + step2[3]);
  step1[1] = WRAPLOW(step2[1] + step2[2]);
  step1[2] = WRAPLOW(step2[1] - step2[2]);
  step1[3] = WRAPLOW(step2[0] - step2[3]);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11]);
  step1[9] = WRAPLOW(step2[9] + step2[10]);
  step1[10] = WRAPLOW(step2[9] - step2[10]);
  step1[11] = WRAPLOW(step2[8] - step2[11]);
  step1[12] = WRAPLOW(-step2[12] + step2[15]);
  step1[13] = WRAPLOW(-step2[13] + step2[14]);
  step1[14] = WRAPLOW(step2[13] + step2[14]);
  step1[15] = WRAPLOW(step2[12] + step2[15]);

  // stage 6: combine the even half; pi/4 rotations on (10,13) and (11,12).
  step2[0] = WRAPLOW(step1[0] + step1[7]);
  step2[1] = WRAPLOW(step1[1] + step1[6]);
  step2[2] = WRAPLOW(step1[2] + step1[5]);
  step2[3] = WRAPLOW(step1[3] + step1[4]);
  step2[4] = WRAPLOW(step1[3] - step1[4]);
  step2[5] = WRAPLOW(step1[2] - step1[5]);
  step2[6] = WRAPLOW(step1[1] - step1[6]);
  step2[7] = WRAPLOW(step1[0] - step1[7]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7: final combination of even (0..7) and odd (8..15) halves.
  output[0] = WRAPLOW(step2[0] + step2[15]);
  output[1] = WRAPLOW(step2[1] + step2[14]);
  output[2] = WRAPLOW(step2[2] + step2[13]);
  output[3] = WRAPLOW(step2[3] + step2[12]);
  output[4] = WRAPLOW(step2[4] + step2[11]);
  output[5] = WRAPLOW(step2[5] + step2[10]);
  output[6] = WRAPLOW(step2[6] + step2[9]);
  output[7] = WRAPLOW(step2[7] + step2[8]);
  output[8] = WRAPLOW(step2[7] - step2[8]);
  output[9] = WRAPLOW(step2[6] - step2[9]);
  output[10] = WRAPLOW(step2[5] - step2[10]);
  output[11] = WRAPLOW(step2[4] - step2[11]);
  output[12] = WRAPLOW(step2[3] - step2[12]);
  output[13] = WRAPLOW(step2[2] - step2[13]);
  output[14] = WRAPLOW(step2[1] - step2[14]);
  output[15] = WRAPLOW(step2[0] - step2[15]);
}
550 
// Full 16x16 inverse DCT (all 256 coefficients): row pass into a scratch
// buffer, column pass accumulated into the destination (final scaling:
// >> 6).
void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  tran_low_t buffer[16 * 16];
  tran_low_t col_in[16], col_out[16];
  int row, col;

  // Row pass.
  for (row = 0; row < 16; ++row) {
    idct16_c(input + 16 * row, buffer + 16 * row);
  }

  // Column pass with rounding and clipped accumulation.
  for (col = 0; col < 16; ++col) {
    for (row = 0; row < 16; ++row) {
      col_in[row] = buffer[16 * row + col];
    }
    idct16_c(col_in, col_out);
    for (row = 0; row < 16; ++row) {
      dest[row * stride + col] = clip_pixel_add(
          dest[row * stride + col], ROUND_POWER_OF_TWO(col_out[row], 6));
    }
  }
}
575 
// 1-D 16-point inverse ADST. Inputs are read in a fixed permuted order;
// four rotation/butterfly stages follow, each rounded via
// dct_const_round_shift() + WRAPLOW() where a cospi multiply occurred.
// An all-zero input short-circuits to an all-zero output.
void iadst16_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;
  tran_high_t x0 = input[15];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[13];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[11];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[9];
  tran_high_t x7 = input[6];
  tran_high_t x8 = input[7];
  tran_high_t x9 = input[8];
  tran_high_t x10 = input[5];
  tran_high_t x11 = input[10];
  tran_high_t x12 = input[3];
  tran_high_t x13 = input[12];
  tran_high_t x14 = input[1];
  tran_high_t x15 = input[14];

  // Fast path: nothing to transform.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
        x13 | x14 | x15)) {
    memset(output, 0, 16 * sizeof(*output));
    return;
  }

  // stage 1: odd-angle rotations on the eight input pairs.
  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;

  x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
  x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
  x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
  x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
  x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
  x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
  x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
  x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
  x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
  x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
  x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
  x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
  x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
  x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s7 - s15));

  // stage 2: pass-through for x0..x7; rotations for x8..x15.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

  x0 = WRAPLOW(s0 + s4);
  x1 = WRAPLOW(s1 + s5);
  x2 = WRAPLOW(s2 + s6);
  x3 = WRAPLOW(s3 + s7);
  x4 = WRAPLOW(s0 - s4);
  x5 = WRAPLOW(s1 - s5);
  x6 = WRAPLOW(s2 - s6);
  x7 = WRAPLOW(s3 - s7);
  x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
  x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
  x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
  x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
  x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
  x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
  x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s11 - s15));

  // stage 3: rotations on (x4,x5), (x6,x7), (x12,x13), (x14,x15).
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

  x0 = WRAPLOW(s0 + s2);
  x1 = WRAPLOW(s1 + s3);
  x2 = WRAPLOW(s0 - s2);
  x3 = WRAPLOW(s1 - s3);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
  x8 = WRAPLOW(s8 + s10);
  x9 = WRAPLOW(s9 + s11);
  x10 = WRAPLOW(s8 - s10);
  x11 = WRAPLOW(s9 - s11);
  x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
  x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
  x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s13 - s15));

  // stage 4: pi/4 rotations (with sign conventions) on the residual pairs.
  s2 = (-cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (-x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (-x10 + x11);
  s14 = (-cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = WRAPLOW(dct_const_round_shift(s2));
  x3 = WRAPLOW(dct_const_round_shift(s3));
  x6 = WRAPLOW(dct_const_round_shift(s6));
  x7 = WRAPLOW(dct_const_round_shift(s7));
  x10 = WRAPLOW(dct_const_round_shift(s10));
  x11 = WRAPLOW(dct_const_round_shift(s11));
  x14 = WRAPLOW(dct_const_round_shift(s14));
  x15 = WRAPLOW(dct_const_round_shift(s15));

  // Final output permutation with sign flips.
  output[0] = WRAPLOW(x0);
  output[1] = WRAPLOW(-x8);
  output[2] = WRAPLOW(x12);
  output[3] = WRAPLOW(-x4);
  output[4] = WRAPLOW(x6);
  output[5] = WRAPLOW(x14);
  output[6] = WRAPLOW(x10);
  output[7] = WRAPLOW(x2);
  output[8] = WRAPLOW(x3);
  output[9] = WRAPLOW(x11);
  output[10] = WRAPLOW(x15);
  output[11] = WRAPLOW(x7);
  output[12] = WRAPLOW(x5);
  output[13] = WRAPLOW(-x13);
  output[14] = WRAPLOW(x9);
  output[15] = WRAPLOW(-x1);
}
743 
// 16x16 inverse DCT when at most 10 coefficients are non-zero (all within
// the upper-left 4x4 area): only the first 4 rows are transformed; the
// scratch buffer is zero-initialized so the column pass sees zeros for
// rows 4..15.
void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  tran_low_t buffer[16 * 16] = { 0 };
  tran_low_t col_in[16], col_out[16];
  int row, col;

  // Row pass: only the first 4 rows carry non-zero coefficients.
  for (row = 0; row < 4; ++row) {
    idct16_c(input + 16 * row, buffer + 16 * row);
  }

  // Column pass over all 16 columns, accumulated into the destination.
  for (col = 0; col < 16; ++col) {
    for (row = 0; row < 16; ++row) {
      col_in[row] = buffer[16 * row + col];
    }
    idct16_c(col_in, col_out);
    for (row = 0; row < 16; ++row) {
      dest[row * stride + col] = clip_pixel_add(
          dest[row * stride + col], ROUND_POWER_OF_TWO(col_out[row], 6));
    }
  }
}
769 
// 16x16 inverse DCT when only the DC coefficient is present: add one
// rounded DC contribution to every pixel of the 16x16 block.
void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int r, c;
  tran_high_t dc_value;
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));

  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  dc_value = ROUND_POWER_OF_TWO(out, 6);
  for (r = 0; r < 16; ++r) {
    uint8_t *row = dest + r * stride;
    for (c = 0; c < 16; ++c) {
      row[c] = clip_pixel_add(row[c], dc_value);
    }
  }
}
782 
idct32_c(const tran_low_t * input,tran_low_t * output)783 void idct32_c(const tran_low_t *input, tran_low_t *output) {
784   tran_low_t step1[32], step2[32];
785   tran_high_t temp1, temp2;
786 
787   // stage 1
788   step1[0] = input[0];
789   step1[1] = input[16];
790   step1[2] = input[8];
791   step1[3] = input[24];
792   step1[4] = input[4];
793   step1[5] = input[20];
794   step1[6] = input[12];
795   step1[7] = input[28];
796   step1[8] = input[2];
797   step1[9] = input[18];
798   step1[10] = input[10];
799   step1[11] = input[26];
800   step1[12] = input[6];
801   step1[13] = input[22];
802   step1[14] = input[14];
803   step1[15] = input[30];
804 
805   temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
806   temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
807   step1[16] = WRAPLOW(dct_const_round_shift(temp1));
808   step1[31] = WRAPLOW(dct_const_round_shift(temp2));
809 
810   temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
811   temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
812   step1[17] = WRAPLOW(dct_const_round_shift(temp1));
813   step1[30] = WRAPLOW(dct_const_round_shift(temp2));
814 
815   temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
816   temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
817   step1[18] = WRAPLOW(dct_const_round_shift(temp1));
818   step1[29] = WRAPLOW(dct_const_round_shift(temp2));
819 
820   temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
821   temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
822   step1[19] = WRAPLOW(dct_const_round_shift(temp1));
823   step1[28] = WRAPLOW(dct_const_round_shift(temp2));
824 
825   temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
826   temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
827   step1[20] = WRAPLOW(dct_const_round_shift(temp1));
828   step1[27] = WRAPLOW(dct_const_round_shift(temp2));
829 
830   temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
831   temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
832   step1[21] = WRAPLOW(dct_const_round_shift(temp1));
833   step1[26] = WRAPLOW(dct_const_round_shift(temp2));
834 
835   temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
836   temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
837   step1[22] = WRAPLOW(dct_const_round_shift(temp1));
838   step1[25] = WRAPLOW(dct_const_round_shift(temp2));
839 
840   temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
841   temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
842   step1[23] = WRAPLOW(dct_const_round_shift(temp1));
843   step1[24] = WRAPLOW(dct_const_round_shift(temp2));
844 
845   // stage 2
846   step2[0] = step1[0];
847   step2[1] = step1[1];
848   step2[2] = step1[2];
849   step2[3] = step1[3];
850   step2[4] = step1[4];
851   step2[5] = step1[5];
852   step2[6] = step1[6];
853   step2[7] = step1[7];
854 
855   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
856   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
857   step2[8] = WRAPLOW(dct_const_round_shift(temp1));
858   step2[15] = WRAPLOW(dct_const_round_shift(temp2));
859 
860   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
861   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
862   step2[9] = WRAPLOW(dct_const_round_shift(temp1));
863   step2[14] = WRAPLOW(dct_const_round_shift(temp2));
864 
865   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
866   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
867   step2[10] = WRAPLOW(dct_const_round_shift(temp1));
868   step2[13] = WRAPLOW(dct_const_round_shift(temp2));
869 
870   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
871   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
872   step2[11] = WRAPLOW(dct_const_round_shift(temp1));
873   step2[12] = WRAPLOW(dct_const_round_shift(temp2));
874 
875   step2[16] = WRAPLOW(step1[16] + step1[17]);
876   step2[17] = WRAPLOW(step1[16] - step1[17]);
877   step2[18] = WRAPLOW(-step1[18] + step1[19]);
878   step2[19] = WRAPLOW(step1[18] + step1[19]);
879   step2[20] = WRAPLOW(step1[20] + step1[21]);
880   step2[21] = WRAPLOW(step1[20] - step1[21]);
881   step2[22] = WRAPLOW(-step1[22] + step1[23]);
882   step2[23] = WRAPLOW(step1[22] + step1[23]);
883   step2[24] = WRAPLOW(step1[24] + step1[25]);
884   step2[25] = WRAPLOW(step1[24] - step1[25]);
885   step2[26] = WRAPLOW(-step1[26] + step1[27]);
886   step2[27] = WRAPLOW(step1[26] + step1[27]);
887   step2[28] = WRAPLOW(step1[28] + step1[29]);
888   step2[29] = WRAPLOW(step1[28] - step1[29]);
889   step2[30] = WRAPLOW(-step1[30] + step1[31]);
890   step2[31] = WRAPLOW(step1[30] + step1[31]);
891 
892   // stage 3
893   step1[0] = step2[0];
894   step1[1] = step2[1];
895   step1[2] = step2[2];
896   step1[3] = step2[3];
897 
898   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
899   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
900   step1[4] = WRAPLOW(dct_const_round_shift(temp1));
901   step1[7] = WRAPLOW(dct_const_round_shift(temp2));
902   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
903   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
904   step1[5] = WRAPLOW(dct_const_round_shift(temp1));
905   step1[6] = WRAPLOW(dct_const_round_shift(temp2));
906 
907   step1[8] = WRAPLOW(step2[8] + step2[9]);
908   step1[9] = WRAPLOW(step2[8] - step2[9]);
909   step1[10] = WRAPLOW(-step2[10] + step2[11]);
910   step1[11] = WRAPLOW(step2[10] + step2[11]);
911   step1[12] = WRAPLOW(step2[12] + step2[13]);
912   step1[13] = WRAPLOW(step2[12] - step2[13]);
913   step1[14] = WRAPLOW(-step2[14] + step2[15]);
914   step1[15] = WRAPLOW(step2[14] + step2[15]);
915 
916   step1[16] = step2[16];
917   step1[31] = step2[31];
918   temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
919   temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
920   step1[17] = WRAPLOW(dct_const_round_shift(temp1));
921   step1[30] = WRAPLOW(dct_const_round_shift(temp2));
922   temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
923   temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
924   step1[18] = WRAPLOW(dct_const_round_shift(temp1));
925   step1[29] = WRAPLOW(dct_const_round_shift(temp2));
926   step1[19] = step2[19];
927   step1[20] = step2[20];
928   temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
929   temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
930   step1[21] = WRAPLOW(dct_const_round_shift(temp1));
931   step1[26] = WRAPLOW(dct_const_round_shift(temp2));
932   temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
933   temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
934   step1[22] = WRAPLOW(dct_const_round_shift(temp1));
935   step1[25] = WRAPLOW(dct_const_round_shift(temp2));
936   step1[23] = step2[23];
937   step1[24] = step2[24];
938   step1[27] = step2[27];
939   step1[28] = step2[28];
940 
941   // stage 4
942   temp1 = (step1[0] + step1[1]) * cospi_16_64;
943   temp2 = (step1[0] - step1[1]) * cospi_16_64;
944   step2[0] = WRAPLOW(dct_const_round_shift(temp1));
945   step2[1] = WRAPLOW(dct_const_round_shift(temp2));
946   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
947   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
948   step2[2] = WRAPLOW(dct_const_round_shift(temp1));
949   step2[3] = WRAPLOW(dct_const_round_shift(temp2));
950   step2[4] = WRAPLOW(step1[4] + step1[5]);
951   step2[5] = WRAPLOW(step1[4] - step1[5]);
952   step2[6] = WRAPLOW(-step1[6] + step1[7]);
953   step2[7] = WRAPLOW(step1[6] + step1[7]);
954 
955   step2[8] = step1[8];
956   step2[15] = step1[15];
957   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
958   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
959   step2[9] = WRAPLOW(dct_const_round_shift(temp1));
960   step2[14] = WRAPLOW(dct_const_round_shift(temp2));
961   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
962   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
963   step2[10] = WRAPLOW(dct_const_round_shift(temp1));
964   step2[13] = WRAPLOW(dct_const_round_shift(temp2));
965   step2[11] = step1[11];
966   step2[12] = step1[12];
967 
968   step2[16] = WRAPLOW(step1[16] + step1[19]);
969   step2[17] = WRAPLOW(step1[17] + step1[18]);
970   step2[18] = WRAPLOW(step1[17] - step1[18]);
971   step2[19] = WRAPLOW(step1[16] - step1[19]);
972   step2[20] = WRAPLOW(-step1[20] + step1[23]);
973   step2[21] = WRAPLOW(-step1[21] + step1[22]);
974   step2[22] = WRAPLOW(step1[21] + step1[22]);
975   step2[23] = WRAPLOW(step1[20] + step1[23]);
976 
977   step2[24] = WRAPLOW(step1[24] + step1[27]);
978   step2[25] = WRAPLOW(step1[25] + step1[26]);
979   step2[26] = WRAPLOW(step1[25] - step1[26]);
980   step2[27] = WRAPLOW(step1[24] - step1[27]);
981   step2[28] = WRAPLOW(-step1[28] + step1[31]);
982   step2[29] = WRAPLOW(-step1[29] + step1[30]);
983   step2[30] = WRAPLOW(step1[29] + step1[30]);
984   step2[31] = WRAPLOW(step1[28] + step1[31]);
985 
986   // stage 5
987   step1[0] = WRAPLOW(step2[0] + step2[3]);
988   step1[1] = WRAPLOW(step2[1] + step2[2]);
989   step1[2] = WRAPLOW(step2[1] - step2[2]);
990   step1[3] = WRAPLOW(step2[0] - step2[3]);
991   step1[4] = step2[4];
992   temp1 = (step2[6] - step2[5]) * cospi_16_64;
993   temp2 = (step2[5] + step2[6]) * cospi_16_64;
994   step1[5] = WRAPLOW(dct_const_round_shift(temp1));
995   step1[6] = WRAPLOW(dct_const_round_shift(temp2));
996   step1[7] = step2[7];
997 
998   step1[8] = WRAPLOW(step2[8] + step2[11]);
999   step1[9] = WRAPLOW(step2[9] + step2[10]);
1000   step1[10] = WRAPLOW(step2[9] - step2[10]);
1001   step1[11] = WRAPLOW(step2[8] - step2[11]);
1002   step1[12] = WRAPLOW(-step2[12] + step2[15]);
1003   step1[13] = WRAPLOW(-step2[13] + step2[14]);
1004   step1[14] = WRAPLOW(step2[13] + step2[14]);
1005   step1[15] = WRAPLOW(step2[12] + step2[15]);
1006 
1007   step1[16] = step2[16];
1008   step1[17] = step2[17];
1009   temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
1010   temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
1011   step1[18] = WRAPLOW(dct_const_round_shift(temp1));
1012   step1[29] = WRAPLOW(dct_const_round_shift(temp2));
1013   temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
1014   temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
1015   step1[19] = WRAPLOW(dct_const_round_shift(temp1));
1016   step1[28] = WRAPLOW(dct_const_round_shift(temp2));
1017   temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
1018   temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
1019   step1[20] = WRAPLOW(dct_const_round_shift(temp1));
1020   step1[27] = WRAPLOW(dct_const_round_shift(temp2));
1021   temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
1022   temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
1023   step1[21] = WRAPLOW(dct_const_round_shift(temp1));
1024   step1[26] = WRAPLOW(dct_const_round_shift(temp2));
1025   step1[22] = step2[22];
1026   step1[23] = step2[23];
1027   step1[24] = step2[24];
1028   step1[25] = step2[25];
1029   step1[30] = step2[30];
1030   step1[31] = step2[31];
1031 
1032   // stage 6
1033   step2[0] = WRAPLOW(step1[0] + step1[7]);
1034   step2[1] = WRAPLOW(step1[1] + step1[6]);
1035   step2[2] = WRAPLOW(step1[2] + step1[5]);
1036   step2[3] = WRAPLOW(step1[3] + step1[4]);
1037   step2[4] = WRAPLOW(step1[3] - step1[4]);
1038   step2[5] = WRAPLOW(step1[2] - step1[5]);
1039   step2[6] = WRAPLOW(step1[1] - step1[6]);
1040   step2[7] = WRAPLOW(step1[0] - step1[7]);
1041   step2[8] = step1[8];
1042   step2[9] = step1[9];
1043   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
1044   temp2 = (step1[10] + step1[13]) * cospi_16_64;
1045   step2[10] = WRAPLOW(dct_const_round_shift(temp1));
1046   step2[13] = WRAPLOW(dct_const_round_shift(temp2));
1047   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
1048   temp2 = (step1[11] + step1[12]) * cospi_16_64;
1049   step2[11] = WRAPLOW(dct_const_round_shift(temp1));
1050   step2[12] = WRAPLOW(dct_const_round_shift(temp2));
1051   step2[14] = step1[14];
1052   step2[15] = step1[15];
1053 
1054   step2[16] = WRAPLOW(step1[16] + step1[23]);
1055   step2[17] = WRAPLOW(step1[17] + step1[22]);
1056   step2[18] = WRAPLOW(step1[18] + step1[21]);
1057   step2[19] = WRAPLOW(step1[19] + step1[20]);
1058   step2[20] = WRAPLOW(step1[19] - step1[20]);
1059   step2[21] = WRAPLOW(step1[18] - step1[21]);
1060   step2[22] = WRAPLOW(step1[17] - step1[22]);
1061   step2[23] = WRAPLOW(step1[16] - step1[23]);
1062 
1063   step2[24] = WRAPLOW(-step1[24] + step1[31]);
1064   step2[25] = WRAPLOW(-step1[25] + step1[30]);
1065   step2[26] = WRAPLOW(-step1[26] + step1[29]);
1066   step2[27] = WRAPLOW(-step1[27] + step1[28]);
1067   step2[28] = WRAPLOW(step1[27] + step1[28]);
1068   step2[29] = WRAPLOW(step1[26] + step1[29]);
1069   step2[30] = WRAPLOW(step1[25] + step1[30]);
1070   step2[31] = WRAPLOW(step1[24] + step1[31]);
1071 
1072   // stage 7
1073   step1[0] = WRAPLOW(step2[0] + step2[15]);
1074   step1[1] = WRAPLOW(step2[1] + step2[14]);
1075   step1[2] = WRAPLOW(step2[2] + step2[13]);
1076   step1[3] = WRAPLOW(step2[3] + step2[12]);
1077   step1[4] = WRAPLOW(step2[4] + step2[11]);
1078   step1[5] = WRAPLOW(step2[5] + step2[10]);
1079   step1[6] = WRAPLOW(step2[6] + step2[9]);
1080   step1[7] = WRAPLOW(step2[7] + step2[8]);
1081   step1[8] = WRAPLOW(step2[7] - step2[8]);
1082   step1[9] = WRAPLOW(step2[6] - step2[9]);
1083   step1[10] = WRAPLOW(step2[5] - step2[10]);
1084   step1[11] = WRAPLOW(step2[4] - step2[11]);
1085   step1[12] = WRAPLOW(step2[3] - step2[12]);
1086   step1[13] = WRAPLOW(step2[2] - step2[13]);
1087   step1[14] = WRAPLOW(step2[1] - step2[14]);
1088   step1[15] = WRAPLOW(step2[0] - step2[15]);
1089 
1090   step1[16] = step2[16];
1091   step1[17] = step2[17];
1092   step1[18] = step2[18];
1093   step1[19] = step2[19];
1094   temp1 = (-step2[20] + step2[27]) * cospi_16_64;
1095   temp2 = (step2[20] + step2[27]) * cospi_16_64;
1096   step1[20] = WRAPLOW(dct_const_round_shift(temp1));
1097   step1[27] = WRAPLOW(dct_const_round_shift(temp2));
1098   temp1 = (-step2[21] + step2[26]) * cospi_16_64;
1099   temp2 = (step2[21] + step2[26]) * cospi_16_64;
1100   step1[21] = WRAPLOW(dct_const_round_shift(temp1));
1101   step1[26] = WRAPLOW(dct_const_round_shift(temp2));
1102   temp1 = (-step2[22] + step2[25]) * cospi_16_64;
1103   temp2 = (step2[22] + step2[25]) * cospi_16_64;
1104   step1[22] = WRAPLOW(dct_const_round_shift(temp1));
1105   step1[25] = WRAPLOW(dct_const_round_shift(temp2));
1106   temp1 = (-step2[23] + step2[24]) * cospi_16_64;
1107   temp2 = (step2[23] + step2[24]) * cospi_16_64;
1108   step1[23] = WRAPLOW(dct_const_round_shift(temp1));
1109   step1[24] = WRAPLOW(dct_const_round_shift(temp2));
1110   step1[28] = step2[28];
1111   step1[29] = step2[29];
1112   step1[30] = step2[30];
1113   step1[31] = step2[31];
1114 
1115   // final stage
1116   output[0] = WRAPLOW(step1[0] + step1[31]);
1117   output[1] = WRAPLOW(step1[1] + step1[30]);
1118   output[2] = WRAPLOW(step1[2] + step1[29]);
1119   output[3] = WRAPLOW(step1[3] + step1[28]);
1120   output[4] = WRAPLOW(step1[4] + step1[27]);
1121   output[5] = WRAPLOW(step1[5] + step1[26]);
1122   output[6] = WRAPLOW(step1[6] + step1[25]);
1123   output[7] = WRAPLOW(step1[7] + step1[24]);
1124   output[8] = WRAPLOW(step1[8] + step1[23]);
1125   output[9] = WRAPLOW(step1[9] + step1[22]);
1126   output[10] = WRAPLOW(step1[10] + step1[21]);
1127   output[11] = WRAPLOW(step1[11] + step1[20]);
1128   output[12] = WRAPLOW(step1[12] + step1[19]);
1129   output[13] = WRAPLOW(step1[13] + step1[18]);
1130   output[14] = WRAPLOW(step1[14] + step1[17]);
1131   output[15] = WRAPLOW(step1[15] + step1[16]);
1132   output[16] = WRAPLOW(step1[15] - step1[16]);
1133   output[17] = WRAPLOW(step1[14] - step1[17]);
1134   output[18] = WRAPLOW(step1[13] - step1[18]);
1135   output[19] = WRAPLOW(step1[12] - step1[19]);
1136   output[20] = WRAPLOW(step1[11] - step1[20]);
1137   output[21] = WRAPLOW(step1[10] - step1[21]);
1138   output[22] = WRAPLOW(step1[9] - step1[22]);
1139   output[23] = WRAPLOW(step1[8] - step1[23]);
1140   output[24] = WRAPLOW(step1[7] - step1[24]);
1141   output[25] = WRAPLOW(step1[6] - step1[25]);
1142   output[26] = WRAPLOW(step1[5] - step1[26]);
1143   output[27] = WRAPLOW(step1[4] - step1[27]);
1144   output[28] = WRAPLOW(step1[3] - step1[28]);
1145   output[29] = WRAPLOW(step1[2] - step1[29]);
1146   output[30] = WRAPLOW(step1[1] - step1[30]);
1147   output[31] = WRAPLOW(step1[0] - step1[31]);
1148 }
1149 
void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  // Full 32x32 inverse DCT: 1-D transform over rows, then over columns,
  // rounding by 2^6 (the two passes amplify by 64) before accumulating
  // into the prediction in `dest`.
  int i, j;
  tran_low_t out[32 * 32];
  tran_low_t *outptr = out;
  tran_low_t temp_in[32], temp_out[32];

  // Rows
  for (i = 0; i < 32; ++i) {
    // OR-reduce the 32 coefficients of the row to detect all-zero rows
    // cheaply.  The accumulator must be tran_low_t, not int16_t: in
    // high-bitdepth builds tran_low_t is 32 bits wide, and narrowing the
    // OR result would discard the upper bits, misclassifying a row whose
    // coefficients happen to have all-zero low 16 bits as empty.
    tran_low_t zero_coeff[16];
    for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
    for (j = 0; j < 8; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    for (j = 0; j < 4; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    for (j = 0; j < 2; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];

    // Skip the expensive 1-D transform for rows that are entirely zero.
    if (zero_coeff[0] | zero_coeff[1])
      idct32_c(input, outptr);
    else
      memset(outptr, 0, sizeof(tran_low_t) * 32);
    input += 32;
    outptr += 32;
  }

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
    idct32_c(temp_in, temp_out);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}
1186 
void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  // Inverse 32x32 transform for blocks whose non-zero coefficients all lie
  // in the upper-left 16x16 quadrant (eob <= 135).
  int row, col;
  tran_low_t intermediate[32 * 32] = { 0 };
  tran_low_t *row_dst = intermediate;
  tran_low_t col_in[32], col_out[32];

  // Row pass: only the first 16 rows can carry coefficients; the aggregate
  // initializer keeps the remaining rows of `intermediate` at zero.
  for (row = 0; row < 16; ++row) {
    idct32_c(input, row_dst);
    input += 32;
    row_dst += 32;
  }

  // Column pass over all 32 columns, then round by 2^6 and accumulate
  // into the prediction.
  for (col = 0; col < 32; ++col) {
    for (row = 0; row < 32; ++row)
      col_in[row] = intermediate[row * 32 + col];
    idct32_c(col_in, col_out);
    for (row = 0; row < 32; ++row) {
      dest[row * stride + col] = clip_pixel_add(
          dest[row * stride + col], ROUND_POWER_OF_TWO(col_out[row], 6));
    }
  }
}
1212 
void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  // Inverse 32x32 transform for blocks whose non-zero coefficients all lie
  // in the upper-left 8x8 corner (eob <= 34).
  int row, col;
  tran_low_t buffer[32 * 32] = { 0 };
  tran_low_t *row_dst = buffer;
  tran_low_t col_in[32], col_out[32];

  // Row pass: only the first 8 rows can be non-zero; the initializer keeps
  // the rest of the intermediate buffer zeroed.
  for (row = 0; row < 8; ++row) {
    idct32_c(input, row_dst);
    input += 32;
    row_dst += 32;
  }

  // Column pass, then rounding by 2^6 and accumulation into `dest`.
  for (col = 0; col < 32; ++col) {
    for (row = 0; row < 32; ++row) col_in[row] = buffer[row * 32 + col];
    idct32_c(col_in, col_out);
    for (row = 0; row < 32; ++row) {
      dest[row * stride + col] = clip_pixel_add(
          dest[row * stride + col], ROUND_POWER_OF_TWO(col_out[row], 6));
    }
  }
}
1238 
void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  // DC-only 32x32 path: the lone DC coefficient is scaled by cospi_16_64
  // twice (once per 1-D pass), rounded by 2^6, and the resulting constant
  // is added to every pixel of the block.
  int r, c;
  tran_high_t dc;
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));

  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  dc = ROUND_POWER_OF_TWO(out, 6);

  for (r = 0; r < 32; ++r) {
    for (c = 0; c < 32; ++c) dest[c] = clip_pixel_add(dest[c], dc);
    dest += stride;
  }
}
1252 
1253 #if CONFIG_VP9_HIGHBITDEPTH
1254 
1255 // 12 signal input bits + 7 2D forward transform amplify bits + 5 1D inverse
1256 // transform amplify bits + 1 bit for contingency in rounding and quantizing
1257 #define HIGHBD_VALID_TXFM_MAGNITUDE_RANGE (1 << 25)
1258 
detect_invalid_highbd_input(const tran_low_t * input,int size)1259 static INLINE int detect_invalid_highbd_input(const tran_low_t *input,
1260                                               int size) {
1261   int i;
1262   for (i = 0; i < size; ++i)
1263     if (abs(input[i]) >= HIGHBD_VALID_TXFM_MAGNITUDE_RANGE) return 1;
1264   return 0;
1265 }
1266 
void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     0.5 shifts per pixel. */
  // High-bitdepth variant: works on 16-bit pixels and wraps/clips
  // intermediate values using the bd-bit macros.
  int i;
  tran_low_t output[16];  // intermediate buffer holding the row-pass result
  tran_high_t a1, b1, c1, d1, e1;
  const tran_low_t *ip = input;
  tran_low_t *op = output;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // First pass (rows): undo the encoder's UNIT_QUANT_SHIFT pre-scaling,
  // then apply the reversible butterfly.  The statement order below is
  // significant — each step reuses values updated by earlier steps.
  for (i = 0; i < 4; i++) {
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    op[0] = HIGHBD_WRAPLOW(a1, bd);
    op[1] = HIGHBD_WRAPLOW(b1, bd);
    op[2] = HIGHBD_WRAPLOW(c1, bd);
    op[3] = HIGHBD_WRAPLOW(d1, bd);
    ip += 4;
    op += 4;
  }

  // Second pass (columns): same butterfly over the intermediate, with the
  // result clipped into [0, 2^bd - 1] and accumulated into `dest`.
  ip = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[4 * 0];
    c1 = ip[4 * 1];
    d1 = ip[4 * 2];
    b1 = ip[4 * 3];
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    dest[stride * 0] =
        highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a1, bd), bd);
    dest[stride * 1] =
        highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b1, bd), bd);
    dest[stride * 2] =
        highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c1, bd), bd);
    dest[stride * 3] =
        highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d1, bd), bd);

    ip++;
    dest++;
  }
}
1324 
void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
                                int stride, int bd) {
  // DC-only inverse Walsh-Hadamard: expands the single DC coefficient into
  // a 4x4 update that is added to the destination block.
  int i;
  tran_high_t a1, e1;
  tran_low_t tmp[4];
  const tran_low_t *ip = in;
  tran_low_t *op = tmp;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  // NOTE(review): bd appears in the macro calls below, but in some build
  // configurations those macros ignore it; this cast silences the
  // unused-parameter warning there — confirm before removing.
  (void)bd;

  // Row pass collapses to splitting the de-scaled DC value.
  a1 = ip[0] >> UNIT_QUANT_SHIFT;
  e1 = a1 >> 1;
  a1 -= e1;
  op[0] = HIGHBD_WRAPLOW(a1, bd);
  op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd);

  // Column pass: split each intermediate value again and accumulate the
  // clipped results into the four destination rows.
  ip = tmp;
  for (i = 0; i < 4; i++) {
    e1 = ip[0] >> 1;
    a1 = ip[0] - e1;
    dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
    dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], e1, bd);
    dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], e1, bd);
    dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], e1, bd);
    ip++;
    dest++;
  }
}
1353 
void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
  // 4-point inverse DCT, high-bitdepth variant.  Out-of-range input zeroes
  // the output (and asserts in range-checking builds).
  tran_low_t s[4];
  tran_high_t even0, even1, odd0, odd1;
  (void)bd;

  if (detect_invalid_highbd_input(input, 4)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 4);
    return;
  }

  // stage 1: butterflies on the even (0,2) and odd (1,3) coefficients.
  even0 = (input[0] + input[2]) * cospi_16_64;
  even1 = (input[0] - input[2]) * cospi_16_64;
  s[0] = HIGHBD_WRAPLOW(dct_const_round_shift(even0), bd);
  s[1] = HIGHBD_WRAPLOW(dct_const_round_shift(even1), bd);
  odd0 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  odd1 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  s[2] = HIGHBD_WRAPLOW(dct_const_round_shift(odd0), bd);
  s[3] = HIGHBD_WRAPLOW(dct_const_round_shift(odd1), bd);

  // stage 2: recombine the halves.
  output[0] = HIGHBD_WRAPLOW(s[0] + s[3], bd);
  output[1] = HIGHBD_WRAPLOW(s[1] + s[2], bd);
  output[2] = HIGHBD_WRAPLOW(s[1] - s[2], bd);
  output[3] = HIGHBD_WRAPLOW(s[0] - s[3], bd);
}
1383 
void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
  // Full 4x4 inverse DCT (high bitdepth): row pass, column pass, then round
  // by 2^4 and accumulate into the 16-bit destination.
  int row, col;
  tran_low_t intermediate[4 * 4];
  tran_low_t *row_dst = intermediate;
  tran_low_t col_in[4], col_out[4];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Rows
  for (row = 0; row < 4; ++row) {
    vpx_highbd_idct4_c(input, row_dst, bd);
    input += 4;
    row_dst += 4;
  }

  // Columns
  for (col = 0; col < 4; ++col) {
    for (row = 0; row < 4; ++row)
      col_in[row] = intermediate[row * 4 + col];
    vpx_highbd_idct4_c(col_in, col_out, bd);
    for (row = 0; row < 4; ++row) {
      dest[row * stride + col] = highbd_clip_pixel_add(
          dest[row * stride + col], ROUND_POWER_OF_TWO(col_out[row], 4), bd);
    }
  }
}
1409 
void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                int stride, int bd) {
  // DC-only 4x4 path: scale the DC coefficient by cospi_16_64 twice (once
  // per 1-D pass), round by 2^4, and add the constant to every pixel.
  int r, c;
  tran_high_t dc;
  tran_low_t out =
      HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
  dc = ROUND_POWER_OF_TWO(out, 4);

  for (r = 0; r < 4; r++) {
    for (c = 0; c < 4; c++) dest[c] = highbd_clip_pixel_add(dest[c], dc, bd);
    dest += stride;
  }
}
1429 
void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
  // 8-point inverse DCT, high-bitdepth variant.  Out-of-range coefficients
  // zero the output (assert in range-checking builds) instead of
  // propagating garbage values.
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;

  if (detect_invalid_highbd_input(input, 8)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 8);
    return;
  }

  // stage 1: reorder the even coefficients for the 4-point sub-transform
  // and rotate the odd coefficients.
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  // stage 2 & stage 3 - even half
  // In-place: rewrites step1[0..3] while step1[4..7] hold the odd half.
  vpx_highbd_idct4_c(step1, step1, bd);

  // stage 2 - odd half
  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);

  // stage 3 - odd half
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step1[7] = step2[7];

  // stage 4: combine even and odd halves into the final output.
  output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
  output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
  output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
  output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
  output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
  output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
  output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
  output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
}
1483 
void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
  // Full 8x8 inverse DCT (high bitdepth): row pass, column pass, then round
  // by 2^5 and accumulate into the 16-bit destination.
  int row, col;
  tran_low_t intermediate[8 * 8];
  tran_low_t *row_dst = intermediate;
  tran_low_t col_in[8], col_out[8];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // First transform rows
  for (row = 0; row < 8; ++row) {
    vpx_highbd_idct8_c(input, row_dst, bd);
    input += 8;
    row_dst += 8;
  }

  // Then transform columns
  for (col = 0; col < 8; ++col) {
    for (row = 0; row < 8; ++row)
      col_in[row] = intermediate[row * 8 + col];
    vpx_highbd_idct8_c(col_in, col_out, bd);
    for (row = 0; row < 8; ++row) {
      dest[row * stride + col] = highbd_clip_pixel_add(
          dest[row * stride + col], ROUND_POWER_OF_TWO(col_out[row], 5), bd);
    }
  }
}
1509 
void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                int stride, int bd) {
  // DC-only 8x8 path: the lone DC coefficient becomes a constant offset
  // (scaled twice by cospi_16_64, rounded by 2^5) added to all 64 pixels.
  int r, c;
  tran_high_t dc;
  tran_low_t out =
      HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
  dc = ROUND_POWER_OF_TWO(out, 5);
  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c) dest[c] = highbd_clip_pixel_add(dest[c], dc, bd);
    dest += stride;
  }
}
1525 
void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
  // 4-point inverse ADST, high-bitdepth variant.
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
  tran_low_t x0 = input[0];
  tran_low_t x1 = input[1];
  tran_low_t x2 = input[2];
  tran_low_t x3 = input[3];
  // NOTE(review): bd appears in the macros below, but some configurations
  // expand them without using bd; this cast covers those builds.
  (void)bd;

  // Out-of-range coefficients: zero the output (assert in range-checking
  // builds) instead of computing on invalid data.
  if (detect_invalid_highbd_input(input, 4)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 4);
    return;
  }

  // All-zero input shortcut.
  if (!(x0 | x1 | x2 | x3)) {
    memset(output, 0, 4 * sizeof(*output));
    return;
  }

  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd);

  s0 = s0 + s3 + s5;
  s1 = s1 - s4 - s6;
  s3 = s2;
  s2 = sinpi_3_9 * s7;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s3), bd);
  output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s3), bd);
  output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
  output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd);
}
1570 
void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
  // 8-point inverse ADST, high-bitdepth variant.  Note the permuted input
  // order below (x0 = input[7], x1 = input[0], ...), which is part of the
  // transform definition.
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
  tran_low_t x0 = input[7];
  tran_low_t x1 = input[0];
  tran_low_t x2 = input[5];
  tran_low_t x3 = input[2];
  tran_low_t x4 = input[3];
  tran_low_t x5 = input[4];
  tran_low_t x6 = input[1];
  tran_low_t x7 = input[6];
  // NOTE(review): bd appears in the macros below, but some configurations
  // expand them without using bd; this cast covers those builds.
  (void)bd;

  // Out-of-range coefficients: zero the output (assert in range-checking
  // builds) instead of computing on invalid data.
  if (detect_invalid_highbd_input(input, 8)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 8);
    return;
  }

  // All-zero input shortcut.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    memset(output, 0, 8 * sizeof(*output));
    return;
  }

  // stage 1
  s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
  s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
  s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
  s7 = cospi_6_64 * x6 - cospi_26_64 * x7;

  x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd);
  x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd);
  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s6), bd);
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s7), bd);
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s4), bd);
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s5), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s7), bd);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
  s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
  s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
  s7 = cospi_8_64 * x6 + cospi_24_64 * x7;

  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);

  // stage 3
  s2 = cospi_16_64 * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (x6 - x7);

  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);

  // Final output permutation with alternating sign flips.
  output[0] = HIGHBD_WRAPLOW(x0, bd);
  output[1] = HIGHBD_WRAPLOW(-x4, bd);
  output[2] = HIGHBD_WRAPLOW(x6, bd);
  output[3] = HIGHBD_WRAPLOW(-x2, bd);
  output[4] = HIGHBD_WRAPLOW(x3, bd);
  output[5] = HIGHBD_WRAPLOW(-x7, bd);
  output[6] = HIGHBD_WRAPLOW(x5, bd);
  output[7] = HIGHBD_WRAPLOW(-x1, bd);
}
1654 
void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
  int row, col;
  // Rows 4..7 are never written by the row pass, so keep them zero.
  tran_low_t buffer[8 * 8] = { 0 };
  tran_low_t col_in[8], col_out[8];
  uint16_t *dst = CONVERT_TO_SHORTPTR(dest8);

  // Row pass: only the first 4 rows of coefficients are non-zero for the
  // "12 coefficients" entry point, so transform just those rows.
  for (row = 0; row < 4; ++row) {
    vpx_highbd_idct8_c(input + 8 * row, buffer + 8 * row, bd);
  }

  // Column pass: transform every column, then round (>> 5 with rounding),
  // clamp to the bit depth and accumulate into the destination.
  for (col = 0; col < 8; ++col) {
    for (row = 0; row < 8; ++row) col_in[row] = buffer[row * 8 + col];
    vpx_highbd_idct8_c(col_in, col_out, bd);
    for (row = 0; row < 8; ++row) {
      dst[row * stride + col] = highbd_clip_pixel_add(
          dst[row * stride + col], ROUND_POWER_OF_TWO(col_out[row], 5), bd);
    }
  }
}
1681 
/* 16-point inverse DCT, high-bitdepth C reference.
 * Computes one 1-D transform of |input| into |output| through seven
 * butterfly stages.  Every multiply result is brought back to coefficient
 * range with dct_const_round_shift() + HIGHBD_WRAPLOW(); statement order is
 * part of the bit-exact contract with the SIMD implementations, so do not
 * reorder. */
void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[16], step2[16];
  tran_high_t temp1, temp2;
  (void)bd;

  /* Bail out (all-zero output) on out-of-range coefficients; assert in
   * range-checking builds. */
  if (detect_invalid_highbd_input(input, 16)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 16);
    return;
  }

  // stage 1
  /* Load in the interleaved (even/odd split) order.  The indices are
   * written as 32-point positions divided by 2 to mirror the idct32
   * load pattern. */
  step1[0] = input[0 / 2];
  step1[1] = input[16 / 2];
  step1[2] = input[8 / 2];
  step1[3] = input[24 / 2];
  step1[4] = input[4 / 2];
  step1[5] = input[20 / 2];
  step1[6] = input[12 / 2];
  step1[7] = input[28 / 2];
  step1[8] = input[2 / 2];
  step1[9] = input[18 / 2];
  step1[10] = input[10 / 2];
  step1[11] = input[26 / 2];
  step1[12] = input[6 / 2];
  step1[13] = input[22 / 2];
  step1[14] = input[14 / 2];
  step1[15] = input[30 / 2];

  // stage 2
  /* Pass the low half through; rotate the 8..15 pairs by the odd cospi
   * constants. */
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  // stage 3
  /* Rotate the 4..7 pairs; butterfly (add/sub) the 8..15 half. */
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
  step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
  step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
  step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
  step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
  step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
  step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
  step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);

  // stage 4
  /* 4-point core on 0..3; butterflies on 4..7; rotations on 9/14 and
   * 10/13. */
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5
  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
  step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
  step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
  step1[4] = step2[4];
  /* 5/6 pair gets the equal-angle (cospi_16) rotation. */
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step1[7] = step2[7];

  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
  step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
  step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
  step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
  step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
  step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
  step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
  step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);

  // stage 6
  step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
  step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
  step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
  step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
  step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
  step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
  step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
  step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  /* Final butterfly: element k combines with element 15-k. */
  output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
  output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
  output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
  output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
  output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
  output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
  output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
  output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
  output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
  output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
  output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
  output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
  output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
  output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
  output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
  output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
}
1855 
void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
                                    int stride, int bd) {
  int r, c;
  tran_low_t buffer[16 * 16];
  tran_low_t col_in[16], col_out[16];
  uint16_t *dst = CONVERT_TO_SHORTPTR(dest8);

  // Row pass: one 16-point inverse transform per coefficient row.
  for (r = 0; r < 16; ++r) {
    vpx_highbd_idct16_c(input + 16 * r, buffer + 16 * r, bd);
  }

  // Column pass: transform each column, then round (>> 6 with rounding),
  // clamp to the bit depth and accumulate into the destination.
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r) col_in[r] = buffer[r * 16 + c];
    vpx_highbd_idct16_c(col_in, col_out, bd);
    for (r = 0; r < 16; ++r) {
      dst[r * stride + c] = highbd_clip_pixel_add(
          dst[r * stride + c], ROUND_POWER_OF_TWO(col_out[r], 6), bd);
    }
  }
}
1881 
/* 16-point inverse ADST (asymmetric DST), high-bitdepth C reference.
 * Four butterfly stages over sixteen inputs loaded in the ADST's
 * interleaved order; fixed-point products are normalized with
 * dct_const_round_shift() + HIGHBD_WRAPLOW().  Statement order is
 * bit-exactness critical — do not reorder. */
void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;
  /* ADST input permutation: even x's read from the end of the input,
   * odd x's from the start. */
  tran_low_t x0 = input[15];
  tran_low_t x1 = input[0];
  tran_low_t x2 = input[13];
  tran_low_t x3 = input[2];
  tran_low_t x4 = input[11];
  tran_low_t x5 = input[4];
  tran_low_t x6 = input[9];
  tran_low_t x7 = input[6];
  tran_low_t x8 = input[7];
  tran_low_t x9 = input[8];
  tran_low_t x10 = input[5];
  tran_low_t x11 = input[10];
  tran_low_t x12 = input[3];
  tran_low_t x13 = input[12];
  tran_low_t x14 = input[1];
  tran_low_t x15 = input[14];
  (void)bd;

  /* Bail out (all-zero output) on out-of-range coefficients; assert in
   * range-checking builds. */
  if (detect_invalid_highbd_input(input, 16)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 16);
    return;
  }

  /* Fast path: all-zero input produces all-zero output. */
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
        x13 | x14 | x15)) {
    memset(output, 0, 16 * sizeof(*output));
    return;
  }

  // stage 1
  /* Rotate each (x2k, x2k+1) pair by the odd cospi angles. */
  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;

  /* Butterfly across the two halves, normalizing the products. */
  x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd);
  x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd);
  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd);
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd);
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd);
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd);
  x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd);
  x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd);
  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd);
  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd);
  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd);
  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd);
  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd);
  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd);

  // stage 2
  /* Low half passes through; upper half rotated by cospi_4/28 and
   * cospi_20/12. */
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

  /* Pass-through terms need no rounding shift; rotated terms do. */
  x0 = HIGHBD_WRAPLOW(s0 + s4, bd);
  x1 = HIGHBD_WRAPLOW(s1 + s5, bd);
  x2 = HIGHBD_WRAPLOW(s2 + s6, bd);
  x3 = HIGHBD_WRAPLOW(s3 + s7, bd);
  x4 = HIGHBD_WRAPLOW(s0 - s4, bd);
  x5 = HIGHBD_WRAPLOW(s1 - s5, bd);
  x6 = HIGHBD_WRAPLOW(s2 - s6, bd);
  x7 = HIGHBD_WRAPLOW(s3 - s7, bd);
  x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd);
  x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd);
  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd);
  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd);
  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd);
  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd);
  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd);
  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd);

  // stage 3
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
  x8 = HIGHBD_WRAPLOW(s8 + s10, bd);
  x9 = HIGHBD_WRAPLOW(s9 + s11, bd);
  x10 = HIGHBD_WRAPLOW(s8 - s10, bd);
  x11 = HIGHBD_WRAPLOW(s9 - s11, bd);
  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd);
  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd);
  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd);
  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd);

  // stage 4
  /* Final equal-angle (cospi_16) rotations on the remaining pairs;
   * note the sign asymmetries between the s2/s14 and s6/s10 groups. */
  s2 = (-cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (-x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (-x10 + x11);
  s14 = (-cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd);
  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd);
  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd);
  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd);

  /* Output permutation with sign flips on selected terms. */
  output[0] = HIGHBD_WRAPLOW(x0, bd);
  output[1] = HIGHBD_WRAPLOW(-x8, bd);
  output[2] = HIGHBD_WRAPLOW(x12, bd);
  output[3] = HIGHBD_WRAPLOW(-x4, bd);
  output[4] = HIGHBD_WRAPLOW(x6, bd);
  output[5] = HIGHBD_WRAPLOW(x14, bd);
  output[6] = HIGHBD_WRAPLOW(x10, bd);
  output[7] = HIGHBD_WRAPLOW(x2, bd);
  output[8] = HIGHBD_WRAPLOW(x3, bd);
  output[9] = HIGHBD_WRAPLOW(x11, bd);
  output[10] = HIGHBD_WRAPLOW(x15, bd);
  output[11] = HIGHBD_WRAPLOW(x7, bd);
  output[12] = HIGHBD_WRAPLOW(x5, bd);
  output[13] = HIGHBD_WRAPLOW(-x13, bd);
  output[14] = HIGHBD_WRAPLOW(x9, bd);
  output[15] = HIGHBD_WRAPLOW(-x1, bd);
}
2058 
void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
                                   int stride, int bd) {
  int r, c;
  // Rows 4..15 are never written by the row pass, so keep them zero.
  tran_low_t buffer[16 * 16] = { 0 };
  tran_low_t col_in[16], col_out[16];
  uint16_t *dst = CONVERT_TO_SHORTPTR(dest8);

  // Row pass.  With at most 10 non-zero coefficients they all live in the
  // upper-left 4x4 region, so only the first 4 rows need transforming.
  for (r = 0; r < 4; ++r) {
    vpx_highbd_idct16_c(input + 16 * r, buffer + 16 * r, bd);
  }

  // Column pass: transform each column, then round (>> 6 with rounding),
  // clamp to the bit depth and accumulate into the destination.
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r) col_in[r] = buffer[r * 16 + c];
    vpx_highbd_idct16_c(col_in, col_out, bd);
    for (r = 0; r < 16; ++r) {
      dst[r * stride + c] = highbd_clip_pixel_add(
          dst[r * stride + c], ROUND_POWER_OF_TWO(col_out[r], 6), bd);
    }
  }
}
2085 
void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                  int stride, int bd) {
  int r, c;
  tran_high_t dc;
  // DC-only shortcut: two cospi_16 scalings of input[0] reproduce the
  // row+column transform of a DC-only block.
  tran_low_t out =
      HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dest8);

  out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
  dc = ROUND_POWER_OF_TWO(out, 6);
  // Add the single rounded DC value to every pixel of the 16x16 block.
  for (r = 0; r < 16; ++r, dst += stride) {
    for (c = 0; c < 16; ++c) {
      dst[c] = highbd_clip_pixel_add(dst[c], dc, bd);
    }
  }
}
2101 
highbd_idct32_c(const tran_low_t * input,tran_low_t * output,int bd)2102 static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
2103                             int bd) {
2104   tran_low_t step1[32], step2[32];
2105   tran_high_t temp1, temp2;
2106   (void)bd;
2107 
2108   if (detect_invalid_highbd_input(input, 32)) {
2109 #if CONFIG_COEFFICIENT_RANGE_CHECKING
2110     assert(0 && "invalid highbd txfm input");
2111 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
2112     memset(output, 0, sizeof(*output) * 32);
2113     return;
2114   }
2115 
2116   // stage 1
2117   step1[0] = input[0];
2118   step1[1] = input[16];
2119   step1[2] = input[8];
2120   step1[3] = input[24];
2121   step1[4] = input[4];
2122   step1[5] = input[20];
2123   step1[6] = input[12];
2124   step1[7] = input[28];
2125   step1[8] = input[2];
2126   step1[9] = input[18];
2127   step1[10] = input[10];
2128   step1[11] = input[26];
2129   step1[12] = input[6];
2130   step1[13] = input[22];
2131   step1[14] = input[14];
2132   step1[15] = input[30];
2133 
2134   temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
2135   temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
2136   step1[16] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2137   step1[31] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2138 
2139   temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
2140   temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
2141   step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2142   step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2143 
2144   temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
2145   temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
2146   step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2147   step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2148 
2149   temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
2150   temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
2151   step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2152   step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2153 
2154   temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
2155   temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
2156   step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2157   step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2158 
2159   temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
2160   temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
2161   step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2162   step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2163 
2164   temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
2165   temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
2166   step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2167   step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2168 
2169   temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
2170   temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
2171   step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2172   step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2173 
2174   // stage 2
2175   step2[0] = step1[0];
2176   step2[1] = step1[1];
2177   step2[2] = step1[2];
2178   step2[3] = step1[3];
2179   step2[4] = step1[4];
2180   step2[5] = step1[5];
2181   step2[6] = step1[6];
2182   step2[7] = step1[7];
2183 
2184   temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
2185   temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
2186   step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2187   step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2188 
2189   temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
2190   temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
2191   step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2192   step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2193 
2194   temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
2195   temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
2196   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2197   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2198 
2199   temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
2200   temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
2201   step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2202   step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2203 
2204   step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd);
2205   step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd);
2206   step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd);
2207   step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd);
2208   step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd);
2209   step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd);
2210   step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd);
2211   step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd);
2212   step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd);
2213   step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd);
2214   step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd);
2215   step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd);
2216   step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd);
2217   step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd);
2218   step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd);
2219   step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd);
2220 
2221   // stage 3
2222   step1[0] = step2[0];
2223   step1[1] = step2[1];
2224   step1[2] = step2[2];
2225   step1[3] = step2[3];
2226 
2227   temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
2228   temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
2229   step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2230   step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2231   temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
2232   temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
2233   step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2234   step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2235 
2236   step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
2237   step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
2238   step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
2239   step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
2240   step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
2241   step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
2242   step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
2243   step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
2244 
2245   step1[16] = step2[16];
2246   step1[31] = step2[31];
2247   temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
2248   temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
2249   step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2250   step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2251   temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
2252   temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
2253   step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2254   step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2255   step1[19] = step2[19];
2256   step1[20] = step2[20];
2257   temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
2258   temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
2259   step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2260   step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2261   temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
2262   temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
2263   step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2264   step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2265   step1[23] = step2[23];
2266   step1[24] = step2[24];
2267   step1[27] = step2[27];
2268   step1[28] = step2[28];
2269 
2270   // stage 4
2271   temp1 = (step1[0] + step1[1]) * cospi_16_64;
2272   temp2 = (step1[0] - step1[1]) * cospi_16_64;
2273   step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2274   step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2275   temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
2276   temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
2277   step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2278   step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2279   step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
2280   step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
2281   step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
2282   step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
2283 
2284   step2[8] = step1[8];
2285   step2[15] = step1[15];
2286   temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
2287   temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
2288   step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2289   step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2290   temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
2291   temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
2292   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2293   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2294   step2[11] = step1[11];
2295   step2[12] = step1[12];
2296 
2297   step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd);
2298   step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd);
2299   step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd);
2300   step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd);
2301   step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd);
2302   step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd);
2303   step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd);
2304   step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd);
2305 
2306   step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd);
2307   step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd);
2308   step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd);
2309   step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd);
2310   step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd);
2311   step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd);
2312   step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd);
2313   step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd);
2314 
2315   // stage 5
2316   step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
2317   step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
2318   step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
2319   step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
2320   step1[4] = step2[4];
2321   temp1 = (step2[6] - step2[5]) * cospi_16_64;
2322   temp2 = (step2[5] + step2[6]) * cospi_16_64;
2323   step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2324   step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2325   step1[7] = step2[7];
2326 
2327   step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
2328   step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
2329   step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
2330   step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
2331   step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
2332   step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
2333   step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
2334   step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
2335 
2336   step1[16] = step2[16];
2337   step1[17] = step2[17];
2338   temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
2339   temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
2340   step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2341   step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2342   temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
2343   temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
2344   step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2345   step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2346   temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
2347   temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
2348   step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2349   step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2350   temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
2351   temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
2352   step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2353   step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2354   step1[22] = step2[22];
2355   step1[23] = step2[23];
2356   step1[24] = step2[24];
2357   step1[25] = step2[25];
2358   step1[30] = step2[30];
2359   step1[31] = step2[31];
2360 
2361   // stage 6
2362   step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
2363   step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
2364   step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
2365   step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
2366   step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
2367   step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
2368   step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
2369   step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
2370   step2[8] = step1[8];
2371   step2[9] = step1[9];
2372   temp1 = (-step1[10] + step1[13]) * cospi_16_64;
2373   temp2 = (step1[10] + step1[13]) * cospi_16_64;
2374   step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2375   step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2376   temp1 = (-step1[11] + step1[12]) * cospi_16_64;
2377   temp2 = (step1[11] + step1[12]) * cospi_16_64;
2378   step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2379   step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2380   step2[14] = step1[14];
2381   step2[15] = step1[15];
2382 
2383   step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd);
2384   step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd);
2385   step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd);
2386   step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd);
2387   step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd);
2388   step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd);
2389   step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd);
2390   step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd);
2391 
2392   step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd);
2393   step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd);
2394   step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd);
2395   step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd);
2396   step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd);
2397   step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd);
2398   step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd);
2399   step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd);
2400 
2401   // stage 7
2402   step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
2403   step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
2404   step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
2405   step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
2406   step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
2407   step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
2408   step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
2409   step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
2410   step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
2411   step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
2412   step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
2413   step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
2414   step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
2415   step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
2416   step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
2417   step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
2418 
2419   step1[16] = step2[16];
2420   step1[17] = step2[17];
2421   step1[18] = step2[18];
2422   step1[19] = step2[19];
2423   temp1 = (-step2[20] + step2[27]) * cospi_16_64;
2424   temp2 = (step2[20] + step2[27]) * cospi_16_64;
2425   step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2426   step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2427   temp1 = (-step2[21] + step2[26]) * cospi_16_64;
2428   temp2 = (step2[21] + step2[26]) * cospi_16_64;
2429   step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2430   step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2431   temp1 = (-step2[22] + step2[25]) * cospi_16_64;
2432   temp2 = (step2[22] + step2[25]) * cospi_16_64;
2433   step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2434   step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2435   temp1 = (-step2[23] + step2[24]) * cospi_16_64;
2436   temp2 = (step2[23] + step2[24]) * cospi_16_64;
2437   step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2438   step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2439   step1[28] = step2[28];
2440   step1[29] = step2[29];
2441   step1[30] = step2[30];
2442   step1[31] = step2[31];
2443 
2444   // final stage
2445   output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd);
2446   output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd);
2447   output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd);
2448   output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd);
2449   output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd);
2450   output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd);
2451   output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd);
2452   output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd);
2453   output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd);
2454   output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd);
2455   output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd);
2456   output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd);
2457   output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd);
2458   output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd);
2459   output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd);
2460   output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd);
2461   output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd);
2462   output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd);
2463   output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd);
2464   output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd);
2465   output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd);
2466   output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd);
2467   output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd);
2468   output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd);
2469   output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd);
2470   output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd);
2471   output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd);
2472   output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd);
2473   output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd);
2474   output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd);
2475   output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd);
2476   output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd);
2477 }
2478 
vpx_highbd_idct32x32_1024_add_c(const tran_low_t * input,uint8_t * dest8,int stride,int bd)2479 void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
2480                                      int stride, int bd) {
2481   int i, j;
2482   tran_low_t out[32 * 32];
2483   tran_low_t *outptr = out;
2484   tran_low_t temp_in[32], temp_out[32];
2485   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2486 
2487   // Rows
2488   for (i = 0; i < 32; ++i) {
2489     tran_low_t zero_coeff[16];
2490     for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
2491     for (j = 0; j < 8; ++j)
2492       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
2493     for (j = 0; j < 4; ++j)
2494       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
2495     for (j = 0; j < 2; ++j)
2496       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
2497 
2498     if (zero_coeff[0] | zero_coeff[1])
2499       highbd_idct32_c(input, outptr, bd);
2500     else
2501       memset(outptr, 0, sizeof(tran_low_t) * 32);
2502     input += 32;
2503     outptr += 32;
2504   }
2505 
2506   // Columns
2507   for (i = 0; i < 32; ++i) {
2508     for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
2509     highbd_idct32_c(temp_in, temp_out, bd);
2510     for (j = 0; j < 32; ++j) {
2511       dest[j * stride + i] = highbd_clip_pixel_add(
2512           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2513     }
2514   }
2515 }
2516 
void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
                                   int stride, int bd) {
  int r, c;
  tran_low_t out[32 * 32] = { 0 };  // zero-filled: untouched rows stay zero
  tran_low_t col_in[32], col_out[32];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Pass 1 (rows): with at most 34 coefficients present, only the upper-left
  // 8x8 region can be non-zero, so only the first 8 rows need transforming.
  for (r = 0; r < 8; ++r) {
    highbd_idct32_c(input + r * 32, out + r * 32, bd);
  }

  // Pass 2 (columns): gather each column, inverse-transform it, then
  // round (by 2^6) and accumulate into the clipped destination pixels.
  for (c = 0; c < 32; ++c) {
    for (r = 0; r < 32; ++r) col_in[r] = out[r * 32 + c];
    highbd_idct32_c(col_in, col_out, bd);
    for (r = 0; r < 32; ++r) {
      dest[r * stride + c] = highbd_clip_pixel_add(
          dest[r * stride + c], ROUND_POWER_OF_TWO(col_out[r], 6), bd);
    }
  }
}
2543 
vpx_highbd_idct32x32_1_add_c(const tran_low_t * input,uint8_t * dest8,int stride,int bd)2544 void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
2545                                   int stride, int bd) {
2546   int i, j;
2547   int a1;
2548   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2549   tran_low_t out =
2550       HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
2551 
2552   out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
2553   a1 = ROUND_POWER_OF_TWO(out, 6);
2554 
2555   for (j = 0; j < 32; ++j) {
2556     for (i = 0; i < 32; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
2557     dest += stride;
2558   }
2559 }
2560 
2561 #endif  // CONFIG_VP9_HIGHBITDEPTH
2562