1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <math.h>
13 #include <string.h>
14 
15 #include "./aom_dsp_rtcd.h"
16 #include "aom_dsp/inv_txfm.h"
17 #if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8 || CONFIG_DAALA_DCT16 || \
18     CONFIG_DAALA_DCT32 || CONFIG_DAALA_DCT64
19 #include "av1/common/daala_tx.h"
20 #endif
21 
/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
   0.5 shifts per pixel. Row pass into an intermediate buffer, then a
   column pass that accumulates into the destination. */
void aom_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t intermediate[16];
  const tran_low_t *src = input;
  tran_low_t *dst = intermediate;
  tran_high_t a, b, c, d, e;
  int n;

  // Pass 1: inverse WHT on each row of the (pre-scaled) input.
  for (n = 0; n < 4; n++) {
    a = src[0] >> UNIT_QUANT_SHIFT;
    c = src[1] >> UNIT_QUANT_SHIFT;
    d = src[2] >> UNIT_QUANT_SHIFT;
    b = src[3] >> UNIT_QUANT_SHIFT;
    a += c;
    d -= b;
    e = (a - d) >> 1;
    b = e - b;
    c = e - c;
    a -= b;
    d += c;
    dst[0] = WRAPLOW(a);
    dst[1] = WRAPLOW(b);
    dst[2] = WRAPLOW(c);
    dst[3] = WRAPLOW(d);
    src += 4;
    dst += 4;
  }

  // Pass 2: same butterfly down each column, adding into the destination.
  src = intermediate;
  for (n = 0; n < 4; n++) {
    a = src[4 * 0];
    c = src[4 * 1];
    d = src[4 * 2];
    b = src[4 * 3];
    a += c;
    d -= b;
    e = (a - d) >> 1;
    b = e - b;
    c = e - c;
    a -= b;
    d += c;
    dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a));
    dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b));
    dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c));
    dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d));
    src++;
    dest++;
  }
}
73 
/* DC-only inverse WHT: expand the single coefficient into one row, then
   run the column pass over every column, adding into the destination. */
void aom_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
  tran_low_t row[4];
  tran_high_t dc, half;
  int col;

  dc = in[0] >> UNIT_QUANT_SHIFT;
  half = dc >> 1;
  dc -= half;
  row[0] = WRAPLOW(dc);
  row[1] = row[2] = row[3] = WRAPLOW(half);

  for (col = 0; col < 4; col++) {
    const tran_high_t e1 = row[col] >> 1;
    const tran_high_t a1 = row[col] - e1;
    dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1);
    dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1);
    dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1);
    dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1);
    dest++;
  }
}
99 
/* 4-point 1-D inverse DCT: a rotation on the even inputs, a rotation on
   the odd inputs, then a final butterfly. */
void aom_idct4_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t even0, even1, odd0, odd1;
  tran_high_t t0, t1;

  // Stage 1: even half.
  t0 = (input[0] + input[2]) * cospi_16_64;
  t1 = (input[0] - input[2]) * cospi_16_64;
  even0 = WRAPLOW(dct_const_round_shift(t0));
  even1 = WRAPLOW(dct_const_round_shift(t1));
  // Stage 1: odd half.
  t0 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  t1 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  odd0 = WRAPLOW(dct_const_round_shift(t0));
  odd1 = WRAPLOW(dct_const_round_shift(t1));

  // Stage 2: combine halves.
  output[0] = WRAPLOW(even0 + odd1);
  output[1] = WRAPLOW(even1 + odd0);
  output[2] = WRAPLOW(even1 - odd0);
  output[3] = WRAPLOW(even0 - odd1);
}
119 
/* Full 4x4 inverse DCT: row pass, column pass, final rounding by 1/16,
   residual added into the destination block. */
void aom_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t buf[4 * 4];
  tran_low_t col_in[4], col_out[4];
  int r, c;

  // Row pass: 1-D IDCT on each of the 4 input rows.
  for (r = 0; r < 4; ++r) aom_idct4_c(input + 4 * r, buf + 4 * r);

  // Column pass: 1-D IDCT per column, then round, shift and accumulate.
  for (c = 0; c < 4; ++c) {
    for (r = 0; r < 4; ++r) col_in[r] = buf[r * 4 + c];
    aom_idct4_c(col_in, col_out);
    for (r = 0; r < 4; ++r) {
      dest[r * stride + c] = clip_pixel_add(dest[r * stride + c],
                                            ROUND_POWER_OF_TWO(col_out[r], 4));
    }
  }
}
143 
/* DC-only 4x4 inverse DCT: the transform of a lone DC coefficient is a
   constant block, so compute that constant once and add it everywhere. */
void aom_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
                         int dest_stride) {
  int r, c;
  tran_high_t a1;
  tran_low_t dc = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
  dc = WRAPLOW(dct_const_round_shift(dc * cospi_16_64));
  a1 = ROUND_POWER_OF_TWO(dc, 4);

  if (a1 == 0) return;  // nothing to add

  for (r = 0; r < 4; r++) {
    for (c = 0; c < 4; c++) dest[c] = clip_pixel_add(dest[c], a1);
    dest += dest_stride;
  }
}
162 
/* 8-point 1-D inverse DCT: 4-stage butterfly network. Buffers `a` and `b`
   alternate as source/destination between stages. */
void aom_idct8_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t a[8], b[8];
  tran_high_t u, v;

  // Stage 1: load even-indexed inputs; rotate the odd-indexed inputs.
  a[0] = input[0];
  a[2] = input[4];
  a[1] = input[2];
  a[3] = input[6];
  u = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  v = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  a[4] = WRAPLOW(dct_const_round_shift(u));
  a[7] = WRAPLOW(dct_const_round_shift(v));
  u = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  v = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  a[5] = WRAPLOW(dct_const_round_shift(u));
  a[6] = WRAPLOW(dct_const_round_shift(v));

  // Stage 2: rotations on the even half; butterflies on the odd half.
  u = (a[0] + a[2]) * cospi_16_64;
  v = (a[0] - a[2]) * cospi_16_64;
  b[0] = WRAPLOW(dct_const_round_shift(u));
  b[1] = WRAPLOW(dct_const_round_shift(v));
  u = a[1] * cospi_24_64 - a[3] * cospi_8_64;
  v = a[1] * cospi_8_64 + a[3] * cospi_24_64;
  b[2] = WRAPLOW(dct_const_round_shift(u));
  b[3] = WRAPLOW(dct_const_round_shift(v));
  b[4] = WRAPLOW(a[4] + a[5]);
  b[5] = WRAPLOW(a[4] - a[5]);
  b[6] = WRAPLOW(-a[6] + a[7]);
  b[7] = WRAPLOW(a[6] + a[7]);

  // Stage 3: even-half butterflies; midpoint rotation on the odd half.
  a[0] = WRAPLOW(b[0] + b[3]);
  a[1] = WRAPLOW(b[1] + b[2]);
  a[2] = WRAPLOW(b[1] - b[2]);
  a[3] = WRAPLOW(b[0] - b[3]);
  a[4] = b[4];
  u = (b[6] - b[5]) * cospi_16_64;
  v = (b[5] + b[6]) * cospi_16_64;
  a[5] = WRAPLOW(dct_const_round_shift(u));
  a[6] = WRAPLOW(dct_const_round_shift(v));
  a[7] = b[7];

  // Stage 4: final butterflies produce the output samples.
  output[0] = WRAPLOW(a[0] + a[7]);
  output[1] = WRAPLOW(a[1] + a[6]);
  output[2] = WRAPLOW(a[2] + a[5]);
  output[3] = WRAPLOW(a[3] + a[4]);
  output[4] = WRAPLOW(a[3] - a[4]);
  output[5] = WRAPLOW(a[2] - a[5]);
  output[6] = WRAPLOW(a[1] - a[6]);
  output[7] = WRAPLOW(a[0] - a[7]);
}
216 
/* Full 8x8 inverse DCT: row pass, column pass, final rounding by 1/32,
   residual added into the destination block. */
void aom_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t buf[8 * 8];
  tran_low_t col_in[8], col_out[8];
  int r, c;

  // Row pass: 1-D IDCT on each of the 8 input rows.
  for (r = 0; r < 8; ++r) aom_idct8_c(input + 8 * r, buf + 8 * r);

  // Column pass: 1-D IDCT per column, then round, shift and accumulate.
  for (c = 0; c < 8; ++c) {
    for (r = 0; r < 8; ++r) col_in[r] = buf[r * 8 + c];
    aom_idct8_c(col_in, col_out);
    for (r = 0; r < 8; ++r) {
      dest[r * stride + c] = clip_pixel_add(dest[r * stride + c],
                                            ROUND_POWER_OF_TWO(col_out[r], 5));
    }
  }
}
240 
/* DC-only 8x8 inverse DCT: add a single constant to every pixel. */
void aom_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int r, c;
  tran_high_t a1;
  tran_low_t dc = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
  dc = WRAPLOW(dct_const_round_shift(dc * cospi_16_64));
  a1 = ROUND_POWER_OF_TWO(dc, 5);
  if (a1 == 0) return;  // nothing to add
  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c) dest[c] = clip_pixel_add(dest[c], a1);
    dest += stride;
  }
}
253 
/* 4-point 1-D inverse ADST using the sinpi_x_9 constant set. */
void aom_iadst4_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t c0, c1, c2, c3, c4, c5, c6, c7;
  int k;

  const tran_low_t x0 = input[0];
  const tran_low_t x1 = input[1];
  const tran_low_t x2 = input[2];
  const tran_low_t x3 = input[3];

  // Fast path: an all-zero input yields an all-zero output.
  if (!(x0 | x1 | x2 | x3)) {
    for (k = 0; k < 4; ++k) output[k] = 0;
    return;
  }

  c0 = sinpi_1_9 * x0;
  c1 = sinpi_2_9 * x0;
  c2 = sinpi_3_9 * x1;
  c3 = sinpi_4_9 * x2;
  c4 = sinpi_1_9 * x2;
  c5 = sinpi_2_9 * x3;
  c6 = sinpi_4_9 * x3;
  c7 = WRAPLOW(x0 - x2 + x3);

  c0 = c0 + c3 + c5;
  c1 = c1 - c4 - c6;
  c3 = c2;
  c2 = sinpi_3_9 * c7;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b, so the output bit depth is 15b.
  output[0] = WRAPLOW(dct_const_round_shift(c0 + c3));
  output[1] = WRAPLOW(dct_const_round_shift(c1 + c3));
  output[2] = WRAPLOW(dct_const_round_shift(c2));
  output[3] = WRAPLOW(dct_const_round_shift(c0 + c1 - c3));
}
290 
/* 8-point 1-D inverse ADST. The input is read in the permuted order the
   forward transform emits; the output carries the alternating sign pattern
   of the ADST basis. */
void aom_iadst8_c(const tran_low_t *input, tran_low_t *output) {
  // NOTE(review): stage products are narrowed to int before the rounding
  // shift — presumably fine for the coefficient range of this (8-bit)
  // path; confirm against the high-bitdepth variant.
  int s0, s1, s2, s3, s4, s5, s6, s7;

  tran_high_t x0 = input[7];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[5];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[3];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[1];
  tran_high_t x7 = input[6];

  // Fast path: an all-zero input block yields an all-zero output.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
        output[6] = output[7] = 0;
    return;
  }

  // stage 1: rotations pairing each input with its mirror.
  s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
  s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
  s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
  s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
  s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
  s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
  s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
  s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);

  x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
  x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
  x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
  x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
  x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
  x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
  x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s3 - s7));

  // stage 2: pass the first half through; rotate the second half.
  s0 = (int)x0;
  s1 = (int)x1;
  s2 = (int)x2;
  s3 = (int)x3;
  s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
  s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
  s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
  s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);

  x0 = WRAPLOW(s0 + s2);
  x1 = WRAPLOW(s1 + s3);
  x2 = WRAPLOW(s0 - s2);
  x3 = WRAPLOW(s1 - s3);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));

  // stage 3: final pi/4 rotations on the remaining pairs.
  s2 = (int)(cospi_16_64 * (x2 + x3));
  s3 = (int)(cospi_16_64 * (x2 - x3));
  s6 = (int)(cospi_16_64 * (x6 + x7));
  s7 = (int)(cospi_16_64 * (x6 - x7));

  x2 = WRAPLOW(dct_const_round_shift(s2));
  x3 = WRAPLOW(dct_const_round_shift(s3));
  x6 = WRAPLOW(dct_const_round_shift(s6));
  x7 = WRAPLOW(dct_const_round_shift(s7));

  // Write out in permuted order with alternating signs.
  output[0] = WRAPLOW(x0);
  output[1] = WRAPLOW(-x4);
  output[2] = WRAPLOW(x6);
  output[3] = WRAPLOW(-x2);
  output[4] = WRAPLOW(x3);
  output[5] = WRAPLOW(-x7);
  output[6] = WRAPLOW(x5);
  output[7] = WRAPLOW(-x1);
}
367 
/* 8x8 inverse DCT for blocks whose non-zero coefficients all lie in the
   first 4 rows (eob <= 12): the row pass only needs those 4 rows. */
void aom_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t buf[8 * 8] = { 0 };
  tran_low_t col_in[8], col_out[8];
  int r, c;

  // Row pass: only the first 4 rows carry non-zero coefficients; the
  // remaining rows of `buf` stay zero.
  for (r = 0; r < 4; ++r) aom_idct8_c(input + 8 * r, buf + 8 * r);

  // Column pass over all 8 columns, then round, shift and accumulate.
  for (c = 0; c < 8; ++c) {
    for (r = 0; r < 8; ++r) col_in[r] = buf[r * 8 + c];
    aom_idct8_c(col_in, col_out);
    for (r = 0; r < 8; ++r) {
      dest[r * stride + c] = clip_pixel_add(dest[r * stride + c],
                                            ROUND_POWER_OF_TWO(col_out[r], 5));
    }
  }
}
392 
/* 16-point 1-D inverse DCT implemented as a 7-stage butterfly network.
   `step1` and `step2` alternate as source/destination between stages.
   Called once per row and once per column by the 16x16 wrappers. */
void aom_idct16_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[16], step2[16];
  tran_high_t temp1, temp2;

  // stage 1: bit-reversal-style load. Indices are written as n/2,
  // presumably to mirror the 32-point ordering this was derived from —
  // NOTE(review): confirm against the 32-point table.
  step1[0] = input[0 / 2];
  step1[1] = input[16 / 2];
  step1[2] = input[8 / 2];
  step1[3] = input[24 / 2];
  step1[4] = input[4 / 2];
  step1[5] = input[20 / 2];
  step1[6] = input[12 / 2];
  step1[7] = input[28 / 2];
  step1[8] = input[2 / 2];
  step1[9] = input[18 / 2];
  step1[10] = input[10 / 2];
  step1[11] = input[26 / 2];
  step1[12] = input[6 / 2];
  step1[13] = input[22 / 2];
  step1[14] = input[14 / 2];
  step1[15] = input[30 / 2];

  // stage 2: pass the low half through; rotate the high half in pairs.
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
  step2[15] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 3: rotations on [4..7]; butterflies on [8..15].
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  step1[8] = WRAPLOW(step2[8] + step2[9]);
  step1[9] = WRAPLOW(step2[8] - step2[9]);
  step1[10] = WRAPLOW(-step2[10] + step2[11]);
  step1[11] = WRAPLOW(step2[10] + step2[11]);
  step1[12] = WRAPLOW(step2[12] + step2[13]);
  step1[13] = WRAPLOW(step2[12] - step2[13]);
  step1[14] = WRAPLOW(-step2[14] + step2[15]);
  step1[15] = WRAPLOW(step2[14] + step2[15]);

  // stage 4: rotations on [0..3]; butterflies on [4..7]; mixed on [8..15].
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5
  step1[0] = WRAPLOW(step2[0] + step2[3]);
  step1[1] = WRAPLOW(step2[1] + step2[2]);
  step1[2] = WRAPLOW(step2[1] - step2[2]);
  step1[3] = WRAPLOW(step2[0] - step2[3]);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11]);
  step1[9] = WRAPLOW(step2[9] + step2[10]);
  step1[10] = WRAPLOW(step2[9] - step2[10]);
  step1[11] = WRAPLOW(step2[8] - step2[11]);
  step1[12] = WRAPLOW(-step2[12] + step2[15]);
  step1[13] = WRAPLOW(-step2[13] + step2[14]);
  step1[14] = WRAPLOW(step2[13] + step2[14]);
  step1[15] = WRAPLOW(step2[12] + step2[15]);

  // stage 6
  step2[0] = WRAPLOW(step1[0] + step1[7]);
  step2[1] = WRAPLOW(step1[1] + step1[6]);
  step2[2] = WRAPLOW(step1[2] + step1[5]);
  step2[3] = WRAPLOW(step1[3] + step1[4]);
  step2[4] = WRAPLOW(step1[3] - step1[4]);
  step2[5] = WRAPLOW(step1[2] - step1[5]);
  step2[6] = WRAPLOW(step1[1] - step1[6]);
  step2[7] = WRAPLOW(step1[0] - step1[7]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7: final butterflies produce the output samples.
  output[0] = WRAPLOW(step2[0] + step2[15]);
  output[1] = WRAPLOW(step2[1] + step2[14]);
  output[2] = WRAPLOW(step2[2] + step2[13]);
  output[3] = WRAPLOW(step2[3] + step2[12]);
  output[4] = WRAPLOW(step2[4] + step2[11]);
  output[5] = WRAPLOW(step2[5] + step2[10]);
  output[6] = WRAPLOW(step2[6] + step2[9]);
  output[7] = WRAPLOW(step2[7] + step2[8]);
  output[8] = WRAPLOW(step2[7] - step2[8]);
  output[9] = WRAPLOW(step2[6] - step2[9]);
  output[10] = WRAPLOW(step2[5] - step2[10]);
  output[11] = WRAPLOW(step2[4] - step2[11]);
  output[12] = WRAPLOW(step2[3] - step2[12]);
  output[13] = WRAPLOW(step2[2] - step2[13]);
  output[14] = WRAPLOW(step2[1] - step2[14]);
  output[15] = WRAPLOW(step2[0] - step2[15]);
}
557 
/* Full 16x16 inverse DCT: row pass, column pass, final rounding by 1/64,
   residual added into the destination block. */
void aom_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  tran_low_t buf[16 * 16];
  tran_low_t col_in[16], col_out[16];
  int r, c;

  // Row pass: 1-D IDCT on each of the 16 input rows.
  for (r = 0; r < 16; ++r) aom_idct16_c(input + 16 * r, buf + 16 * r);

  // Column pass: 1-D IDCT per column, then round, shift and accumulate.
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r) col_in[r] = buf[r * 16 + c];
    aom_idct16_c(col_in, col_out);
    for (r = 0; r < 16; ++r) {
      dest[r * stride + c] = clip_pixel_add(dest[r * stride + c],
                                            ROUND_POWER_OF_TWO(col_out[r], 6));
    }
  }
}
582 
/* 16-point 1-D inverse ADST. The input is read in the permuted order the
   forward transform emits; the output carries the ADST sign/permutation
   pattern. Four stages of rotations/butterflies, mirroring aom_iadst8_c. */
void aom_iadst16_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;

  tran_high_t x0 = input[15];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[13];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[11];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[9];
  tran_high_t x7 = input[6];
  tran_high_t x8 = input[7];
  tran_high_t x9 = input[8];
  tran_high_t x10 = input[5];
  tran_high_t x11 = input[10];
  tran_high_t x12 = input[3];
  tran_high_t x13 = input[12];
  tran_high_t x14 = input[1];
  tran_high_t x15 = input[14];

  // Fast path: an all-zero input block yields an all-zero output.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
        x13 | x14 | x15)) {
    output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
        output[6] = output[7] = output[8] = output[9] = output[10] =
            output[11] = output[12] = output[13] = output[14] = output[15] = 0;
    return;
  }

  // stage 1: rotations pairing each input with its mirror.
  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;

  x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
  x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
  x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
  x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
  x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
  x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
  x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
  x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
  x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
  x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
  x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
  x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
  x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
  x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s7 - s15));

  // stage 2: pass the first half through; rotate the second half.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

  x0 = WRAPLOW(s0 + s4);
  x1 = WRAPLOW(s1 + s5);
  x2 = WRAPLOW(s2 + s6);
  x3 = WRAPLOW(s3 + s7);
  x4 = WRAPLOW(s0 - s4);
  x5 = WRAPLOW(s1 - s5);
  x6 = WRAPLOW(s2 - s6);
  x7 = WRAPLOW(s3 - s7);
  x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
  x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
  x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
  x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
  x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
  x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
  x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s11 - s15));

  // stage 3
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

  x0 = WRAPLOW(s0 + s2);
  x1 = WRAPLOW(s1 + s3);
  x2 = WRAPLOW(s0 - s2);
  x3 = WRAPLOW(s1 - s3);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
  x8 = WRAPLOW(s8 + s10);
  x9 = WRAPLOW(s9 + s11);
  x10 = WRAPLOW(s8 - s10);
  x11 = WRAPLOW(s9 - s11);
  x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
  x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
  x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s13 - s15));

  // stage 4: final pi/4 rotations on the remaining pairs.
  s2 = (-cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (-x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (-x10 + x11);
  s14 = (-cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = WRAPLOW(dct_const_round_shift(s2));
  x3 = WRAPLOW(dct_const_round_shift(s3));
  x6 = WRAPLOW(dct_const_round_shift(s6));
  x7 = WRAPLOW(dct_const_round_shift(s7));
  x10 = WRAPLOW(dct_const_round_shift(s10));
  x11 = WRAPLOW(dct_const_round_shift(s11));
  x14 = WRAPLOW(dct_const_round_shift(s14));
  x15 = WRAPLOW(dct_const_round_shift(s15));

  // Write out in permuted order with the ADST sign pattern.
  output[0] = WRAPLOW(x0);
  output[1] = WRAPLOW(-x8);
  output[2] = WRAPLOW(x12);
  output[3] = WRAPLOW(-x4);
  output[4] = WRAPLOW(x6);
  output[5] = WRAPLOW(x14);
  output[6] = WRAPLOW(x10);
  output[7] = WRAPLOW(x2);
  output[8] = WRAPLOW(x3);
  output[9] = WRAPLOW(x11);
  output[10] = WRAPLOW(x15);
  output[11] = WRAPLOW(x7);
  output[12] = WRAPLOW(x5);
  output[13] = WRAPLOW(-x13);
  output[14] = WRAPLOW(x9);
  output[15] = WRAPLOW(-x1);
}
753 
/* 16x16 inverse DCT for blocks whose non-zero coefficients lie in the
   upper-left 8x8 area (eob <= 38): the row pass only needs 8 rows. */
void aom_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  tran_low_t buf[16 * 16] = { 0 };
  tran_low_t col_in[16], col_out[16];
  int r, c;

  // Row pass: only the first 8 rows carry non-zero coefficients; the
  // remaining rows of `buf` stay zero.
  for (r = 0; r < 8; ++r) aom_idct16_c(input + 16 * r, buf + 16 * r);

  // Column pass over all 16 columns, then round, shift and accumulate.
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r) col_in[r] = buf[r * 16 + c];
    aom_idct16_c(col_in, col_out);
    for (r = 0; r < 16; ++r) {
      dest[r * stride + c] = clip_pixel_add(dest[r * stride + c],
                                            ROUND_POWER_OF_TWO(col_out[r], 6));
    }
  }
}
779 
/* 16x16 inverse DCT for blocks whose non-zero coefficients lie in the
   upper-left 4x4 area (eob <= 10): the row pass only needs 4 rows. */
void aom_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  tran_low_t buf[16 * 16] = { 0 };
  tran_low_t col_in[16], col_out[16];
  int r, c;

  // Row pass: only the first 4 rows carry non-zero coefficients; the
  // remaining rows of `buf` stay zero.
  for (r = 0; r < 4; ++r) aom_idct16_c(input + 16 * r, buf + 16 * r);

  // Column pass over all 16 columns, then round, shift and accumulate.
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r) col_in[r] = buf[r * 16 + c];
    aom_idct16_c(col_in, col_out);
    for (r = 0; r < 16; ++r) {
      dest[r * stride + c] = clip_pixel_add(dest[r * stride + c],
                                            ROUND_POWER_OF_TWO(col_out[r], 6));
    }
  }
}
805 
/* DC-only 16x16 inverse DCT: add a single constant to every pixel. */
void aom_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int r, c;
  tran_high_t a1;
  tran_low_t dc = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
  dc = WRAPLOW(dct_const_round_shift(dc * cospi_16_64));
  a1 = ROUND_POWER_OF_TWO(dc, 6);
  if (a1 == 0) return;  // nothing to add
  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) dest[c] = clip_pixel_add(dest[c], a1);
    dest += stride;
  }
}
818 
// 1-D 32-point inverse DCT used by the 32x32 inverse transforms.
// Implements the standard butterfly flow graph in eight stages: stage 1
// reorders the even-indexed inputs (which form a 16-point sub-transform)
// and rotates the odd-indexed inputs; later stages alternate butterflies
// (add/sub pairs) with cospi rotations, each rotation rounded via
// dct_const_round_shift and range-limited with WRAPLOW. The exact
// statement order matters: step1/step2 ping-pong as scratch buffers and
// several entries are overwritten within a stage.
void aom_idct32_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[32], step2[32];
  tran_high_t temp1, temp2;  // rotation products, pre-rounding

  // stage 1
  // Even-indexed inputs, permuted for the embedded 16-point transform.
  step1[0] = input[0];
  step1[1] = input[16];
  step1[2] = input[8];
  step1[3] = input[24];
  step1[4] = input[4];
  step1[5] = input[20];
  step1[6] = input[12];
  step1[7] = input[28];
  step1[8] = input[2];
  step1[9] = input[18];
  step1[10] = input[10];
  step1[11] = input[26];
  step1[12] = input[6];
  step1[13] = input[22];
  step1[14] = input[14];
  step1[15] = input[30];

  // Odd-indexed inputs: paired rotations by odd multiples of pi/64.
  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
  step1[16] = WRAPLOW(dct_const_round_shift(temp1));
  step1[31] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
  step1[30] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
  step1[29] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
  step1[28] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
  step1[27] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
  step1[25] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
  step1[24] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2
  // [0..7] pass through; [8..15] rotate; [16..31] butterfly in pairs.
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
  step2[15] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));

  step2[16] = WRAPLOW(step1[16] + step1[17]);
  step2[17] = WRAPLOW(step1[16] - step1[17]);
  step2[18] = WRAPLOW(-step1[18] + step1[19]);
  step2[19] = WRAPLOW(step1[18] + step1[19]);
  step2[20] = WRAPLOW(step1[20] + step1[21]);
  step2[21] = WRAPLOW(step1[20] - step1[21]);
  step2[22] = WRAPLOW(-step1[22] + step1[23]);
  step2[23] = WRAPLOW(step1[22] + step1[23]);
  step2[24] = WRAPLOW(step1[24] + step1[25]);
  step2[25] = WRAPLOW(step1[24] - step1[25]);
  step2[26] = WRAPLOW(-step1[26] + step1[27]);
  step2[27] = WRAPLOW(step1[26] + step1[27]);
  step2[28] = WRAPLOW(step1[28] + step1[29]);
  step2[29] = WRAPLOW(step1[28] - step1[29]);
  step2[30] = WRAPLOW(-step1[30] + step1[31]);
  step2[31] = WRAPLOW(step1[30] + step1[31]);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  step1[8] = WRAPLOW(step2[8] + step2[9]);
  step1[9] = WRAPLOW(step2[8] - step2[9]);
  step1[10] = WRAPLOW(-step2[10] + step2[11]);
  step1[11] = WRAPLOW(step2[10] + step2[11]);
  step1[12] = WRAPLOW(step2[12] + step2[13]);
  step1[13] = WRAPLOW(step2[12] - step2[13]);
  step1[14] = WRAPLOW(-step2[14] + step2[15]);
  step1[15] = WRAPLOW(step2[14] + step2[15]);

  step1[16] = step2[16];
  step1[31] = step2[31];
  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
  step1[30] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
  step1[19] = step2[19];
  step1[20] = step2[20];
  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  step2[11] = step1[11];
  step2[12] = step1[12];

  step2[16] = WRAPLOW(step1[16] + step1[19]);
  step2[17] = WRAPLOW(step1[17] + step1[18]);
  step2[18] = WRAPLOW(step1[17] - step1[18]);
  step2[19] = WRAPLOW(step1[16] - step1[19]);
  step2[20] = WRAPLOW(-step1[20] + step1[23]);
  step2[21] = WRAPLOW(-step1[21] + step1[22]);
  step2[22] = WRAPLOW(step1[21] + step1[22]);
  step2[23] = WRAPLOW(step1[20] + step1[23]);

  step2[24] = WRAPLOW(step1[24] + step1[27]);
  step2[25] = WRAPLOW(step1[25] + step1[26]);
  step2[26] = WRAPLOW(step1[25] - step1[26]);
  step2[27] = WRAPLOW(step1[24] - step1[27]);
  step2[28] = WRAPLOW(-step1[28] + step1[31]);
  step2[29] = WRAPLOW(-step1[29] + step1[30]);
  step2[30] = WRAPLOW(step1[29] + step1[30]);
  step2[31] = WRAPLOW(step1[28] + step1[31]);

  // stage 5
  step1[0] = WRAPLOW(step2[0] + step2[3]);
  step1[1] = WRAPLOW(step2[1] + step2[2]);
  step1[2] = WRAPLOW(step2[1] - step2[2]);
  step1[3] = WRAPLOW(step2[0] - step2[3]);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11]);
  step1[9] = WRAPLOW(step2[9] + step2[10]);
  step1[10] = WRAPLOW(step2[9] - step2[10]);
  step1[11] = WRAPLOW(step2[8] - step2[11]);
  step1[12] = WRAPLOW(-step2[12] + step2[15]);
  step1[13] = WRAPLOW(-step2[13] + step2[14]);
  step1[14] = WRAPLOW(step2[13] + step2[14]);
  step1[15] = WRAPLOW(step2[12] + step2[15]);

  step1[16] = step2[16];
  step1[17] = step2[17];
  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
  step1[28] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
  step1[22] = step2[22];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[25];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // stage 6
  step2[0] = WRAPLOW(step1[0] + step1[7]);
  step2[1] = WRAPLOW(step1[1] + step1[6]);
  step2[2] = WRAPLOW(step1[2] + step1[5]);
  step2[3] = WRAPLOW(step1[3] + step1[4]);
  step2[4] = WRAPLOW(step1[3] - step1[4]);
  step2[5] = WRAPLOW(step1[2] - step1[5]);
  step2[6] = WRAPLOW(step1[1] - step1[6]);
  step2[7] = WRAPLOW(step1[0] - step1[7]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
  step2[14] = step1[14];
  step2[15] = step1[15];

  step2[16] = WRAPLOW(step1[16] + step1[23]);
  step2[17] = WRAPLOW(step1[17] + step1[22]);
  step2[18] = WRAPLOW(step1[18] + step1[21]);
  step2[19] = WRAPLOW(step1[19] + step1[20]);
  step2[20] = WRAPLOW(step1[19] - step1[20]);
  step2[21] = WRAPLOW(step1[18] - step1[21]);
  step2[22] = WRAPLOW(step1[17] - step1[22]);
  step2[23] = WRAPLOW(step1[16] - step1[23]);

  step2[24] = WRAPLOW(-step1[24] + step1[31]);
  step2[25] = WRAPLOW(-step1[25] + step1[30]);
  step2[26] = WRAPLOW(-step1[26] + step1[29]);
  step2[27] = WRAPLOW(-step1[27] + step1[28]);
  step2[28] = WRAPLOW(step1[27] + step1[28]);
  step2[29] = WRAPLOW(step1[26] + step1[29]);
  step2[30] = WRAPLOW(step1[25] + step1[30]);
  step2[31] = WRAPLOW(step1[24] + step1[31]);

  // stage 7
  step1[0] = WRAPLOW(step2[0] + step2[15]);
  step1[1] = WRAPLOW(step2[1] + step2[14]);
  step1[2] = WRAPLOW(step2[2] + step2[13]);
  step1[3] = WRAPLOW(step2[3] + step2[12]);
  step1[4] = WRAPLOW(step2[4] + step2[11]);
  step1[5] = WRAPLOW(step2[5] + step2[10]);
  step1[6] = WRAPLOW(step2[6] + step2[9]);
  step1[7] = WRAPLOW(step2[7] + step2[8]);
  step1[8] = WRAPLOW(step2[7] - step2[8]);
  step1[9] = WRAPLOW(step2[6] - step2[9]);
  step1[10] = WRAPLOW(step2[5] - step2[10]);
  step1[11] = WRAPLOW(step2[4] - step2[11]);
  step1[12] = WRAPLOW(step2[3] - step2[12]);
  step1[13] = WRAPLOW(step2[2] - step2[13]);
  step1[14] = WRAPLOW(step2[1] - step2[14]);
  step1[15] = WRAPLOW(step2[0] - step2[15]);

  step1[16] = step2[16];
  step1[17] = step2[17];
  step1[18] = step2[18];
  step1[19] = step2[19];
  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
  temp2 = (step2[20] + step2[27]) * cospi_16_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
  temp2 = (step2[21] + step2[26]) * cospi_16_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
  temp2 = (step2[22] + step2[25]) * cospi_16_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
  temp2 = (step2[23] + step2[24]) * cospi_16_64;
  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
  step1[24] = WRAPLOW(dct_const_round_shift(temp2));
  step1[28] = step2[28];
  step1[29] = step2[29];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // final stage
  // Last butterfly: output[k] and output[31-k] are the sum/difference of
  // step1[k] and step1[31-k].
  output[0] = WRAPLOW(step1[0] + step1[31]);
  output[1] = WRAPLOW(step1[1] + step1[30]);
  output[2] = WRAPLOW(step1[2] + step1[29]);
  output[3] = WRAPLOW(step1[3] + step1[28]);
  output[4] = WRAPLOW(step1[4] + step1[27]);
  output[5] = WRAPLOW(step1[5] + step1[26]);
  output[6] = WRAPLOW(step1[6] + step1[25]);
  output[7] = WRAPLOW(step1[7] + step1[24]);
  output[8] = WRAPLOW(step1[8] + step1[23]);
  output[9] = WRAPLOW(step1[9] + step1[22]);
  output[10] = WRAPLOW(step1[10] + step1[21]);
  output[11] = WRAPLOW(step1[11] + step1[20]);
  output[12] = WRAPLOW(step1[12] + step1[19]);
  output[13] = WRAPLOW(step1[13] + step1[18]);
  output[14] = WRAPLOW(step1[14] + step1[17]);
  output[15] = WRAPLOW(step1[15] + step1[16]);
  output[16] = WRAPLOW(step1[15] - step1[16]);
  output[17] = WRAPLOW(step1[14] - step1[17]);
  output[18] = WRAPLOW(step1[13] - step1[18]);
  output[19] = WRAPLOW(step1[12] - step1[19]);
  output[20] = WRAPLOW(step1[11] - step1[20]);
  output[21] = WRAPLOW(step1[10] - step1[21]);
  output[22] = WRAPLOW(step1[9] - step1[22]);
  output[23] = WRAPLOW(step1[8] - step1[23]);
  output[24] = WRAPLOW(step1[7] - step1[24]);
  output[25] = WRAPLOW(step1[6] - step1[25]);
  output[26] = WRAPLOW(step1[5] - step1[26]);
  output[27] = WRAPLOW(step1[4] - step1[27]);
  output[28] = WRAPLOW(step1[3] - step1[28]);
  output[29] = WRAPLOW(step1[2] - step1[29]);
  output[30] = WRAPLOW(step1[1] - step1[30]);
  output[31] = WRAPLOW(step1[0] - step1[31]);
}
1185 
1186 #if CONFIG_MRC_TX
// Masked 32x32 inverse DCT (MRC transform): row pass, column pass, then
// the reconstruction add is applied only where the mask byte is non-zero.
void aom_imrc32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
                              int stride, uint8_t *mask) {
  tran_low_t out[32 * 32];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[32], temp_out[32];

  // Rows: skip the 1-D transform for all-zero rows (OR-reduce the row;
  // same low-16-bit reduction the pairwise tree performed).
  for (i = 0; i < 32; ++i) {
    int16_t row_or = 0;
    for (j = 0; j < 32; ++j) row_or |= (int16_t)input[j];
    if (row_or)
      aom_idct32_c(input, outptr);
    else
      memset(outptr, 0, sizeof(tran_low_t) * 32);
    input += 32;
    outptr += 32;
  }

  // Columns: gather, transform, then add with >>6 rounding where masked.
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
    aom_idct32_c(temp_in, temp_out);
    for (j = 0; j < 32; ++j) {
      // Only add the coefficient if the mask value is 1.
      if (mask[j * 32 + i]) {
        dest[j * stride + i] = clip_pixel_add(
            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6));
      }
    }
  }
}
1227 
// Masked 32x32 inverse DCT for the <=135-coefficient case: only the
// top-left 16x16 region can be non-zero, so just the first 16 rows are
// transformed (the rest of `out` stays zero from the initializer).
void aom_imrc32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                             uint8_t *mask) {
  tran_low_t out[32 * 32] = { 0 };
  tran_low_t *row_out = out;
  int col, k;
  tran_low_t col_in[32], col_out[32];

  // Rows: only upper-left 16x16 has non-zero coeff.
  for (k = 0; k < 16; ++k) {
    aom_idct32_c(input, row_out);
    input += 32;
    row_out += 32;
  }

  // Columns: transform every column; add to dest only where masked.
  for (col = 0; col < 32; ++col) {
    for (k = 0; k < 32; ++k) col_in[k] = out[k * 32 + col];
    aom_idct32_c(col_in, col_out);
    for (k = 0; k < 32; ++k) {
      // Only add the coefficient if the mask value is 1.
      if (mask[k * 32 + col]) {
        dest[k * stride + col] = clip_pixel_add(
            dest[k * stride + col], ROUND_POWER_OF_TWO(col_out[k], 6));
      }
    }
  }
}
1257 
// Masked 32x32 inverse DCT for the <=34-coefficient case: only the
// top-left 8x8 region can be non-zero, so just the first 8 rows are
// transformed (the rest of `out` stays zero from the initializer).
void aom_imrc32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                            uint8_t *mask) {
  tran_low_t out[32 * 32] = { 0 };
  tran_low_t *row_out = out;
  int col, k;
  tran_low_t col_in[32], col_out[32];

  // Rows: only upper-left 8x8 has non-zero coeff.
  for (k = 0; k < 8; ++k) {
    aom_idct32_c(input, row_out);
    input += 32;
    row_out += 32;
  }

  // Columns: transform every column; add to dest only where masked.
  for (col = 0; col < 32; ++col) {
    for (k = 0; k < 32; ++k) col_in[k] = out[k * 32 + col];
    aom_idct32_c(col_in, col_out);
    for (k = 0; k < 32; ++k) {
      // Only add the coefficient if the mask value is 1.
      if (mask[k * 32 + col]) {
        dest[k * stride + col] = clip_pixel_add(
            dest[k * stride + col], ROUND_POWER_OF_TWO(col_out[k], 6));
      }
    }
  }
}
1287 #endif  // CONFIG_MRC_TX
1288 
// Full 32x32 inverse DCT: 1-D row transforms into a scratch buffer, then
// 1-D column transforms, rounding by >>6 and adding into the prediction.
void aom_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  tran_low_t out[32 * 32];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[32], temp_out[32];

  // Rows: skip the 1-D transform for all-zero rows (OR-reduce the row;
  // same low-16-bit reduction the pairwise tree performed).
  for (i = 0; i < 32; ++i) {
    int16_t row_or = 0;
    for (j = 0; j < 32; ++j) row_or |= (int16_t)input[j];
    if (row_or)
      aom_idct32_c(input, outptr);
    else
      memset(outptr, 0, sizeof(tran_low_t) * 32);
    input += 32;
    outptr += 32;
  }

  // Columns: gather one column, transform it, add the rounded result.
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
    aom_idct32_c(temp_in, temp_out);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}
1325 
// 32x32 inverse DCT for the <=135-coefficient case: only the top-left
// 16x16 region can be non-zero, so just the first 16 rows are
// transformed (the rest of `out` stays zero from the initializer).
void aom_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  tran_low_t out[32 * 32] = { 0 };
  tran_low_t *row_out = out;
  int col, k;
  tran_low_t col_in[32], col_out[32];

  // Rows: only upper-left 16x16 has non-zero coeff.
  for (k = 0; k < 16; ++k) {
    aom_idct32_c(input, row_out);
    input += 32;
    row_out += 32;
  }

  // Columns: transform every column, then add the >>6-rounded result.
  for (col = 0; col < 32; ++col) {
    for (k = 0; k < 32; ++k) col_in[k] = out[k * 32 + col];
    aom_idct32_c(col_in, col_out);
    for (k = 0; k < 32; ++k) {
      dest[k * stride + col] = clip_pixel_add(
          dest[k * stride + col], ROUND_POWER_OF_TWO(col_out[k], 6));
    }
  }
}
1351 
// 32x32 inverse DCT for the <=34-coefficient case: only the top-left 8x8
// region can be non-zero, so just the first 8 rows are transformed (the
// rest of `out` stays zero from the initializer).
void aom_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  tran_low_t out[32 * 32] = { 0 };
  tran_low_t *row_out = out;
  int col, k;
  tran_low_t col_in[32], col_out[32];

  // Rows: only upper-left 8x8 has non-zero coeff.
  for (k = 0; k < 8; ++k) {
    aom_idct32_c(input, row_out);
    input += 32;
    row_out += 32;
  }

  // Columns: transform every column, then add the >>6-rounded result.
  for (col = 0; col < 32; ++col) {
    for (k = 0; k < 32; ++k) col_in[k] = out[k * 32 + col];
    aom_idct32_c(col_in, col_out);
    for (k = 0; k < 32; ++k) {
      dest[k * stride + col] = clip_pixel_add(
          dest[k * stride + col], ROUND_POWER_OF_TWO(col_out[k], 6));
    }
  }
}
1377 
// DC-only inverse 32x32 DCT: the whole reconstruction reduces to adding
// one constant offset to every pixel of the 32x32 destination block.
void aom_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int r, c;
  tran_high_t offset;

  // Both 1-D passes multiply the DC term by cospi_16_64 with rounding;
  // the final >>6 matches the per-pixel output rounding.
  tran_low_t dc = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
  dc = WRAPLOW(dct_const_round_shift(dc * cospi_16_64));
  offset = ROUND_POWER_OF_TWO(dc, 6);
  if (offset == 0) return;  // Nothing to add: destination is unchanged.

  for (r = 0; r < 32; ++r, dest += stride) {
    for (c = 0; c < 32; ++c) dest[c] = clip_pixel_add(dest[c], offset);
  }
}
1392 
void aom_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
  /* High-bitdepth 4-point reversible, orthonormal inverse Walsh-Hadamard
     transform: 3.5 adds and 0.5 shifts per pixel. Rows first into an
     intermediate buffer, then columns added into the prediction. */
  int k;
  tran_low_t intermediate[16];
  tran_high_t t0, t1, t2, t3, half;
  const tran_low_t *src = input;
  tran_low_t *row = intermediate;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  /* First pass: transform each row of coefficients. */
  for (k = 0; k < 4; k++) {
    t0 = src[0] >> UNIT_QUANT_SHIFT;
    t2 = src[1] >> UNIT_QUANT_SHIFT;
    t3 = src[2] >> UNIT_QUANT_SHIFT;
    t1 = src[3] >> UNIT_QUANT_SHIFT;
    t0 += t2;
    t3 -= t1;
    half = (t0 - t3) >> 1; /* lifting step shared by both middle outputs */
    t1 = half - t1;
    t2 = half - t2;
    t0 -= t1;
    t3 += t2;
    row[0] = HIGHBD_WRAPLOW(t0, bd);
    row[1] = HIGHBD_WRAPLOW(t1, bd);
    row[2] = HIGHBD_WRAPLOW(t2, bd);
    row[3] = HIGHBD_WRAPLOW(t3, bd);
    src += 4;
    row += 4;
  }

  /* Second pass: transform each column and add into the destination. */
  src = intermediate;
  for (k = 0; k < 4; k++) {
    t0 = src[4 * 0];
    t2 = src[4 * 1];
    t3 = src[4 * 2];
    t1 = src[4 * 3];
    t0 += t2;
    t3 -= t1;
    half = (t0 - t3) >> 1;
    t1 = half - t1;
    t2 = half - t2;
    t0 -= t1;
    t3 += t2;
    dest[stride * 0] =
        highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(t0, bd), bd);
    dest[stride * 1] =
        highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(t1, bd), bd);
    dest[stride * 2] =
        highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(t2, bd), bd);
    dest[stride * 3] =
        highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(t3, bd), bd);

    src++;
    dest++;
  }
}
1450 
// DC-only high-bitdepth inverse 4x4 Walsh-Hadamard transform.
void aom_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
                                int dest_stride, int bd) {
  int c;
  tran_high_t dc, half;
  tran_low_t row[4];
  const tran_low_t *src = in;
  tran_low_t *row_ptr = row;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  (void)bd;  // bd may be unused depending on macro configuration.

  // First pass: only the DC coefficient is non-zero, so the row transform
  // reduces to splitting it into dc - (dc >> 1) and three copies of dc >> 1.
  dc = src[0] >> UNIT_QUANT_SHIFT;
  half = dc >> 1;
  dc -= half;
  row_ptr[0] = HIGHBD_WRAPLOW(dc, bd);
  row_ptr[1] = row_ptr[2] = row_ptr[3] = HIGHBD_WRAPLOW(half, bd);

  // Second pass: apply the same split down each column and accumulate
  // into the prediction with bit-depth-aware clipping.
  src = row;
  for (c = 0; c < 4; c++) {
    half = src[0] >> 1;
    dc = src[0] - half;
    dest[dest_stride * 0] =
        highbd_clip_pixel_add(dest[dest_stride * 0], dc, bd);
    dest[dest_stride * 1] =
        highbd_clip_pixel_add(dest[dest_stride * 1], half, bd);
    dest[dest_stride * 2] =
        highbd_clip_pixel_add(dest[dest_stride * 2], half, bd);
    dest[dest_stride * 3] =
        highbd_clip_pixel_add(dest[dest_stride * 3], half, bd);
    src++;
    dest++;
  }
}
1483