1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <math.h>
13 #include <string.h>
14
15 #include "./aom_dsp_rtcd.h"
16 #include "aom_dsp/inv_txfm.h"
17 #if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8 || CONFIG_DAALA_DCT16 || \
18 CONFIG_DAALA_DCT32 || CONFIG_DAALA_DCT64
19 #include "av1/common/daala_tx.h"
20 #endif
21
void aom_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard transform in
     3.5 adds and 0.5 shifts per pixel: rows first into a scratch buffer,
     then columns, accumulating the result into the destination pixels. */
  tran_low_t scratch[16];
  tran_high_t a, b, c, d, e;
  const tran_low_t *src = input;
  tran_low_t *dst = scratch;
  int n;

  /* Row pass: remove the forward transform's up-scaling, then apply the
     lifting butterfly. */
  for (n = 0; n < 4; ++n) {
    a = src[0] >> UNIT_QUANT_SHIFT;
    c = src[1] >> UNIT_QUANT_SHIFT;
    d = src[2] >> UNIT_QUANT_SHIFT;
    b = src[3] >> UNIT_QUANT_SHIFT;
    a += c;
    d -= b;
    e = (a - d) >> 1;
    b = e - b;
    c = e - c;
    a -= b;
    d += c;
    dst[0] = WRAPLOW(a);
    dst[1] = WRAPLOW(b);
    dst[2] = WRAPLOW(c);
    dst[3] = WRAPLOW(d);
    src += 4;
    dst += 4;
  }

  /* Column pass: identical butterfly down each column, with the result
     added into the destination. */
  src = scratch;
  for (n = 0; n < 4; ++n) {
    a = src[4 * 0];
    c = src[4 * 1];
    d = src[4 * 2];
    b = src[4 * 3];
    a += c;
    d -= b;
    e = (a - d) >> 1;
    b = e - b;
    c = e - c;
    a -= b;
    d += c;
    dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a));
    dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b));
    dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c));
    dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d));
    ++src;
    ++dest;
  }
}
73
void aom_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest,
                         int dest_stride) {
  /* DC-only inverse WHT: only in[0] is non-zero, so the row pass collapses
     to one scaled value plus three equal halves, and every column sees the
     same 4-entry input. */
  tran_high_t dc, half, e, a;
  tran_low_t row[4];
  int col;

  dc = in[0] >> UNIT_QUANT_SHIFT;
  half = dc >> 1;
  dc -= half;
  row[0] = WRAPLOW(dc);
  row[1] = row[2] = row[3] = WRAPLOW(half);

  /* Column pass: split each row value into (value - value/2, value/2) and
     accumulate into the four destination rows. */
  for (col = 0; col < 4; ++col) {
    e = row[col] >> 1;
    a = row[col] - e;
    dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a);
    dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e);
    dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e);
    dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e);
    ++dest;
  }
}
99
void aom_idct4_c(const tran_low_t *input, tran_low_t *output) {
  /* 4-point inverse DCT butterfly using 14-bit fixed-point cosine
     constants, with a round-shift after each rotation. */
  tran_low_t stage[4];
  tran_high_t even0, even1, odd0, odd1;

  // Stage 1: even-part rotation of (input[0], input[2]) and odd-part
  // rotation of (input[1], input[3]).
  even0 = (input[0] + input[2]) * cospi_16_64;
  even1 = (input[0] - input[2]) * cospi_16_64;
  stage[0] = WRAPLOW(dct_const_round_shift(even0));
  stage[1] = WRAPLOW(dct_const_round_shift(even1));
  odd0 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  odd1 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  stage[2] = WRAPLOW(dct_const_round_shift(odd0));
  stage[3] = WRAPLOW(dct_const_round_shift(odd1));

  // Stage 2: combine even and odd halves.
  output[0] = WRAPLOW(stage[0] + stage[3]);
  output[1] = WRAPLOW(stage[1] + stage[2]);
  output[2] = WRAPLOW(stage[1] - stage[2]);
  output[3] = WRAPLOW(stage[0] - stage[3]);
}
119
void aom_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  /* Full 4x4 inverse DCT (all 16 coefficients): 1-D transforms across the
     rows, then down the columns, rounding by 1/16 before accumulating into
     the destination. */
  tran_low_t intermediate[4 * 4];
  tran_low_t col_in[4], col_out[4];
  int r, c;

  // Row transforms.
  for (r = 0; r < 4; ++r) {
    aom_idct4_c(&input[r * 4], &intermediate[r * 4]);
  }

  // Column transforms and reconstruction.
  for (c = 0; c < 4; ++c) {
    for (r = 0; r < 4; ++r) col_in[r] = intermediate[r * 4 + c];
    aom_idct4_c(col_in, col_out);
    for (r = 0; r < 4; ++r) {
      dest[r * stride + c] = clip_pixel_add(dest[r * stride + c],
                                            ROUND_POWER_OF_TWO(col_out[r], 4));
    }
  }
}
143
void aom_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
                         int dest_stride) {
  /* DC-only 4x4 inverse DCT: input[0] is scaled through both 1-D passes,
     rounded by 1/16, and the resulting constant is added to every pixel of
     the 4x4 block. */
  int r, c;
  tran_high_t offset;
  tran_low_t dc = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
  dc = WRAPLOW(dct_const_round_shift(dc * cospi_16_64));
  offset = ROUND_POWER_OF_TWO(dc, 4);

  if (offset == 0) return;  // zero DC leaves the destination unchanged

  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) dest[c] = clip_pixel_add(dest[c], offset);
    dest += dest_stride;
  }
}
162
void aom_idct8_c(const tran_low_t *input, tran_low_t *output) {
  // 8-point inverse DCT on one row/column, using 14-bit fixed-point cosine
  // constants (cospi_*_64): each rotation is followed by
  // dct_const_round_shift, and each add/subtract is wrapped with WRAPLOW.
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;
  // stage 1: gather the even inputs and rotate odd pairs (1,7) and (5,3).
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2: rotations for the even 4-point part, butterflies for the odd.
  temp1 = (step1[0] + step1[2]) * cospi_16_64;
  temp2 = (step1[0] - step1[2]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);

  // stage 3: finish the even 4-point IDCT; rotate the middle odd pair by
  // pi/4 (cospi_16_64).
  step1[0] = WRAPLOW(step2[0] + step2[3]);
  step1[1] = WRAPLOW(step2[1] + step2[2]);
  step1[2] = WRAPLOW(step2[1] - step2[2]);
  step1[3] = WRAPLOW(step2[0] - step2[3]);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];

  // stage 4: final butterfly combining the even and odd halves.
  output[0] = WRAPLOW(step1[0] + step1[7]);
  output[1] = WRAPLOW(step1[1] + step1[6]);
  output[2] = WRAPLOW(step1[2] + step1[5]);
  output[3] = WRAPLOW(step1[3] + step1[4]);
  output[4] = WRAPLOW(step1[3] - step1[4]);
  output[5] = WRAPLOW(step1[2] - step1[5]);
  output[6] = WRAPLOW(step1[1] - step1[6]);
  output[7] = WRAPLOW(step1[0] - step1[7]);
}
216
void aom_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  /* Full 8x8 inverse DCT (all 64 coefficients): rows first, then columns,
     rounding by 1/32 before accumulating into the destination. */
  tran_low_t intermediate[8 * 8];
  tran_low_t col_in[8], col_out[8];
  int r, c;

  // Row transforms.
  for (r = 0; r < 8; ++r) {
    aom_idct8_c(&input[r * 8], &intermediate[r * 8]);
  }

  // Column transforms and reconstruction.
  for (c = 0; c < 8; ++c) {
    for (r = 0; r < 8; ++r) col_in[r] = intermediate[r * 8 + c];
    aom_idct8_c(col_in, col_out);
    for (r = 0; r < 8; ++r) {
      dest[r * stride + c] = clip_pixel_add(dest[r * stride + c],
                                            ROUND_POWER_OF_TWO(col_out[r], 5));
    }
  }
}
240
void aom_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  /* DC-only 8x8 inverse DCT: scale input[0] through both 1-D passes, round
     by 1/32, and add the constant to every pixel of the 8x8 block. */
  int r, c;
  tran_high_t offset;
  tran_low_t dc = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
  dc = WRAPLOW(dct_const_round_shift(dc * cospi_16_64));
  offset = ROUND_POWER_OF_TWO(dc, 5);
  if (offset == 0) return;  // zero DC leaves the destination unchanged
  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c) dest[c] = clip_pixel_add(dest[c], offset);
    dest += stride;
  }
}
253
void aom_iadst4_c(const tran_low_t *input, tran_low_t *output) {
  /* 4-point inverse ADST using the sinpi_*_9 fixed-point basis.
     1-D transform scaling factor is sqrt(2). The overall dynamic range is
     14b (input) + 14b (multiplication scaling) + 1b (addition) = 29b,
     hence the output bit depth is 15b. */
  tran_high_t a, b, c, d;
  const tran_low_t x0 = input[0];
  const tran_low_t x1 = input[1];
  const tran_low_t x2 = input[2];
  const tran_low_t x3 = input[3];

  // All-zero input maps to all-zero output; skip the arithmetic.
  if (!(x0 | x1 | x2 | x3)) {
    output[0] = output[1] = output[2] = output[3] = 0;
    return;
  }

  // Accumulate each basis projection directly (same integer sums as the
  // reference two-step formulation, grouped per output term).
  a = sinpi_1_9 * x0 + sinpi_4_9 * x2 + sinpi_2_9 * x3;
  b = sinpi_2_9 * x0 - sinpi_1_9 * x2 - sinpi_4_9 * x3;
  c = sinpi_3_9 * WRAPLOW(x0 - x2 + x3);
  d = sinpi_3_9 * x1;

  output[0] = WRAPLOW(dct_const_round_shift(a + d));
  output[1] = WRAPLOW(dct_const_round_shift(b + d));
  output[2] = WRAPLOW(dct_const_round_shift(c));
  output[3] = WRAPLOW(dct_const_round_shift(a + b - d));
}
290
void aom_iadst8_c(const tran_low_t *input, tran_low_t *output) {
  // 8-point inverse ADST. Inputs are consumed in the permuted order
  // {7, 0, 5, 2, 3, 4, 1, 6}; rotations use 14-bit cospi_*_64 constants
  // with dct_const_round_shift, and sums are wrapped with WRAPLOW.
  // NOTE(review): stage results are deliberately narrowed through plain
  // 'int' via the explicit casts below, matching the reference code.
  int s0, s1, s2, s3, s4, s5, s6, s7;

  tran_high_t x0 = input[7];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[5];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[3];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[1];
  tran_high_t x7 = input[6];

  // All-zero input maps to all-zero output; skip the arithmetic.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
        output[6] = output[7] = 0;
    return;
  }

  // stage 1: rotate the four input pairs.
  s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
  s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
  s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
  s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
  s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
  s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
  s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
  s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);

  x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
  x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
  x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
  x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
  x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
  x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
  x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s3 - s7));

  // stage 2: pass the first half through; rotate (x4,x5) and (x6,x7).
  s0 = (int)x0;
  s1 = (int)x1;
  s2 = (int)x2;
  s3 = (int)x3;
  s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
  s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
  s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
  s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);

  x0 = WRAPLOW(s0 + s2);
  x1 = WRAPLOW(s1 + s3);
  x2 = WRAPLOW(s0 - s2);
  x3 = WRAPLOW(s1 - s3);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));

  // stage 3: pi/4 rotations (sum/difference scaled by cospi_16_64).
  s2 = (int)(cospi_16_64 * (x2 + x3));
  s3 = (int)(cospi_16_64 * (x2 - x3));
  s6 = (int)(cospi_16_64 * (x6 + x7));
  s7 = (int)(cospi_16_64 * (x6 - x7));

  x2 = WRAPLOW(dct_const_round_shift(s2));
  x3 = WRAPLOW(dct_const_round_shift(s3));
  x6 = WRAPLOW(dct_const_round_shift(s6));
  x7 = WRAPLOW(dct_const_round_shift(s7));

  // Final output permutation with alternating sign flips.
  output[0] = WRAPLOW(x0);
  output[1] = WRAPLOW(-x4);
  output[2] = WRAPLOW(x6);
  output[3] = WRAPLOW(-x2);
  output[4] = WRAPLOW(x3);
  output[5] = WRAPLOW(-x7);
  output[6] = WRAPLOW(x5);
  output[7] = WRAPLOW(-x1);
}
367
void aom_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  /* 8x8 inverse DCT for the sparse case: all non-zero coefficients lie in
     the top-left 4x4, so only the first four rows need a row transform;
     the remaining rows of the intermediate stay zero. */
  tran_low_t intermediate[8 * 8] = { 0 };
  tran_low_t col_in[8], col_out[8];
  int r, c;

  // Row transforms (rows 4..7 are all-zero and skipped).
  for (r = 0; r < 4; ++r) {
    aom_idct8_c(&input[r * 8], &intermediate[r * 8]);
  }

  // Column transforms and reconstruction.
  for (c = 0; c < 8; ++c) {
    for (r = 0; r < 8; ++r) col_in[r] = intermediate[r * 8 + c];
    aom_idct8_c(col_in, col_out);
    for (r = 0; r < 8; ++r) {
      dest[r * stride + c] = clip_pixel_add(dest[r * stride + c],
                                            ROUND_POWER_OF_TWO(col_out[r], 5));
    }
  }
}
392
void aom_idct16_c(const tran_low_t *input, tran_low_t *output) {
  // 16-point inverse DCT on one row/column: seven butterfly/rotation
  // stages using 14-bit cospi_*_64 constants, dct_const_round_shift after
  // each rotation, and WRAPLOW wrapping after each add/subtract.
  tran_low_t step1[16], step2[16];
  tran_high_t temp1, temp2;

  // stage 1: load inputs in 4-bit bit-reversed order. The indices are
  // written as N/2 (0, 16, 8, ... over 2), which evaluates to
  // 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15.
  step1[0] = input[0 / 2];
  step1[1] = input[16 / 2];
  step1[2] = input[8 / 2];
  step1[3] = input[24 / 2];
  step1[4] = input[4 / 2];
  step1[5] = input[20 / 2];
  step1[6] = input[12 / 2];
  step1[7] = input[28 / 2];
  step1[8] = input[2 / 2];
  step1[9] = input[18 / 2];
  step1[10] = input[10 / 2];
  step1[11] = input[26 / 2];
  step1[12] = input[6 / 2];
  step1[13] = input[22 / 2];
  step1[14] = input[14 / 2];
  step1[15] = input[30 / 2];

  // stage 2: pass the even half through; rotate the odd pairs
  // (8,15), (9,14), (10,13), (11,12).
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
  step2[15] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 3: rotate (4,7) and (5,6); butterfly the odd half in pairs.
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  step1[8] = WRAPLOW(step2[8] + step2[9]);
  step1[9] = WRAPLOW(step2[8] - step2[9]);
  step1[10] = WRAPLOW(-step2[10] + step2[11]);
  step1[11] = WRAPLOW(step2[10] + step2[11]);
  step1[12] = WRAPLOW(step2[12] + step2[13]);
  step1[13] = WRAPLOW(step2[12] - step2[13]);
  step1[14] = WRAPLOW(-step2[14] + step2[15]);
  step1[15] = WRAPLOW(step2[14] + step2[15]);

  // stage 4: even-part rotations (as in the 8-point transform); rotate
  // (9,14) and (10,13) of the odd half.
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5: finish the even 8-point part; butterfly the odd half.
  step1[0] = WRAPLOW(step2[0] + step2[3]);
  step1[1] = WRAPLOW(step2[1] + step2[2]);
  step1[2] = WRAPLOW(step2[1] - step2[2]);
  step1[3] = WRAPLOW(step2[0] - step2[3]);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11]);
  step1[9] = WRAPLOW(step2[9] + step2[10]);
  step1[10] = WRAPLOW(step2[9] - step2[10]);
  step1[11] = WRAPLOW(step2[8] - step2[11]);
  step1[12] = WRAPLOW(-step2[12] + step2[15]);
  step1[13] = WRAPLOW(-step2[13] + step2[14]);
  step1[14] = WRAPLOW(step2[13] + step2[14]);
  step1[15] = WRAPLOW(step2[12] + step2[15]);

  // stage 6: even-half butterfly; pi/4 rotations on (10,13) and (11,12).
  step2[0] = WRAPLOW(step1[0] + step1[7]);
  step2[1] = WRAPLOW(step1[1] + step1[6]);
  step2[2] = WRAPLOW(step1[2] + step1[5]);
  step2[3] = WRAPLOW(step1[3] + step1[4]);
  step2[4] = WRAPLOW(step1[3] - step1[4]);
  step2[5] = WRAPLOW(step1[2] - step1[5]);
  step2[6] = WRAPLOW(step1[1] - step1[6]);
  step2[7] = WRAPLOW(step1[0] - step1[7]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7: final butterfly combining even and odd halves.
  output[0] = WRAPLOW(step2[0] + step2[15]);
  output[1] = WRAPLOW(step2[1] + step2[14]);
  output[2] = WRAPLOW(step2[2] + step2[13]);
  output[3] = WRAPLOW(step2[3] + step2[12]);
  output[4] = WRAPLOW(step2[4] + step2[11]);
  output[5] = WRAPLOW(step2[5] + step2[10]);
  output[6] = WRAPLOW(step2[6] + step2[9]);
  output[7] = WRAPLOW(step2[7] + step2[8]);
  output[8] = WRAPLOW(step2[7] - step2[8]);
  output[9] = WRAPLOW(step2[6] - step2[9]);
  output[10] = WRAPLOW(step2[5] - step2[10]);
  output[11] = WRAPLOW(step2[4] - step2[11]);
  output[12] = WRAPLOW(step2[3] - step2[12]);
  output[13] = WRAPLOW(step2[2] - step2[13]);
  output[14] = WRAPLOW(step2[1] - step2[14]);
  output[15] = WRAPLOW(step2[0] - step2[15]);
}
557
void aom_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  /* Full 16x16 inverse DCT (all 256 coefficients): rows first, then
     columns, rounding by 1/64 before accumulating into the destination. */
  tran_low_t intermediate[16 * 16];
  tran_low_t col_in[16], col_out[16];
  int r, c;

  // Row transforms.
  for (r = 0; r < 16; ++r) {
    aom_idct16_c(&input[r * 16], &intermediate[r * 16]);
  }

  // Column transforms and reconstruction.
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r) col_in[r] = intermediate[r * 16 + c];
    aom_idct16_c(col_in, col_out);
    for (r = 0; r < 16; ++r) {
      dest[r * stride + c] = clip_pixel_add(dest[r * stride + c],
                                            ROUND_POWER_OF_TWO(col_out[r], 6));
    }
  }
}
582
void aom_iadst16_c(const tran_low_t *input, tran_low_t *output) {
  // 16-point inverse ADST. Inputs are consumed in a fixed permutation
  // (odd indices descending interleaved with even ascending); four
  // butterfly/rotation stages use 14-bit cospi_*_64 constants with
  // dct_const_round_shift after each rotation and WRAPLOW wrapping.
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;

  tran_high_t x0 = input[15];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[13];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[11];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[9];
  tran_high_t x7 = input[6];
  tran_high_t x8 = input[7];
  tran_high_t x9 = input[8];
  tran_high_t x10 = input[5];
  tran_high_t x11 = input[10];
  tran_high_t x12 = input[3];
  tran_high_t x13 = input[12];
  tran_high_t x14 = input[1];
  tran_high_t x15 = input[14];

  // All-zero input maps to all-zero output; skip the arithmetic.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
        x13 | x14 | x15)) {
    output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
        output[6] = output[7] = output[8] = output[9] = output[10] =
            output[11] = output[12] = output[13] = output[14] = output[15] = 0;
    return;
  }

  // stage 1: rotate the eight input pairs by the odd angles (1, 5, 9, 13,
  // 17, 21, 25, 29)/64.
  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;

  x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
  x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
  x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
  x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
  x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
  x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
  x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
  x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
  x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
  x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
  x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
  x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
  x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
  x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s7 - s15));

  // stage 2: pass x0..x7 through; rotate the second half by (4, 20)/64
  // angles, two pairs with negated first terms.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

  x0 = WRAPLOW(s0 + s4);
  x1 = WRAPLOW(s1 + s5);
  x2 = WRAPLOW(s2 + s6);
  x3 = WRAPLOW(s3 + s7);
  x4 = WRAPLOW(s0 - s4);
  x5 = WRAPLOW(s1 - s5);
  x6 = WRAPLOW(s2 - s6);
  x7 = WRAPLOW(s3 - s7);
  x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
  x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
  x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
  x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
  x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
  x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
  x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s11 - s15));

  // stage 3: rotate (x4,x5), (x6,x7), (x12,x13), (x14,x15) by the 8/64
  // angle; pass the rest through.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

  x0 = WRAPLOW(s0 + s2);
  x1 = WRAPLOW(s1 + s3);
  x2 = WRAPLOW(s0 - s2);
  x3 = WRAPLOW(s1 - s3);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
  x8 = WRAPLOW(s8 + s10);
  x9 = WRAPLOW(s9 + s11);
  x10 = WRAPLOW(s8 - s10);
  x11 = WRAPLOW(s9 - s11);
  x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
  x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
  x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s13 - s15));

  // stage 4: pi/4 rotations (sum/difference scaled by +/-cospi_16_64).
  s2 = (-cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (-x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (-x10 + x11);
  s14 = (-cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = WRAPLOW(dct_const_round_shift(s2));
  x3 = WRAPLOW(dct_const_round_shift(s3));
  x6 = WRAPLOW(dct_const_round_shift(s6));
  x7 = WRAPLOW(dct_const_round_shift(s7));
  x10 = WRAPLOW(dct_const_round_shift(s10));
  x11 = WRAPLOW(dct_const_round_shift(s11));
  x14 = WRAPLOW(dct_const_round_shift(s14));
  x15 = WRAPLOW(dct_const_round_shift(s15));

  // Final output permutation with selected sign flips.
  output[0] = WRAPLOW(x0);
  output[1] = WRAPLOW(-x8);
  output[2] = WRAPLOW(x12);
  output[3] = WRAPLOW(-x4);
  output[4] = WRAPLOW(x6);
  output[5] = WRAPLOW(x14);
  output[6] = WRAPLOW(x10);
  output[7] = WRAPLOW(x2);
  output[8] = WRAPLOW(x3);
  output[9] = WRAPLOW(x11);
  output[10] = WRAPLOW(x15);
  output[11] = WRAPLOW(x7);
  output[12] = WRAPLOW(x5);
  output[13] = WRAPLOW(-x13);
  output[14] = WRAPLOW(x9);
  output[15] = WRAPLOW(-x1);
}
753
void aom_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  /* 16x16 inverse DCT for the sparse case: all non-zero coefficients lie
     in the top-left 8x8, so only the first eight rows need a row
     transform; the rest of the intermediate stays zero. */
  tran_low_t intermediate[16 * 16] = { 0 };
  tran_low_t col_in[16], col_out[16];
  int r, c;

  // Row transforms (rows 8..15 are all-zero and skipped).
  for (r = 0; r < 8; ++r) {
    aom_idct16_c(&input[r * 16], &intermediate[r * 16]);
  }

  // Column transforms and reconstruction.
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r) col_in[r] = intermediate[r * 16 + c];
    aom_idct16_c(col_in, col_out);
    for (r = 0; r < 16; ++r) {
      dest[r * stride + c] = clip_pixel_add(dest[r * stride + c],
                                            ROUND_POWER_OF_TWO(col_out[r], 6));
    }
  }
}
779
void aom_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  /* 16x16 inverse DCT for the very sparse case: all non-zero coefficients
     lie in the top-left 4x4, so only the first four rows need a row
     transform; the rest of the intermediate stays zero. */
  tran_low_t intermediate[16 * 16] = { 0 };
  tran_low_t col_in[16], col_out[16];
  int r, c;

  // Row transforms (rows 4..15 are all-zero and skipped).
  for (r = 0; r < 4; ++r) {
    aom_idct16_c(&input[r * 16], &intermediate[r * 16]);
  }

  // Column transforms and reconstruction.
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r) col_in[r] = intermediate[r * 16 + c];
    aom_idct16_c(col_in, col_out);
    for (r = 0; r < 16; ++r) {
      dest[r * stride + c] = clip_pixel_add(dest[r * stride + c],
                                            ROUND_POWER_OF_TWO(col_out[r], 6));
    }
  }
}
805
void aom_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest,
                           int stride) {
  /* DC-only 16x16 inverse DCT: scale input[0] through both 1-D passes,
     round by 1/64, and add the constant to every pixel of the block. */
  int r, c;
  tran_high_t offset;
  tran_low_t dc = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
  dc = WRAPLOW(dct_const_round_shift(dc * cospi_16_64));
  offset = ROUND_POWER_OF_TWO(dc, 6);
  if (offset == 0) return;  // zero DC leaves the destination unchanged
  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) dest[c] = clip_pixel_add(dest[c], offset);
    dest += stride;
  }
}
818
aom_idct32_c(const tran_low_t * input,tran_low_t * output)819 void aom_idct32_c(const tran_low_t *input, tran_low_t *output) {
820 tran_low_t step1[32], step2[32];
821 tran_high_t temp1, temp2;
822
823 // stage 1
824 step1[0] = input[0];
825 step1[1] = input[16];
826 step1[2] = input[8];
827 step1[3] = input[24];
828 step1[4] = input[4];
829 step1[5] = input[20];
830 step1[6] = input[12];
831 step1[7] = input[28];
832 step1[8] = input[2];
833 step1[9] = input[18];
834 step1[10] = input[10];
835 step1[11] = input[26];
836 step1[12] = input[6];
837 step1[13] = input[22];
838 step1[14] = input[14];
839 step1[15] = input[30];
840
841 temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
842 temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
843 step1[16] = WRAPLOW(dct_const_round_shift(temp1));
844 step1[31] = WRAPLOW(dct_const_round_shift(temp2));
845
846 temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
847 temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
848 step1[17] = WRAPLOW(dct_const_round_shift(temp1));
849 step1[30] = WRAPLOW(dct_const_round_shift(temp2));
850
851 temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
852 temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
853 step1[18] = WRAPLOW(dct_const_round_shift(temp1));
854 step1[29] = WRAPLOW(dct_const_round_shift(temp2));
855
856 temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
857 temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
858 step1[19] = WRAPLOW(dct_const_round_shift(temp1));
859 step1[28] = WRAPLOW(dct_const_round_shift(temp2));
860
861 temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
862 temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
863 step1[20] = WRAPLOW(dct_const_round_shift(temp1));
864 step1[27] = WRAPLOW(dct_const_round_shift(temp2));
865
866 temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
867 temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
868 step1[21] = WRAPLOW(dct_const_round_shift(temp1));
869 step1[26] = WRAPLOW(dct_const_round_shift(temp2));
870
871 temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
872 temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
873 step1[22] = WRAPLOW(dct_const_round_shift(temp1));
874 step1[25] = WRAPLOW(dct_const_round_shift(temp2));
875
876 temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
877 temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
878 step1[23] = WRAPLOW(dct_const_round_shift(temp1));
879 step1[24] = WRAPLOW(dct_const_round_shift(temp2));
880
881 // stage 2
882 step2[0] = step1[0];
883 step2[1] = step1[1];
884 step2[2] = step1[2];
885 step2[3] = step1[3];
886 step2[4] = step1[4];
887 step2[5] = step1[5];
888 step2[6] = step1[6];
889 step2[7] = step1[7];
890
891 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
892 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
893 step2[8] = WRAPLOW(dct_const_round_shift(temp1));
894 step2[15] = WRAPLOW(dct_const_round_shift(temp2));
895
896 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
897 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
898 step2[9] = WRAPLOW(dct_const_round_shift(temp1));
899 step2[14] = WRAPLOW(dct_const_round_shift(temp2));
900
901 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
902 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
903 step2[10] = WRAPLOW(dct_const_round_shift(temp1));
904 step2[13] = WRAPLOW(dct_const_round_shift(temp2));
905
906 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
907 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
908 step2[11] = WRAPLOW(dct_const_round_shift(temp1));
909 step2[12] = WRAPLOW(dct_const_round_shift(temp2));
910
911 step2[16] = WRAPLOW(step1[16] + step1[17]);
912 step2[17] = WRAPLOW(step1[16] - step1[17]);
913 step2[18] = WRAPLOW(-step1[18] + step1[19]);
914 step2[19] = WRAPLOW(step1[18] + step1[19]);
915 step2[20] = WRAPLOW(step1[20] + step1[21]);
916 step2[21] = WRAPLOW(step1[20] - step1[21]);
917 step2[22] = WRAPLOW(-step1[22] + step1[23]);
918 step2[23] = WRAPLOW(step1[22] + step1[23]);
919 step2[24] = WRAPLOW(step1[24] + step1[25]);
920 step2[25] = WRAPLOW(step1[24] - step1[25]);
921 step2[26] = WRAPLOW(-step1[26] + step1[27]);
922 step2[27] = WRAPLOW(step1[26] + step1[27]);
923 step2[28] = WRAPLOW(step1[28] + step1[29]);
924 step2[29] = WRAPLOW(step1[28] - step1[29]);
925 step2[30] = WRAPLOW(-step1[30] + step1[31]);
926 step2[31] = WRAPLOW(step1[30] + step1[31]);
927
928 // stage 3
929 step1[0] = step2[0];
930 step1[1] = step2[1];
931 step1[2] = step2[2];
932 step1[3] = step2[3];
933
934 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
935 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
936 step1[4] = WRAPLOW(dct_const_round_shift(temp1));
937 step1[7] = WRAPLOW(dct_const_round_shift(temp2));
938 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
939 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
940 step1[5] = WRAPLOW(dct_const_round_shift(temp1));
941 step1[6] = WRAPLOW(dct_const_round_shift(temp2));
942
943 step1[8] = WRAPLOW(step2[8] + step2[9]);
944 step1[9] = WRAPLOW(step2[8] - step2[9]);
945 step1[10] = WRAPLOW(-step2[10] + step2[11]);
946 step1[11] = WRAPLOW(step2[10] + step2[11]);
947 step1[12] = WRAPLOW(step2[12] + step2[13]);
948 step1[13] = WRAPLOW(step2[12] - step2[13]);
949 step1[14] = WRAPLOW(-step2[14] + step2[15]);
950 step1[15] = WRAPLOW(step2[14] + step2[15]);
951
952 step1[16] = step2[16];
953 step1[31] = step2[31];
954 temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
955 temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
956 step1[17] = WRAPLOW(dct_const_round_shift(temp1));
957 step1[30] = WRAPLOW(dct_const_round_shift(temp2));
958 temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
959 temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
960 step1[18] = WRAPLOW(dct_const_round_shift(temp1));
961 step1[29] = WRAPLOW(dct_const_round_shift(temp2));
962 step1[19] = step2[19];
963 step1[20] = step2[20];
964 temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
965 temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
966 step1[21] = WRAPLOW(dct_const_round_shift(temp1));
967 step1[26] = WRAPLOW(dct_const_round_shift(temp2));
968 temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
969 temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
970 step1[22] = WRAPLOW(dct_const_round_shift(temp1));
971 step1[25] = WRAPLOW(dct_const_round_shift(temp2));
972 step1[23] = step2[23];
973 step1[24] = step2[24];
974 step1[27] = step2[27];
975 step1[28] = step2[28];
976
977 // stage 4
978 temp1 = (step1[0] + step1[1]) * cospi_16_64;
979 temp2 = (step1[0] - step1[1]) * cospi_16_64;
980 step2[0] = WRAPLOW(dct_const_round_shift(temp1));
981 step2[1] = WRAPLOW(dct_const_round_shift(temp2));
982 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
983 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
984 step2[2] = WRAPLOW(dct_const_round_shift(temp1));
985 step2[3] = WRAPLOW(dct_const_round_shift(temp2));
986 step2[4] = WRAPLOW(step1[4] + step1[5]);
987 step2[5] = WRAPLOW(step1[4] - step1[5]);
988 step2[6] = WRAPLOW(-step1[6] + step1[7]);
989 step2[7] = WRAPLOW(step1[6] + step1[7]);
990
991 step2[8] = step1[8];
992 step2[15] = step1[15];
993 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
994 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
995 step2[9] = WRAPLOW(dct_const_round_shift(temp1));
996 step2[14] = WRAPLOW(dct_const_round_shift(temp2));
997 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
998 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
999 step2[10] = WRAPLOW(dct_const_round_shift(temp1));
1000 step2[13] = WRAPLOW(dct_const_round_shift(temp2));
1001 step2[11] = step1[11];
1002 step2[12] = step1[12];
1003
1004 step2[16] = WRAPLOW(step1[16] + step1[19]);
1005 step2[17] = WRAPLOW(step1[17] + step1[18]);
1006 step2[18] = WRAPLOW(step1[17] - step1[18]);
1007 step2[19] = WRAPLOW(step1[16] - step1[19]);
1008 step2[20] = WRAPLOW(-step1[20] + step1[23]);
1009 step2[21] = WRAPLOW(-step1[21] + step1[22]);
1010 step2[22] = WRAPLOW(step1[21] + step1[22]);
1011 step2[23] = WRAPLOW(step1[20] + step1[23]);
1012
1013 step2[24] = WRAPLOW(step1[24] + step1[27]);
1014 step2[25] = WRAPLOW(step1[25] + step1[26]);
1015 step2[26] = WRAPLOW(step1[25] - step1[26]);
1016 step2[27] = WRAPLOW(step1[24] - step1[27]);
1017 step2[28] = WRAPLOW(-step1[28] + step1[31]);
1018 step2[29] = WRAPLOW(-step1[29] + step1[30]);
1019 step2[30] = WRAPLOW(step1[29] + step1[30]);
1020 step2[31] = WRAPLOW(step1[28] + step1[31]);
1021
1022 // stage 5
1023 step1[0] = WRAPLOW(step2[0] + step2[3]);
1024 step1[1] = WRAPLOW(step2[1] + step2[2]);
1025 step1[2] = WRAPLOW(step2[1] - step2[2]);
1026 step1[3] = WRAPLOW(step2[0] - step2[3]);
1027 step1[4] = step2[4];
1028 temp1 = (step2[6] - step2[5]) * cospi_16_64;
1029 temp2 = (step2[5] + step2[6]) * cospi_16_64;
1030 step1[5] = WRAPLOW(dct_const_round_shift(temp1));
1031 step1[6] = WRAPLOW(dct_const_round_shift(temp2));
1032 step1[7] = step2[7];
1033
1034 step1[8] = WRAPLOW(step2[8] + step2[11]);
1035 step1[9] = WRAPLOW(step2[9] + step2[10]);
1036 step1[10] = WRAPLOW(step2[9] - step2[10]);
1037 step1[11] = WRAPLOW(step2[8] - step2[11]);
1038 step1[12] = WRAPLOW(-step2[12] + step2[15]);
1039 step1[13] = WRAPLOW(-step2[13] + step2[14]);
1040 step1[14] = WRAPLOW(step2[13] + step2[14]);
1041 step1[15] = WRAPLOW(step2[12] + step2[15]);
1042
1043 step1[16] = step2[16];
1044 step1[17] = step2[17];
1045 temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
1046 temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
1047 step1[18] = WRAPLOW(dct_const_round_shift(temp1));
1048 step1[29] = WRAPLOW(dct_const_round_shift(temp2));
1049 temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
1050 temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
1051 step1[19] = WRAPLOW(dct_const_round_shift(temp1));
1052 step1[28] = WRAPLOW(dct_const_round_shift(temp2));
1053 temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
1054 temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
1055 step1[20] = WRAPLOW(dct_const_round_shift(temp1));
1056 step1[27] = WRAPLOW(dct_const_round_shift(temp2));
1057 temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
1058 temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
1059 step1[21] = WRAPLOW(dct_const_round_shift(temp1));
1060 step1[26] = WRAPLOW(dct_const_round_shift(temp2));
1061 step1[22] = step2[22];
1062 step1[23] = step2[23];
1063 step1[24] = step2[24];
1064 step1[25] = step2[25];
1065 step1[30] = step2[30];
1066 step1[31] = step2[31];
1067
1068 // stage 6
1069 step2[0] = WRAPLOW(step1[0] + step1[7]);
1070 step2[1] = WRAPLOW(step1[1] + step1[6]);
1071 step2[2] = WRAPLOW(step1[2] + step1[5]);
1072 step2[3] = WRAPLOW(step1[3] + step1[4]);
1073 step2[4] = WRAPLOW(step1[3] - step1[4]);
1074 step2[5] = WRAPLOW(step1[2] - step1[5]);
1075 step2[6] = WRAPLOW(step1[1] - step1[6]);
1076 step2[7] = WRAPLOW(step1[0] - step1[7]);
1077 step2[8] = step1[8];
1078 step2[9] = step1[9];
1079 temp1 = (-step1[10] + step1[13]) * cospi_16_64;
1080 temp2 = (step1[10] + step1[13]) * cospi_16_64;
1081 step2[10] = WRAPLOW(dct_const_round_shift(temp1));
1082 step2[13] = WRAPLOW(dct_const_round_shift(temp2));
1083 temp1 = (-step1[11] + step1[12]) * cospi_16_64;
1084 temp2 = (step1[11] + step1[12]) * cospi_16_64;
1085 step2[11] = WRAPLOW(dct_const_round_shift(temp1));
1086 step2[12] = WRAPLOW(dct_const_round_shift(temp2));
1087 step2[14] = step1[14];
1088 step2[15] = step1[15];
1089
1090 step2[16] = WRAPLOW(step1[16] + step1[23]);
1091 step2[17] = WRAPLOW(step1[17] + step1[22]);
1092 step2[18] = WRAPLOW(step1[18] + step1[21]);
1093 step2[19] = WRAPLOW(step1[19] + step1[20]);
1094 step2[20] = WRAPLOW(step1[19] - step1[20]);
1095 step2[21] = WRAPLOW(step1[18] - step1[21]);
1096 step2[22] = WRAPLOW(step1[17] - step1[22]);
1097 step2[23] = WRAPLOW(step1[16] - step1[23]);
1098
1099 step2[24] = WRAPLOW(-step1[24] + step1[31]);
1100 step2[25] = WRAPLOW(-step1[25] + step1[30]);
1101 step2[26] = WRAPLOW(-step1[26] + step1[29]);
1102 step2[27] = WRAPLOW(-step1[27] + step1[28]);
1103 step2[28] = WRAPLOW(step1[27] + step1[28]);
1104 step2[29] = WRAPLOW(step1[26] + step1[29]);
1105 step2[30] = WRAPLOW(step1[25] + step1[30]);
1106 step2[31] = WRAPLOW(step1[24] + step1[31]);
1107
1108 // stage 7
1109 step1[0] = WRAPLOW(step2[0] + step2[15]);
1110 step1[1] = WRAPLOW(step2[1] + step2[14]);
1111 step1[2] = WRAPLOW(step2[2] + step2[13]);
1112 step1[3] = WRAPLOW(step2[3] + step2[12]);
1113 step1[4] = WRAPLOW(step2[4] + step2[11]);
1114 step1[5] = WRAPLOW(step2[5] + step2[10]);
1115 step1[6] = WRAPLOW(step2[6] + step2[9]);
1116 step1[7] = WRAPLOW(step2[7] + step2[8]);
1117 step1[8] = WRAPLOW(step2[7] - step2[8]);
1118 step1[9] = WRAPLOW(step2[6] - step2[9]);
1119 step1[10] = WRAPLOW(step2[5] - step2[10]);
1120 step1[11] = WRAPLOW(step2[4] - step2[11]);
1121 step1[12] = WRAPLOW(step2[3] - step2[12]);
1122 step1[13] = WRAPLOW(step2[2] - step2[13]);
1123 step1[14] = WRAPLOW(step2[1] - step2[14]);
1124 step1[15] = WRAPLOW(step2[0] - step2[15]);
1125
1126 step1[16] = step2[16];
1127 step1[17] = step2[17];
1128 step1[18] = step2[18];
1129 step1[19] = step2[19];
1130 temp1 = (-step2[20] + step2[27]) * cospi_16_64;
1131 temp2 = (step2[20] + step2[27]) * cospi_16_64;
1132 step1[20] = WRAPLOW(dct_const_round_shift(temp1));
1133 step1[27] = WRAPLOW(dct_const_round_shift(temp2));
1134 temp1 = (-step2[21] + step2[26]) * cospi_16_64;
1135 temp2 = (step2[21] + step2[26]) * cospi_16_64;
1136 step1[21] = WRAPLOW(dct_const_round_shift(temp1));
1137 step1[26] = WRAPLOW(dct_const_round_shift(temp2));
1138 temp1 = (-step2[22] + step2[25]) * cospi_16_64;
1139 temp2 = (step2[22] + step2[25]) * cospi_16_64;
1140 step1[22] = WRAPLOW(dct_const_round_shift(temp1));
1141 step1[25] = WRAPLOW(dct_const_round_shift(temp2));
1142 temp1 = (-step2[23] + step2[24]) * cospi_16_64;
1143 temp2 = (step2[23] + step2[24]) * cospi_16_64;
1144 step1[23] = WRAPLOW(dct_const_round_shift(temp1));
1145 step1[24] = WRAPLOW(dct_const_round_shift(temp2));
1146 step1[28] = step2[28];
1147 step1[29] = step2[29];
1148 step1[30] = step2[30];
1149 step1[31] = step2[31];
1150
1151 // final stage
1152 output[0] = WRAPLOW(step1[0] + step1[31]);
1153 output[1] = WRAPLOW(step1[1] + step1[30]);
1154 output[2] = WRAPLOW(step1[2] + step1[29]);
1155 output[3] = WRAPLOW(step1[3] + step1[28]);
1156 output[4] = WRAPLOW(step1[4] + step1[27]);
1157 output[5] = WRAPLOW(step1[5] + step1[26]);
1158 output[6] = WRAPLOW(step1[6] + step1[25]);
1159 output[7] = WRAPLOW(step1[7] + step1[24]);
1160 output[8] = WRAPLOW(step1[8] + step1[23]);
1161 output[9] = WRAPLOW(step1[9] + step1[22]);
1162 output[10] = WRAPLOW(step1[10] + step1[21]);
1163 output[11] = WRAPLOW(step1[11] + step1[20]);
1164 output[12] = WRAPLOW(step1[12] + step1[19]);
1165 output[13] = WRAPLOW(step1[13] + step1[18]);
1166 output[14] = WRAPLOW(step1[14] + step1[17]);
1167 output[15] = WRAPLOW(step1[15] + step1[16]);
1168 output[16] = WRAPLOW(step1[15] - step1[16]);
1169 output[17] = WRAPLOW(step1[14] - step1[17]);
1170 output[18] = WRAPLOW(step1[13] - step1[18]);
1171 output[19] = WRAPLOW(step1[12] - step1[19]);
1172 output[20] = WRAPLOW(step1[11] - step1[20]);
1173 output[21] = WRAPLOW(step1[10] - step1[21]);
1174 output[22] = WRAPLOW(step1[9] - step1[22]);
1175 output[23] = WRAPLOW(step1[8] - step1[23]);
1176 output[24] = WRAPLOW(step1[7] - step1[24]);
1177 output[25] = WRAPLOW(step1[6] - step1[25]);
1178 output[26] = WRAPLOW(step1[5] - step1[26]);
1179 output[27] = WRAPLOW(step1[4] - step1[27]);
1180 output[28] = WRAPLOW(step1[3] - step1[28]);
1181 output[29] = WRAPLOW(step1[2] - step1[29]);
1182 output[30] = WRAPLOW(step1[1] - step1[30]);
1183 output[31] = WRAPLOW(step1[0] - step1[31]);
1184 }
1185
1186 #if CONFIG_MRC_TX
// 32x32 inverse DCT + masked reconstruction (MRC-TX, full 1024-coefficient
// case).  Applies a row pass then a column pass of the 32-point inverse DCT,
// rounds the result to pixel precision, and adds it to `dest` — but only at
// positions where `mask` is non-zero; masked-off pixels are left untouched.
//   input  : 32x32 dequantized coefficients, row-major.
//   dest   : reconstruction buffer, `stride` bytes per row (updated in place).
//   mask   : 32x32 per-pixel gate, row-major.
void aom_imrc32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
                              int stride, uint8_t *mask) {
  tran_low_t out[32 * 32];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[32], temp_out[32];

  // Rows
  for (i = 0; i < 32; ++i) {
    // OR-reduce the 32 row coefficients to detect an all-zero row without a
    // branch per coefficient.  The accumulator is tran_low_t: the previous
    // int16_t version truncated each OR result on assignment, so a
    // coefficient with only bits >= 16 set could be misread as zero and a
    // non-empty row skipped.
    tran_low_t zero_coeff[16];
    for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
    for (j = 0; j < 8; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    for (j = 0; j < 4; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    for (j = 0; j < 2; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];

    if (zero_coeff[0] | zero_coeff[1])
      aom_idct32_c(input, outptr);
    else
      memset(outptr, 0, sizeof(tran_low_t) * 32);  // zero row: skip the IDCT
    input += 32;
    outptr += 32;
  }

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
    aom_idct32_c(temp_in, temp_out);
    for (j = 0; j < 32; ++j) {
      // Only add the coefficient if the mask value is 1
      int mask_val = mask[j * 32 + i];
      dest[j * stride + i] =
          mask_val ? clip_pixel_add(dest[j * stride + i],
                                    ROUND_POWER_OF_TWO(temp_out[j], 6))
                   : dest[j * stride + i];
    }
  }
}
1227
// 32x32 masked inverse DCT, "135" variant: the caller guarantees that only
// the upper-left 16x16 quadrant of `input` can hold non-zero coefficients,
// so only the first 16 row transforms need to run; the rest of the
// intermediate buffer stays zero from its initializer.  A pixel is updated
// only where the corresponding `mask` entry is non-zero.
void aom_imrc32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                             uint8_t *mask) {
  tran_low_t intermediate[32 * 32] = { 0 };
  tran_low_t col_in[32], col_out[32];
  const tran_low_t *src = input;
  tran_low_t *row_dst = intermediate;
  int row, col;

  // Row pass over the 16 potentially non-zero rows.
  for (row = 0; row < 16; ++row) {
    aom_idct32_c(src, row_dst);
    src += 32;
    row_dst += 32;
  }

  // Column pass plus masked add-back into the reconstruction buffer.
  for (col = 0; col < 32; ++col) {
    for (row = 0; row < 32; ++row) col_in[row] = intermediate[row * 32 + col];
    aom_idct32_c(col_in, col_out);
    for (row = 0; row < 32; ++row) {
      // Only touch pixels whose mask value is non-zero.
      if (mask[row * 32 + col]) {
        dest[row * stride + col] = clip_pixel_add(
            dest[row * stride + col], ROUND_POWER_OF_TWO(col_out[row], 6));
      }
    }
  }
}
1257
// 32x32 masked inverse DCT, "34" variant: the caller guarantees that only
// the upper-left 8x8 quadrant of `input` can hold non-zero coefficients, so
// only the first 8 row transforms are performed; the remaining rows of the
// intermediate buffer stay zero from the initializer.  A pixel is updated
// only where the corresponding `mask` entry is non-zero.
void aom_imrc32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                            uint8_t *mask) {
  tran_low_t intermediate[32 * 32] = { 0 };
  tran_low_t col_in[32], col_out[32];
  const tran_low_t *src = input;
  tran_low_t *row_dst = intermediate;
  int row, col;

  // Row pass over the 8 potentially non-zero rows.
  for (row = 0; row < 8; ++row) {
    aom_idct32_c(src, row_dst);
    src += 32;
    row_dst += 32;
  }

  // Column pass plus masked add-back into the reconstruction buffer.
  for (col = 0; col < 32; ++col) {
    for (row = 0; row < 32; ++row) col_in[row] = intermediate[row * 32 + col];
    aom_idct32_c(col_in, col_out);
    for (row = 0; row < 32; ++row) {
      // Only touch pixels whose mask value is non-zero.
      if (mask[row * 32 + col]) {
        dest[row * stride + col] = clip_pixel_add(
            dest[row * stride + col], ROUND_POWER_OF_TWO(col_out[row], 6));
      }
    }
  }
}
1287 #endif // CONFIG_MRC_TX
1288
// Full 32x32 inverse DCT and reconstruction: row pass, column pass, then a
// 1/64 rounding shift before clip-adding into `dest`.
//   input  : 32x32 dequantized coefficients, row-major.
//   dest   : reconstruction buffer, `stride` bytes per row (updated in place).
void aom_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  tran_low_t out[32 * 32];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[32], temp_out[32];

  // Rows
  for (i = 0; i < 32; ++i) {
    // OR-reduce the 32 row coefficients to detect an all-zero row without a
    // branch per coefficient.  The accumulator is tran_low_t: the previous
    // int16_t version truncated each OR result on assignment, so a
    // coefficient with only bits >= 16 set could be misread as zero and a
    // non-empty row skipped.
    tran_low_t zero_coeff[16];
    for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
    for (j = 0; j < 8; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    for (j = 0; j < 4; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    for (j = 0; j < 2; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];

    if (zero_coeff[0] | zero_coeff[1])
      aom_idct32_c(input, outptr);
    else
      memset(outptr, 0, sizeof(tran_low_t) * 32);  // zero row: skip the IDCT
    input += 32;
    outptr += 32;
  }

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
    aom_idct32_c(temp_in, temp_out);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}
1325
// 32x32 inverse DCT, "135" variant: the caller guarantees that only the
// upper-left 16x16 quadrant of `input` can hold non-zero coefficients, so
// only the first 16 row transforms need to run; the remaining rows of the
// intermediate buffer stay zero from the initializer.
void aom_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  tran_low_t intermediate[32 * 32] = { 0 };
  tran_low_t col_in[32], col_out[32];
  const tran_low_t *src = input;
  tran_low_t *row_dst = intermediate;
  int row, col;

  // Row pass over the 16 potentially non-zero rows.
  for (row = 0; row < 16; ++row) {
    aom_idct32_c(src, row_dst);
    src += 32;
    row_dst += 32;
  }

  // Column pass, then clip-add the 1/64-rounded result into `dest`.
  for (col = 0; col < 32; ++col) {
    for (row = 0; row < 32; ++row) col_in[row] = intermediate[row * 32 + col];
    aom_idct32_c(col_in, col_out);
    for (row = 0; row < 32; ++row) {
      dest[row * stride + col] = clip_pixel_add(
          dest[row * stride + col], ROUND_POWER_OF_TWO(col_out[row], 6));
    }
  }
}
1351
// 32x32 inverse DCT, "34" variant: the caller guarantees that only the
// upper-left 8x8 quadrant of `input` can hold non-zero coefficients, so only
// the first 8 row transforms are performed; the remaining rows of the
// intermediate buffer stay zero from the initializer.
void aom_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  tran_low_t intermediate[32 * 32] = { 0 };
  tran_low_t col_in[32], col_out[32];
  const tran_low_t *src = input;
  tran_low_t *row_dst = intermediate;
  int row, col;

  // Row pass over the 8 potentially non-zero rows.
  for (row = 0; row < 8; ++row) {
    aom_idct32_c(src, row_dst);
    src += 32;
    row_dst += 32;
  }

  // Column pass, then clip-add the 1/64-rounded result into `dest`.
  for (col = 0; col < 32; ++col) {
    for (row = 0; row < 32; ++row) col_in[row] = intermediate[row * 32 + col];
    aom_idct32_c(col_in, col_out);
    for (row = 0; row < 32; ++row) {
      dest[row * stride + col] = clip_pixel_add(
          dest[row * stride + col], ROUND_POWER_OF_TWO(col_out[row], 6));
    }
  }
}
1377
// DC-only 32x32 path: with a single non-zero (DC) coefficient, both
// transform passes collapse to scaling by cospi_16_64, leaving one constant
// offset that is clip-added to every pixel of the 32x32 block.
void aom_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int r, c;
  tran_high_t offset;
  tran_low_t dc = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));

  dc = WRAPLOW(dct_const_round_shift(dc * cospi_16_64));
  offset = ROUND_POWER_OF_TWO(dc, 6);  // final 1/64 rounding shift
  if (offset == 0) return;             // nothing to add, leave dest untouched

  for (r = 0; r < 32; ++r) {
    for (c = 0; c < 32; ++c) dest[c] = clip_pixel_add(dest[c], offset);
    dest += stride;
  }
}
1392
void aom_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard (high bit-depth
     variant) in 3.5 adds, 0.5 shifts per pixel.  Pass 1 transforms the rows
     into an intermediate buffer; pass 2 transforms its columns and clip-adds
     the result into the 16-bit destination. */
  int n;
  tran_low_t buffer[16];
  tran_high_t a, b, c, d, mid;
  const tran_low_t *src = input;
  tran_low_t *row_out = buffer;
  uint16_t *dst = CONVERT_TO_SHORTPTR(dest8);

  /* Pass 1: rows.  Note the deliberately permuted load order (0,1,2,3 ->
     a,c,d,b) required by the butterfly below. */
  for (n = 0; n < 4; n++) {
    a = src[0] >> UNIT_QUANT_SHIFT;
    c = src[1] >> UNIT_QUANT_SHIFT;
    d = src[2] >> UNIT_QUANT_SHIFT;
    b = src[3] >> UNIT_QUANT_SHIFT;
    a += c;
    d -= b;
    mid = (a - d) >> 1;
    b = mid - b;
    c = mid - c;
    a -= b;
    d += c;
    row_out[0] = HIGHBD_WRAPLOW(a, bd);
    row_out[1] = HIGHBD_WRAPLOW(b, bd);
    row_out[2] = HIGHBD_WRAPLOW(c, bd);
    row_out[3] = HIGHBD_WRAPLOW(d, bd);
    src += 4;
    row_out += 4;
  }

  /* Pass 2: columns of the intermediate buffer, added into the picture. */
  src = buffer;
  for (n = 0; n < 4; n++) {
    a = src[0];
    c = src[4];
    d = src[8];
    b = src[12];
    a += c;
    d -= b;
    mid = (a - d) >> 1;
    b = mid - b;
    c = mid - c;
    a -= b;
    d += c;
    dst[stride * 0] =
        highbd_clip_pixel_add(dst[stride * 0], HIGHBD_WRAPLOW(a, bd), bd);
    dst[stride * 1] =
        highbd_clip_pixel_add(dst[stride * 1], HIGHBD_WRAPLOW(b, bd), bd);
    dst[stride * 2] =
        highbd_clip_pixel_add(dst[stride * 2], HIGHBD_WRAPLOW(c, bd), bd);
    dst[stride * 3] =
        highbd_clip_pixel_add(dst[stride * 3], HIGHBD_WRAPLOW(d, bd), bd);

    src++;
    dst++;
  }
}
1450
// DC-only high bit-depth 4x4 inverse Walsh-Hadamard: only in[0] is assumed
// non-zero, so the row pass reduces to splitting the DC value into a1/e1 and
// the column pass replays the same split per column before clip-adding into
// the 16-bit destination.
// Fix: dropped the stale `(void)bd;` suppression cast — `bd` is genuinely
// used by HIGHBD_WRAPLOW and highbd_clip_pixel_add below.
void aom_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
                                int dest_stride, int bd) {
  int i;
  tran_high_t a1, e1;
  tran_low_t tmp[4];
  const tran_low_t *ip = in;
  tran_low_t *op = tmp;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Row pass on the DC term only.
  a1 = ip[0] >> UNIT_QUANT_SHIFT;
  e1 = a1 >> 1;
  a1 -= e1;
  op[0] = HIGHBD_WRAPLOW(a1, bd);
  op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd);

  // Column pass: split each intermediate value and add into the picture.
  ip = tmp;
  for (i = 0; i < 4; i++) {
    e1 = ip[0] >> 1;
    a1 = ip[0] - e1;
    dest[dest_stride * 0] =
        highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd);
    dest[dest_stride * 1] =
        highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd);
    dest[dest_stride * 2] =
        highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd);
    dest[dest_stride * 3] =
        highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd);
    ip++;
    dest++;
  }
}
1483