/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <math.h>

#include "./aom_dsp_rtcd.h"
#include "./av1_rtcd.h"
#include "aom_dsp/inv_txfm.h"
#include "aom_ports/mem.h"
#include "av1/common/av1_inv_txfm1d_cfg.h"
#include "av1/common/blockd.h"
#include "av1/common/enums.h"
#include "av1/common/idct.h"
#if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8 || CONFIG_DAALA_DCT16 || \
    CONFIG_DAALA_DCT32 || CONFIG_DAALA_DCT64
#include "av1/common/daala_tx.h"
#endif

int av1_get_tx_scale(const TX_SIZE tx_size) {
  const int pels = tx_size_2d[tx_size];
  return (pels > 256) + (pels > 1024) + (pels > 4096);
}
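
// Worked example, using the tx_size_2d pel counts: TX_16X16 has 256 pels,
// so the scale is 0; TX_32X32 (1024 pels) gives 1; TX_64X64 (4096 pels)
// gives 2. Each unit of scale is one extra right-shift applied to the
// larger transforms to keep intermediate values in range.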

// NOTE: The implementations of all the inverses need to be aware of the
// fact that input and output could be the same buffer.

#if CONFIG_EXT_TX
static void iidtx4_c(const tran_low_t *input, tran_low_t *output) {
  int i;
  for (i = 0; i < 4; ++i) {
    output[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2);
  }
}
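
// Here Sqrt2 is sqrt(2) in the fixed-point scale consumed by
// dct_const_round_shift (approximately sqrt(2) * 2^DCT_CONST_BITS), so the
// statement above computes round(input[i] * sqrt(2)); e.g. an input of 100
// becomes 141.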

static void iidtx8_c(const tran_low_t *input, tran_low_t *output) {
  int i;
  for (i = 0; i < 8; ++i) {
    output[i] = input[i] * 2;
  }
}

static void iidtx16_c(const tran_low_t *input, tran_low_t *output) {
  int i;
  for (i = 0; i < 16; ++i) {
    output[i] = (tran_low_t)dct_const_round_shift(input[i] * 2 * Sqrt2);
  }
}

static void iidtx32_c(const tran_low_t *input, tran_low_t *output) {
  int i;
  for (i = 0; i < 32; ++i) {
    output[i] = input[i] * 4;
  }
}

#if CONFIG_TX64X64 && !CONFIG_DAALA_DCT64
static void iidtx64_c(const tran_low_t *input, tran_low_t *output) {
  int i;
  for (i = 0; i < 64; ++i) {
    output[i] = (tran_low_t)dct_const_round_shift(input[i] * 4 * Sqrt2);
  }
}
#endif  // CONFIG_TX64X64 && !CONFIG_DAALA_DCT64
#endif  // CONFIG_EXT_TX

// For use in lieu of ADST
static void ihalfright32_c(const tran_low_t *input, tran_low_t *output) {
  int i;
  tran_low_t inputhalf[16];
  // Multiply input by sqrt(2)
  for (i = 0; i < 16; ++i) {
    inputhalf[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2);
  }
  for (i = 0; i < 16; ++i) {
    output[i] = input[16 + i] * 4;
  }
  aom_idct16_c(inputhalf, output + 16);
  // Note overall scaling factor is 4 times orthogonal
}

#if CONFIG_TX64X64 && !CONFIG_DAALA_DCT64
static void idct64_col_c(const tran_low_t *input, tran_low_t *output) {
  int32_t in[64], out[64];
  int i;
  for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
  av1_idct64_new(in, out, inv_cos_bit_col_dct_64, inv_stage_range_col_dct_64);
  for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
}

static void idct64_row_c(const tran_low_t *input, tran_low_t *output) {
  int32_t in[64], out[64];
  int i;
  for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
  av1_idct64_new(in, out, inv_cos_bit_row_dct_64, inv_stage_range_row_dct_64);
  for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
}

// For use in lieu of ADST
static void ihalfright64_c(const tran_low_t *input, tran_low_t *output) {
  int i;
  tran_low_t inputhalf[32];
  // Multiply input by sqrt(2)
  for (i = 0; i < 32; ++i) {
    inputhalf[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2);
  }
  for (i = 0; i < 32; ++i) {
    output[i] = (tran_low_t)dct_const_round_shift(input[32 + i] * 4 * Sqrt2);
  }
  aom_idct32_c(inputhalf, output + 32);
  // Note overall scaling factor is 4 * sqrt(2) times orthogonal
}
#endif  // CONFIG_TX64X64 && !CONFIG_DAALA_DCT64

// Inverse identity transform and add.
#if CONFIG_EXT_TX
static void inv_idtx_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                           int bsx, int bsy, TX_TYPE tx_type) {
  int r, c;
  const int pels = bsx * bsy;
  const int shift = 3 - ((pels > 256) + (pels > 1024));
  if (tx_type == IDTX) {
    for (r = 0; r < bsy; ++r) {
      for (c = 0; c < bsx; ++c)
        dest[c] = clip_pixel_add(dest[c], input[c] >> shift);
      dest += stride;
      input += bsx;
    }
  }
}
#endif  // CONFIG_EXT_TX

#define FLIPUD_PTR(dest, stride, size)       \
  do {                                       \
    (dest) = (dest) + ((size)-1) * (stride); \
    (stride) = -(stride);                    \
  } while (0)
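
// Usage sketch: given a block of sizey rows at dest with row pitch stride,
// FLIPUD_PTR(dest, stride, sizey) repoints dest at the last row and negates
// the stride, so dest[r * stride + c] afterwards addresses original row
// (sizey - 1 - r): accesses through the flipped pointer walk bottom-up.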

#if CONFIG_EXT_TX
static void maybe_flip_strides(uint8_t **dst, int *dstride, tran_low_t **src,
                               int *sstride, TX_TYPE tx_type, int sizey,
                               int sizex) {
  // Note that the transpose of src will be added to dst. In order to LR
  // flip the addends (in dst coordinates), we UD flip the src. To UD flip
  // the addends, we UD flip the dst.
  switch (tx_type) {
    case DCT_DCT:
    case ADST_DCT:
    case DCT_ADST:
    case ADST_ADST:
    case IDTX:
    case V_DCT:
    case H_DCT:
    case V_ADST:
    case H_ADST: break;
    case FLIPADST_DCT:
    case FLIPADST_ADST:
    case V_FLIPADST:
      // flip UD
      FLIPUD_PTR(*dst, *dstride, sizey);
      break;
    case DCT_FLIPADST:
    case ADST_FLIPADST:
    case H_FLIPADST:
      // flip LR
      FLIPUD_PTR(*src, *sstride, sizex);
      break;
    case FLIPADST_FLIPADST:
      // flip UD
      FLIPUD_PTR(*dst, *dstride, sizey);
      // flip LR
      FLIPUD_PTR(*src, *sstride, sizex);
      break;
    default: assert(0); break;
  }
}
#endif  // CONFIG_EXT_TX
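
// Example: DCT_FLIPADST needs its addends flipped LR in dst coordinates;
// since src holds the transposed result, that LR flip is realized by
// UD-flipping src, exactly as the note inside maybe_flip_strides describes.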

#if CONFIG_HIGHBITDEPTH
#if CONFIG_EXT_TX && CONFIG_TX64X64
static void highbd_inv_idtx_add_c(const tran_low_t *input, uint8_t *dest8,
                                  int stride, int bsx, int bsy, TX_TYPE tx_type,
                                  int bd) {
  int r, c;
  const int pels = bsx * bsy;
  const int shift = 3 - ((pels > 256) + (pels > 1024));
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  if (tx_type == IDTX) {
    for (r = 0; r < bsy; ++r) {
      for (c = 0; c < bsx; ++c)
        dest[c] = highbd_clip_pixel_add(dest[c], input[c] >> shift, bd);
      dest += stride;
      input += bsx;
    }
  }
}
#endif  // CONFIG_EXT_TX && CONFIG_TX64X64
#endif  // CONFIG_HIGHBITDEPTH

#if CONFIG_LGT || CONFIG_LGT_FROM_PRED
void ilgt4(const tran_low_t *input, tran_low_t *output,
           const tran_high_t *lgtmtx) {
  assert(lgtmtx);
#if CONFIG_LGT_FROM_PRED
  // For DCT/ADST, use butterfly implementations
  if (lgtmtx[0] == DCT4) {
    aom_idct4_c(input, output);
    return;
  } else if (lgtmtx[0] == ADST4) {
    aom_iadst4_c(input, output);
    return;
  }
#endif  // CONFIG_LGT_FROM_PRED

  // evaluate s[j] = sum of lgtmtx[i * 4 + j] * input[i] over i = 0,...,3
  tran_high_t s[4] = { 0 };
  for (int i = 0; i < 4; ++i)
    for (int j = 0; j < 4; ++j) s[j] += lgtmtx[i * 4 + j] * input[i];

  for (int i = 0; i < 4; ++i) output[i] = WRAPLOW(dct_const_round_shift(s[i]));
}
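
// The 4x4 matrix is stored row-major, so with M[r][c] = lgtmtx[r * 4 + c]
// the loop above computes s = M^T * input, i.e. the transpose of the
// stored matrix is applied to the input vector.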

void ilgt8(const tran_low_t *input, tran_low_t *output,
           const tran_high_t *lgtmtx) {
  assert(lgtmtx);
#if CONFIG_LGT_FROM_PRED
  // For DCT/ADST, use butterfly implementations
  if (lgtmtx[0] == DCT8) {
    aom_idct8_c(input, output);
    return;
  } else if (lgtmtx[0] == ADST8) {
    aom_iadst8_c(input, output);
    return;
  }
#endif  // CONFIG_LGT_FROM_PRED

  // evaluate s[j] = sum of lgtmtx[i * 8 + j] * input[i] over i = 0,...,7
  tran_high_t s[8] = { 0 };
  for (int i = 0; i < 8; ++i)
    for (int j = 0; j < 8; ++j) s[j] += lgtmtx[i * 8 + j] * input[i];

  for (int i = 0; i < 8; ++i) output[i] = WRAPLOW(dct_const_round_shift(s[i]));
}
#endif  // CONFIG_LGT || CONFIG_LGT_FROM_PRED

#if CONFIG_LGT
// get_lgt4 and get_lgt8 return 1 and pick an LGT matrix if LGT is chosen to
// apply; otherwise they return 0.
int get_lgt4(const TxfmParam *txfm_param, int is_col,
             const tran_high_t **lgtmtx) {
  if (is_col && (vtx_tab[txfm_param->tx_type] == ADST_1D ||
                 vtx_tab[txfm_param->tx_type] == FLIPADST_1D)) {
    lgtmtx[0] = txfm_param->is_inter ? &lgt4_170[0][0] : &lgt4_140[0][0];
    return 1;
  } else if (!is_col && (htx_tab[txfm_param->tx_type] == ADST_1D ||
                         htx_tab[txfm_param->tx_type] == FLIPADST_1D)) {
    lgtmtx[0] = txfm_param->is_inter ? &lgt4_170[0][0] : &lgt4_140[0][0];
    return 1;
  }
  lgtmtx[0] = NULL;
  return 0;
}

int get_lgt8(const TxfmParam *txfm_param, int is_col,
             const tran_high_t **lgtmtx) {
  if (is_col && (vtx_tab[txfm_param->tx_type] == ADST_1D ||
                 vtx_tab[txfm_param->tx_type] == FLIPADST_1D)) {
    lgtmtx[0] = txfm_param->is_inter ? &lgt8_170[0][0] : &lgt8_150[0][0];
    return 1;
  } else if (!is_col && (htx_tab[txfm_param->tx_type] == ADST_1D ||
                         htx_tab[txfm_param->tx_type] == FLIPADST_1D)) {
    lgtmtx[0] = txfm_param->is_inter ? &lgt8_170[0][0] : &lgt8_150[0][0];
    return 1;
  }
  lgtmtx[0] = NULL;
  return 0;
}
#endif  // CONFIG_LGT

#if CONFIG_LGT_FROM_PRED
void ilgt16up(const tran_low_t *input, tran_low_t *output,
              const tran_high_t *lgtmtx) {
  if (lgtmtx[0] == DCT16) {
    aom_idct16_c(input, output);
    return;
  } else if (lgtmtx[0] == ADST16) {
    aom_iadst16_c(input, output);
    return;
  } else if (lgtmtx[0] == DCT32) {
    aom_idct32_c(input, output);
    return;
  } else if (lgtmtx[0] == ADST32) {
    ihalfright32_c(input, output);
    return;
  } else {
    assert(0);
  }
}

void get_discontinuity_1d(uint8_t *arr, int n, int *idx_max_diff) {
  *idx_max_diff = -1;

  int temp = 0, max_diff = 0, min_diff = INT_MAX;
  for (int i = 1; i < n; ++i) {
    temp = abs(arr[i] - arr[i - 1]);
    if (temp > max_diff) {
      max_diff = temp;
      *idx_max_diff = i;
    }
    if (temp < min_diff) min_diff = temp;
  }
}
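
// Note: get_discontinuity_1d() reports the index i that maximizes
// |arr[i] - arr[i - 1]| (the break point), or -1 when all adjacent
// differences are zero; min_diff is computed but currently unused.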

void get_discontinuity_2d(uint8_t *dst, int stride, int n, int is_col,
                          int *idx_max_diff, int ntx) {
  *idx_max_diff = -1;

  int diff = 0, temp = 0, max_diff = 0, min_diff = INT_MAX;
  for (int i = 1; i < n; ++i) {
    temp = 0;
    for (int j = 0; j < ntx; ++j) {
      if (is_col)  // vertical diff
        diff = dst[i * stride + j] - dst[(i - 1) * stride + j];
      else  // horizontal diff
        diff = dst[j * stride + i] - dst[j * stride + i - 1];
      temp += diff * diff;
    }
    // temp / ntx is the i-th average square diff
    if (temp > max_diff) {
      max_diff = temp;
      *idx_max_diff = i;
    }
    if (temp < min_diff) min_diff = temp;
  }
}
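
// Note: get_discontinuity_2d() scores each position i by the sum of squared
// adjacent differences across the ntx parallel rows or columns; as above,
// min_diff is currently unused.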

int idx_selfloop_wrt_mode(PREDICTION_MODE mode, int is_col) {
  // 0: no self-loop
  // 1: small self-loop
  // 2: medium self-loop
  // 3: large self-loop
  switch (mode) {
    case DC_PRED:
    case SMOOTH_PRED:
      // prediction is good for both directions: large SLs for row and col
      return 3;
    case TM_PRED: return 0;
#if CONFIG_SMOOTH_HV
    case SMOOTH_H_PRED:
#endif
    case H_PRED:
      // prediction is good for H direction: large SL for row only
      return is_col ? 0 : 3;
#if CONFIG_SMOOTH_HV
    case SMOOTH_V_PRED:
#endif
    case V_PRED:
      // prediction is good for V direction: large SL for col only
      return is_col ? 3 : 0;
#if LGT_SL_INTRA
    // directional mode: choose SL based on the direction
    case D45_PRED: return is_col ? 2 : 0;
    case D63_PRED: return is_col ? 3 : 0;
    case D117_PRED: return is_col ? 3 : 1;
    case D135_PRED: return 2;
    case D153_PRED: return is_col ? 1 : 3;
    case D207_PRED: return is_col ? 0 : 3;
#else
    case D45_PRED:
    case D63_PRED:
    case D117_PRED: return is_col ? 3 : 0;
    case D135_PRED:
    case D153_PRED:
    case D207_PRED: return is_col ? 0 : 3;
#endif
    // inter: no SL
    default: return 0;
  }
}

void get_lgt4_from_pred(const TxfmParam *txfm_param, int is_col,
                        const tran_high_t **lgtmtx, int ntx) {
  PREDICTION_MODE mode = txfm_param->mode;
  int stride = txfm_param->stride;
  uint8_t *dst = txfm_param->dst;
  int bp = -1;
  uint8_t arr[4];

  // Each lgt4mtx_arr[k][i] corresponds to a line graph with a self-loop on
  // the first node, and possibly a weak edge within the line graph. i is
  // the index of the weak edge (between the i-th and (i+1)-th pixels, i=0
  // means no weak edge). k corresponds to the first self-loop's weight.
  const tran_high_t *lgt4mtx_arr[4][4] = {
    { &lgt4_000[0][0], &lgt4_000w1[0][0], &lgt4_000w2[0][0],
      &lgt4_000w3[0][0] },
    { &lgt4_060[0][0], &lgt4_060_000w1[0][0], &lgt4_060_000w2[0][0],
      &lgt4_060_000w3[0][0] },
    { &lgt4_100[0][0], &lgt4_100_000w1[0][0], &lgt4_100_000w2[0][0],
      &lgt4_100_000w3[0][0] },
    { &lgt4_150[0][0], &lgt4_150_000w1[0][0], &lgt4_150_000w2[0][0],
      &lgt4_150_000w3[0][0] },
  };

  // initialize to DCT or some LGTs, and then change later if necessary
  int idx_sl = idx_selfloop_wrt_mode(mode, is_col);
  lgtmtx[0] = lgt4mtx_arr[idx_sl][0];

  // find the break point and replace the line graph by the one with a
  // break point
  if (mode == DC_PRED || mode == SMOOTH_PRED) {
    // Do not use a break point, since 1) is_left_available and
    // is_top_available in DC_PRED are not known by txfm_param for now, so
    // accessing both boundaries anyway may cause a mismatch, and 2) DC
    // prediction typically yields very smooth residues, so having the break
    // point does not usually improve the RD result.
    return;
  } else if (mode == TM_PRED) {
    // TM_PRED: use both 1D top boundary and 1D left boundary
    if (is_col)
      for (int i = 0; i < 4; ++i) arr[i] = dst[i * stride];
    else
      for (int i = 0; i < 4; ++i) arr[i] = dst[i];
    get_discontinuity_1d(&arr[0], 4, &bp);
  } else if (mode == V_PRED) {
    // V_PRED: use 1D top boundary only
    if (is_col) return;
    for (int i = 0; i < 4; ++i) arr[i] = dst[i];
    get_discontinuity_1d(&arr[0], 4, &bp);
  } else if (mode == H_PRED) {
    // H_PRED: use 1D left boundary only
    if (!is_col) return;
    for (int i = 0; i < 4; ++i) arr[i] = dst[i * stride];
    get_discontinuity_1d(&arr[0], 4, &bp);
#if CONFIG_SMOOTH_HV
  } else if (mode == SMOOTH_V_PRED) {
    if (is_col) return;
    for (int i = 0; i < 4; ++i) arr[i] = dst[-stride + i];
    get_discontinuity_1d(&arr[0], 4, &bp);
  } else if (mode == SMOOTH_H_PRED) {
    if (!is_col) return;
    for (int i = 0; i < 4; ++i) arr[i] = dst[i * stride - 1];
    get_discontinuity_1d(&arr[0], 4, &bp);
#endif
  } else if (mode == D45_PRED || mode == D63_PRED || mode == D117_PRED) {
    // directional modes closer to vertical (maybe include D135 later)
    if (!is_col) get_discontinuity_2d(dst, stride, 4, 0, &bp, ntx);
  } else if (mode == D135_PRED || mode == D153_PRED || mode == D207_PRED) {
    // directional modes closer to horizontal
    if (is_col) get_discontinuity_2d(dst, stride, 4, 1, &bp, ntx);
  } else if (mode > TM_PRED) {
    // inter
    get_discontinuity_2d(dst, stride, 4, is_col, &bp, ntx);
  }

#if LGT_SL_INTRA
  if (bp != -1) lgtmtx[0] = lgt4mtx_arr[idx_sl][bp];
#else
  if (bp != -1) lgtmtx[0] = lgt4mtx_arr[0][bp];
#endif
}

void get_lgt8_from_pred(const TxfmParam *txfm_param, int is_col,
                        const tran_high_t **lgtmtx, int ntx) {
  PREDICTION_MODE mode = txfm_param->mode;
  int stride = txfm_param->stride;
  uint8_t *dst = txfm_param->dst;
  int bp = -1;
  uint8_t arr[8];

  const tran_high_t *lgt8mtx_arr[4][8] = {
    { &lgt8_000[0][0], &lgt8_000w1[0][0], &lgt8_000w2[0][0], &lgt8_000w3[0][0],
      &lgt8_000w4[0][0], &lgt8_000w5[0][0], &lgt8_000w6[0][0],
      &lgt8_000w7[0][0] },
    { &lgt8_060[0][0], &lgt8_060_000w1[0][0], &lgt8_060_000w2[0][0],
      &lgt8_060_000w3[0][0], &lgt8_060_000w4[0][0], &lgt8_060_000w5[0][0],
      &lgt8_060_000w6[0][0], &lgt8_060_000w7[0][0] },
    { &lgt8_100[0][0], &lgt8_100_000w1[0][0], &lgt8_100_000w2[0][0],
      &lgt8_100_000w3[0][0], &lgt8_100_000w4[0][0], &lgt8_100_000w5[0][0],
      &lgt8_100_000w6[0][0], &lgt8_100_000w7[0][0] },
    { &lgt8_150[0][0], &lgt8_150_000w1[0][0], &lgt8_150_000w2[0][0],
      &lgt8_150_000w3[0][0], &lgt8_150_000w4[0][0], &lgt8_150_000w5[0][0],
      &lgt8_150_000w6[0][0], &lgt8_150_000w7[0][0] },
  };

  int idx_sl = idx_selfloop_wrt_mode(mode, is_col);
  lgtmtx[0] = lgt8mtx_arr[idx_sl][0];

  if (mode == DC_PRED || mode == SMOOTH_PRED) {
    return;
  } else if (mode == TM_PRED) {
    if (is_col)
      for (int i = 0; i < 8; ++i) arr[i] = dst[i * stride];
    else
      for (int i = 0; i < 8; ++i) arr[i] = dst[i];
    get_discontinuity_1d(&arr[0], 8, &bp);
  } else if (mode == V_PRED) {
    if (is_col) return;
    for (int i = 0; i < 8; ++i) arr[i] = dst[i];
    get_discontinuity_1d(&arr[0], 8, &bp);
  } else if (mode == H_PRED) {
    if (!is_col) return;
    for (int i = 0; i < 8; ++i) arr[i] = dst[i * stride];
    get_discontinuity_1d(&arr[0], 8, &bp);
#if CONFIG_SMOOTH_HV
  } else if (mode == SMOOTH_V_PRED) {
    if (is_col) return;
    for (int i = 0; i < 8; ++i) arr[i] = dst[-stride + i];
    get_discontinuity_1d(&arr[0], 8, &bp);
  } else if (mode == SMOOTH_H_PRED) {
    if (!is_col) return;
    for (int i = 0; i < 8; ++i) arr[i] = dst[i * stride - 1];
    get_discontinuity_1d(&arr[0], 8, &bp);
#endif
  } else if (mode == D45_PRED || mode == D63_PRED || mode == D117_PRED) {
    if (!is_col) get_discontinuity_2d(dst, stride, 8, 0, &bp, ntx);
  } else if (mode == D135_PRED || mode == D153_PRED || mode == D207_PRED) {
    if (is_col) get_discontinuity_2d(dst, stride, 8, 1, &bp, ntx);
  } else if (mode > TM_PRED) {
    get_discontinuity_2d(dst, stride, 8, is_col, &bp, ntx);
  }

#if LGT_SL_INTRA
  if (bp != -1) lgtmtx[0] = lgt8mtx_arr[idx_sl][bp];
#else
  if (bp != -1) lgtmtx[0] = lgt8mtx_arr[0][bp];
#endif
}

// Since LGTs with length > 8 are not implemented yet, the following function
// simply falls back to DCT or ADST.
void get_lgt16up_from_pred(const TxfmParam *txfm_param, int is_col,
                           const tran_high_t **lgtmtx, int ntx) {
  int tx_length = is_col ? tx_size_high[txfm_param->tx_size]
                         : tx_size_wide[txfm_param->tx_size];
  assert(tx_length == 16 || tx_length == 32);
  PREDICTION_MODE mode = txfm_param->mode;

  (void)ntx;
  const tran_high_t *dctmtx =
      tx_length == 16 ? &lgt16_000[0][0] : &lgt32_000[0][0];
  const tran_high_t *adstmtx =
      tx_length == 16 ? &lgt16_200[0][0] : &lgt32_200[0][0];

  switch (mode) {
    case DC_PRED:
    case TM_PRED:
    case SMOOTH_PRED:
      // prediction from both top and left -> ADST
      lgtmtx[0] = adstmtx;
      break;
    case V_PRED:
    case D45_PRED:
    case D63_PRED:
    case D117_PRED:
#if CONFIG_SMOOTH_HV
    case SMOOTH_V_PRED:
#endif
      // prediction from the top more than from the left -> ADST
      lgtmtx[0] = is_col ? adstmtx : dctmtx;
      break;
    case H_PRED:
    case D135_PRED:
    case D153_PRED:
    case D207_PRED:
#if CONFIG_SMOOTH_HV
    case SMOOTH_H_PRED:
#endif
      // prediction from the left more than from the top -> DCT
      lgtmtx[0] = is_col ? dctmtx : adstmtx;
      break;
    default: lgtmtx[0] = dctmtx; break;
  }
}

typedef void (*IlgtFunc)(const tran_low_t *input, tran_low_t *output,
                         const tran_high_t *lgtmtx);

static IlgtFunc ilgt_func[4] = { ilgt4, ilgt8, ilgt16up, ilgt16up };

typedef void (*GetLgtFunc)(const TxfmParam *txfm_param, int is_col,
                           const tran_high_t **lgtmtx, int ntx);

static GetLgtFunc get_lgt_func[4] = { get_lgt4_from_pred, get_lgt8_from_pred,
                                      get_lgt16up_from_pred,
                                      get_lgt16up_from_pred };
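
// These tables are indexed by log2(transform length) - 2, so lengths 4, 8,
// 16 and 32 map to slots 0..3; lengths 16 and 32 share the ilgt16up /
// get_lgt16up_from_pred fallbacks since no dedicated LGTs exist for them.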

// This inline function corresponds to the up scaling before the transpose
// operation in the av1_iht* functions.
static INLINE tran_low_t inv_upscale_wrt_txsize(const tran_high_t val,
                                                const TX_SIZE tx_size) {
  switch (tx_size) {
    case TX_4X4:
    case TX_8X8:
    case TX_4X16:
    case TX_16X4:
    case TX_8X32:
    case TX_32X8: return (tran_low_t)val;
    case TX_4X8:
    case TX_8X4:
    case TX_8X16:
    case TX_16X8: return (tran_low_t)dct_const_round_shift(val * Sqrt2);
    default: assert(0); break;
  }
  return 0;
}
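
// Why sqrt(2): for the 2:1 aspect ratios the 2-D gain involves sqrt(w * h),
// which is not a power of two (e.g. sqrt(4 * 8) = 4 * sqrt(2)), so one extra
// multiply by sqrt(2) makes the overall scale a clean power of two; the 4:1
// sizes already have a power-of-two geometric mean and need no compensation.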

// This inline function corresponds to the bit shift before summing with the
// destination in the av1_iht* functions.
static INLINE tran_low_t inv_downscale_wrt_txsize(const tran_low_t val,
                                                  const TX_SIZE tx_size) {
  switch (tx_size) {
    case TX_4X4: return ROUND_POWER_OF_TWO(val, 4);
    case TX_4X8:
    case TX_8X4:
    case TX_8X8:
    case TX_4X16:
    case TX_16X4: return ROUND_POWER_OF_TWO(val, 5);
    case TX_8X16:
    case TX_16X8:
    case TX_8X32:
    case TX_32X8: return ROUND_POWER_OF_TWO(val, 6);
    default: assert(0); break;
  }
  return 0;
}
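
// The shift pairs with the upscale above: TX_4X4 rounds by 4 bits, matching
// the final shift in av1_iht4x4_16_add_c, while TX_4X8 and TX_8X16 round by
// 5 and 6 bits, matching av1_iht4x8_32_add_c and av1_iht8x16_128_add_c
// respectively.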

void ilgt2d_from_pred_add(const tran_low_t *input, uint8_t *dest, int stride,
                          const TxfmParam *txfm_param) {
  const TX_SIZE tx_size = txfm_param->tx_size;
  const int w = tx_size_wide[tx_size];
  const int h = tx_size_high[tx_size];
  const int wlog2 = tx_size_wide_log2[tx_size];
  const int hlog2 = tx_size_high_log2[tx_size];
  assert(w <= 8 || h <= 8);

  int i, j;
  // largest 1D size allowed for LGT: 32
  // largest 2D size allowed for LGT: 8x32=256
  tran_low_t tmp[256], out[256], temp1d[32];
  const tran_high_t *lgtmtx_col[1];
  const tran_high_t *lgtmtx_row[1];
  get_lgt_func[hlog2 - 2](txfm_param, 1, lgtmtx_col, w);
  get_lgt_func[wlog2 - 2](txfm_param, 0, lgtmtx_row, h);

// For the inverse transform, to be consistent with the av1_iht functions, we
// always apply row transforms first and column transforms second, but both
// row-first and column-first versions are implemented here for future
// tests (use different lgtmtx_col[i], and choose row or column tx first
// depending on transforms).
#if 1
  // inverse column transforms
  for (i = 0; i < w; ++i) {
    // transpose
    for (j = 0; j < h; ++j) tmp[i * h + j] = input[j * w + i];
    ilgt_func[hlog2 - 2](&tmp[i * h], temp1d, lgtmtx_col[0]);
    // upscale, and store in place
    for (j = 0; j < h; ++j)
      tmp[i * h + j] = inv_upscale_wrt_txsize(temp1d[j], tx_size);
  }
  // inverse row transforms
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) temp1d[j] = tmp[j * h + i];
    ilgt_func[wlog2 - 2](temp1d, &out[i * w], lgtmtx_row[0]);
  }
  // downscale + sum with the destination
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      int d = i * stride + j;
      int s = i * w + j;
      dest[d] =
          clip_pixel_add(dest[d], inv_downscale_wrt_txsize(out[s], tx_size));
    }
  }
#else
  // inverse row transforms
  for (i = 0; i < h; ++i) {
    ilgt_func[wlog2 - 2](input, temp1d, lgtmtx_row[0]);
    // upscale and transpose (tmp[j*h+i] <--> tmp[j][i])
    for (j = 0; j < w; ++j)
      tmp[j * h + i] = inv_upscale_wrt_txsize(temp1d[j], tx_size);
    input += w;
  }
  // inverse column transforms
  for (i = 0; i < w; ++i)
    ilgt_func[hlog2 - 2](&tmp[i * h], &out[i * h], lgtmtx_col[0]);
  // here, out[] is the transpose of the 2D block of transform coefficients

  // downscale + transpose + sum with dest
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      int d = i * stride + j;
      int s = j * h + i;
      dest[d] =
          clip_pixel_add(dest[d], inv_downscale_wrt_txsize(out[s], tx_size));
    }
  }
#endif
}
#endif  // CONFIG_LGT_FROM_PRED

void av1_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                         const TxfmParam *txfm_param) {
  const TX_TYPE tx_type = txfm_param->tx_type;
#if CONFIG_MRC_TX
  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
#endif  // CONFIG_MRC_TX
#if !CONFIG_DAALA_DCT4
  if (tx_type == DCT_DCT) {
    aom_idct4x4_16_add(input, dest, stride);
    return;
  }
#endif
  static const transform_2d IHT_4[] = {
#if CONFIG_DAALA_DCT4
    { daala_idct4, daala_idct4 },  // DCT_DCT  = 0
    { daala_idst4, daala_idct4 },  // ADST_DCT = 1
    { daala_idct4, daala_idst4 },  // DCT_ADST = 2
    { daala_idst4, daala_idst4 },  // ADST_ADST = 3
#if CONFIG_EXT_TX
    { daala_idst4, daala_idct4 },  // FLIPADST_DCT
    { daala_idct4, daala_idst4 },  // DCT_FLIPADST
    { daala_idst4, daala_idst4 },  // FLIPADST_FLIPADST
    { daala_idst4, daala_idst4 },  // ADST_FLIPADST
    { daala_idst4, daala_idst4 },  // FLIPADST_ADST
    { daala_idtx4, daala_idtx4 },  // IDTX
    { daala_idct4, daala_idtx4 },  // V_DCT
    { daala_idtx4, daala_idct4 },  // H_DCT
    { daala_idst4, daala_idtx4 },  // V_ADST
    { daala_idtx4, daala_idst4 },  // H_ADST
    { daala_idst4, daala_idtx4 },  // V_FLIPADST
    { daala_idtx4, daala_idst4 },  // H_FLIPADST
#endif
#else
    { aom_idct4_c, aom_idct4_c },    // DCT_DCT  = 0
    { aom_iadst4_c, aom_idct4_c },   // ADST_DCT = 1
    { aom_idct4_c, aom_iadst4_c },   // DCT_ADST = 2
    { aom_iadst4_c, aom_iadst4_c },  // ADST_ADST = 3
#if CONFIG_EXT_TX
    { aom_iadst4_c, aom_idct4_c },   // FLIPADST_DCT
    { aom_idct4_c, aom_iadst4_c },   // DCT_FLIPADST
    { aom_iadst4_c, aom_iadst4_c },  // FLIPADST_FLIPADST
    { aom_iadst4_c, aom_iadst4_c },  // ADST_FLIPADST
    { aom_iadst4_c, aom_iadst4_c },  // FLIPADST_ADST
    { iidtx4_c, iidtx4_c },          // IDTX
    { aom_idct4_c, iidtx4_c },       // V_DCT
    { iidtx4_c, aom_idct4_c },       // H_DCT
    { aom_iadst4_c, iidtx4_c },      // V_ADST
    { iidtx4_c, aom_iadst4_c },      // H_ADST
    { aom_iadst4_c, iidtx4_c },      // V_FLIPADST
    { iidtx4_c, aom_iadst4_c },      // H_FLIPADST
#endif
#endif
  };

  int i, j;
  tran_low_t tmp[4][4];
  tran_low_t out[4][4];
  tran_low_t *outp = &out[0][0];
  int outstride = 4;

#if CONFIG_DCT_ONLY
  assert(tx_type == DCT_DCT);
#endif

#if CONFIG_LGT
  const tran_high_t *lgtmtx_col[1];
  const tran_high_t *lgtmtx_row[1];
  int use_lgt_col = get_lgt4(txfm_param, 1, lgtmtx_col);
  int use_lgt_row = get_lgt4(txfm_param, 0, lgtmtx_row);
#endif

  // inverse transform row vectors
  for (i = 0; i < 4; ++i) {
#if CONFIG_DAALA_DCT4
    tran_low_t temp_in[4];
    for (j = 0; j < 4; j++) temp_in[j] = input[j] * 2;
    IHT_4[tx_type].rows(temp_in, out[i]);
#else
#if CONFIG_LGT
    if (use_lgt_row)
      ilgt4(input, out[i], lgtmtx_row[0]);
    else
#endif
      IHT_4[tx_type].rows(input, out[i]);
#endif
    input += 4;
  }

  // transpose
  for (i = 0; i < 4; i++) {
    for (j = 0; j < 4; j++) {
      tmp[j][i] = out[i][j];
    }
  }

  // inverse transform column vectors
  for (i = 0; i < 4; ++i) {
#if CONFIG_LGT
    if (use_lgt_col)
      ilgt4(tmp[i], out[i], lgtmtx_col[0]);
    else
#endif
      IHT_4[tx_type].cols(tmp[i], out[i]);
  }

#if CONFIG_EXT_TX
  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 4, 4);
#endif

  // Sum with the destination (the shift is the same with and without
  // CONFIG_DAALA_DCT4)
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j) {
      int d = i * stride + j;
      int s = j * outstride + i;
      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
    }
  }
}
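
// All av1_ihtWxH_*_add_c functions below follow the same pattern as above:
// inverse-transform the row vectors, transpose (rescaling by sqrt(2) for the
// 2:1 rectangular sizes), inverse-transform the column vectors, flip strides
// for the FLIPADST variants, then round and add into the destination.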

void av1_iht4x8_32_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                         const TxfmParam *txfm_param) {
  const TX_TYPE tx_type = txfm_param->tx_type;
#if CONFIG_MRC_TX
  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
#endif  // CONFIG_MRC_TX
#if CONFIG_DCT_ONLY
  assert(tx_type == DCT_DCT);
#endif
  static const transform_2d IHT_4x8[] = {
    { aom_idct8_c, aom_idct4_c },    // DCT_DCT
    { aom_iadst8_c, aom_idct4_c },   // ADST_DCT
    { aom_idct8_c, aom_iadst4_c },   // DCT_ADST
    { aom_iadst8_c, aom_iadst4_c },  // ADST_ADST
#if CONFIG_EXT_TX
    { aom_iadst8_c, aom_idct4_c },   // FLIPADST_DCT
    { aom_idct8_c, aom_iadst4_c },   // DCT_FLIPADST
    { aom_iadst8_c, aom_iadst4_c },  // FLIPADST_FLIPADST
    { aom_iadst8_c, aom_iadst4_c },  // ADST_FLIPADST
    { aom_iadst8_c, aom_iadst4_c },  // FLIPADST_ADST
    { iidtx8_c, iidtx4_c },          // IDTX
    { aom_idct8_c, iidtx4_c },       // V_DCT
    { iidtx8_c, aom_idct4_c },       // H_DCT
    { aom_iadst8_c, iidtx4_c },      // V_ADST
    { iidtx8_c, aom_iadst4_c },      // H_ADST
    { aom_iadst8_c, iidtx4_c },      // V_FLIPADST
    { iidtx8_c, aom_iadst4_c },      // H_FLIPADST
#endif
  };

  const int n = 4;
  const int n2 = 8;
  int i, j;
  tran_low_t out[4][8], tmp[4][8], outtmp[4];
  tran_low_t *outp = &out[0][0];
  int outstride = n2;

#if CONFIG_LGT
  const tran_high_t *lgtmtx_col[1];
  const tran_high_t *lgtmtx_row[1];
  int use_lgt_col = get_lgt8(txfm_param, 1, lgtmtx_col);
  int use_lgt_row = get_lgt4(txfm_param, 0, lgtmtx_row);
#endif

  // inverse transform row vectors and transpose
  for (i = 0; i < n2; ++i) {
#if CONFIG_LGT
    if (use_lgt_row)
      ilgt4(input, outtmp, lgtmtx_row[0]);
    else
#endif
      IHT_4x8[tx_type].rows(input, outtmp);
    for (j = 0; j < n; ++j)
      tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
    input += n;
  }

  // inverse transform column vectors
  for (i = 0; i < n; ++i) {
#if CONFIG_LGT
    if (use_lgt_col)
      ilgt8(tmp[i], out[i], lgtmtx_col[0]);
    else
#endif
      IHT_4x8[tx_type].cols(tmp[i], out[i]);
  }

#if CONFIG_EXT_TX
  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n2, n);
#endif

  // Sum with the destination
  for (i = 0; i < n2; ++i) {
    for (j = 0; j < n; ++j) {
      int d = i * stride + j;
      int s = j * outstride + i;
      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
    }
  }
}

void av1_iht8x4_32_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                         const TxfmParam *txfm_param) {
  const TX_TYPE tx_type = txfm_param->tx_type;
#if CONFIG_MRC_TX
  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
#endif  // CONFIG_MRC_TX
#if CONFIG_DCT_ONLY
  assert(tx_type == DCT_DCT);
#endif
  static const transform_2d IHT_8x4[] = {
    { aom_idct4_c, aom_idct8_c },    // DCT_DCT
    { aom_iadst4_c, aom_idct8_c },   // ADST_DCT
    { aom_idct4_c, aom_iadst8_c },   // DCT_ADST
    { aom_iadst4_c, aom_iadst8_c },  // ADST_ADST
#if CONFIG_EXT_TX
    { aom_iadst4_c, aom_idct8_c },   // FLIPADST_DCT
    { aom_idct4_c, aom_iadst8_c },   // DCT_FLIPADST
    { aom_iadst4_c, aom_iadst8_c },  // FLIPADST_FLIPADST
    { aom_iadst4_c, aom_iadst8_c },  // ADST_FLIPADST
    { aom_iadst4_c, aom_iadst8_c },  // FLIPADST_ADST
    { iidtx4_c, iidtx8_c },          // IDTX
    { aom_idct4_c, iidtx8_c },       // V_DCT
    { iidtx4_c, aom_idct8_c },       // H_DCT
    { aom_iadst4_c, iidtx8_c },      // V_ADST
    { iidtx4_c, aom_iadst8_c },      // H_ADST
    { aom_iadst4_c, iidtx8_c },      // V_FLIPADST
    { iidtx4_c, aom_iadst8_c },      // H_FLIPADST
#endif
  };

  const int n = 4;
  const int n2 = 8;

  int i, j;
  tran_low_t out[8][4], tmp[8][4], outtmp[8];
  tran_low_t *outp = &out[0][0];
  int outstride = n;

#if CONFIG_LGT
  const tran_high_t *lgtmtx_col[1];
  const tran_high_t *lgtmtx_row[1];
  int use_lgt_col = get_lgt4(txfm_param, 1, lgtmtx_col);
  int use_lgt_row = get_lgt8(txfm_param, 0, lgtmtx_row);
#endif

  // inverse transform row vectors and transpose
  for (i = 0; i < n; ++i) {
#if CONFIG_LGT
    if (use_lgt_row)
      ilgt8(input, outtmp, lgtmtx_row[0]);
    else
#endif
      IHT_8x4[tx_type].rows(input, outtmp);
    for (j = 0; j < n2; ++j)
      tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
    input += n2;
  }

  // inverse transform column vectors
  for (i = 0; i < n2; ++i) {
#if CONFIG_LGT
    if (use_lgt_col)
      ilgt4(tmp[i], out[i], lgtmtx_col[0]);
    else
#endif
      IHT_8x4[tx_type].cols(tmp[i], out[i]);
  }

#if CONFIG_EXT_TX
  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n2);
#endif

  // Sum with the destination
  for (i = 0; i < n; ++i) {
    for (j = 0; j < n2; ++j) {
      int d = i * stride + j;
      int s = j * outstride + i;
      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
    }
  }
}

void av1_iht4x16_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                          const TxfmParam *txfm_param) {
  const TX_TYPE tx_type = txfm_param->tx_type;
#if CONFIG_MRC_TX
  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
#endif  // CONFIG_MRC_TX
#if CONFIG_DCT_ONLY
  assert(tx_type == DCT_DCT);
#endif
  static const transform_2d IHT_4x16[] = {
    { aom_idct16_c, aom_idct4_c },    // DCT_DCT
    { aom_iadst16_c, aom_idct4_c },   // ADST_DCT
    { aom_idct16_c, aom_iadst4_c },   // DCT_ADST
    { aom_iadst16_c, aom_iadst4_c },  // ADST_ADST
#if CONFIG_EXT_TX
    { aom_iadst16_c, aom_idct4_c },   // FLIPADST_DCT
    { aom_idct16_c, aom_iadst4_c },   // DCT_FLIPADST
    { aom_iadst16_c, aom_iadst4_c },  // FLIPADST_FLIPADST
    { aom_iadst16_c, aom_iadst4_c },  // ADST_FLIPADST
    { aom_iadst16_c, aom_iadst4_c },  // FLIPADST_ADST
    { iidtx16_c, iidtx4_c },          // IDTX
    { aom_idct16_c, iidtx4_c },       // V_DCT
    { iidtx16_c, aom_idct4_c },       // H_DCT
    { aom_iadst16_c, iidtx4_c },      // V_ADST
    { iidtx16_c, aom_iadst4_c },      // H_ADST
    { aom_iadst16_c, iidtx4_c },      // V_FLIPADST
    { iidtx16_c, aom_iadst4_c },      // H_FLIPADST
#endif
  };

  const int n = 4;
  const int n4 = 16;
  int i, j;
  tran_low_t out[4][16], tmp[4][16], outtmp[4];
  tran_low_t *outp = &out[0][0];
  int outstride = n4;

#if CONFIG_LGT
  const tran_high_t *lgtmtx_row[1];
  int use_lgt_row = get_lgt4(txfm_param, 0, lgtmtx_row);
#endif

  // inverse transform row vectors and transpose
  for (i = 0; i < n4; ++i) {
#if CONFIG_LGT
    if (use_lgt_row)
      ilgt4(input, outtmp, lgtmtx_row[0]);
    else
#endif
      IHT_4x16[tx_type].rows(input, outtmp);
    for (j = 0; j < n; ++j) tmp[j][i] = outtmp[j];
    input += n;
  }

  // inverse transform column vectors
  for (i = 0; i < n; ++i) {
    IHT_4x16[tx_type].cols(tmp[i], out[i]);
  }

#if CONFIG_EXT_TX
  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n4, n);
#endif

  // Sum with the destination
  for (i = 0; i < n4; ++i) {
    for (j = 0; j < n; ++j) {
      int d = i * stride + j;
      int s = j * outstride + i;
      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
    }
  }
}

void av1_iht16x4_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                          const TxfmParam *txfm_param) {
  const TX_TYPE tx_type = txfm_param->tx_type;
#if CONFIG_MRC_TX
  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
#endif  // CONFIG_MRC_TX
#if CONFIG_DCT_ONLY
  assert(tx_type == DCT_DCT);
#endif
  static const transform_2d IHT_16x4[] = {
    { aom_idct4_c, aom_idct16_c },    // DCT_DCT
    { aom_iadst4_c, aom_idct16_c },   // ADST_DCT
    { aom_idct4_c, aom_iadst16_c },   // DCT_ADST
    { aom_iadst4_c, aom_iadst16_c },  // ADST_ADST
#if CONFIG_EXT_TX
    { aom_iadst4_c, aom_idct16_c },   // FLIPADST_DCT
    { aom_idct4_c, aom_iadst16_c },   // DCT_FLIPADST
    { aom_iadst4_c, aom_iadst16_c },  // FLIPADST_FLIPADST
    { aom_iadst4_c, aom_iadst16_c },  // ADST_FLIPADST
    { aom_iadst4_c, aom_iadst16_c },  // FLIPADST_ADST
    { iidtx4_c, iidtx16_c },          // IDTX
    { aom_idct4_c, iidtx16_c },       // V_DCT
    { iidtx4_c, aom_idct16_c },       // H_DCT
    { aom_iadst4_c, iidtx16_c },      // V_ADST
    { iidtx4_c, aom_iadst16_c },      // H_ADST
    { aom_iadst4_c, iidtx16_c },      // V_FLIPADST
    { iidtx4_c, aom_iadst16_c },      // H_FLIPADST
#endif
  };

  const int n = 4;
  const int n4 = 16;

  int i, j;
  tran_low_t out[16][4], tmp[16][4], outtmp[16];
  tran_low_t *outp = &out[0][0];
  int outstride = n;

#if CONFIG_LGT
  const tran_high_t *lgtmtx_col[1];
  int use_lgt_col = get_lgt4(txfm_param, 1, lgtmtx_col);
#endif

  // inverse transform row vectors and transpose
  for (i = 0; i < n; ++i) {
    IHT_16x4[tx_type].rows(input, outtmp);
    for (j = 0; j < n4; ++j) tmp[j][i] = outtmp[j];
    input += n4;
  }

  // inverse transform column vectors
  for (i = 0; i < n4; ++i) {
#if CONFIG_LGT
    if (use_lgt_col)
      ilgt4(tmp[i], out[i], lgtmtx_col[0]);
    else
#endif
      IHT_16x4[tx_type].cols(tmp[i], out[i]);
  }

#if CONFIG_EXT_TX
  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n4);
#endif

  // Sum with the destination
  for (i = 0; i < n; ++i) {
    for (j = 0; j < n4; ++j) {
      int d = i * stride + j;
      int s = j * outstride + i;
      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
    }
  }
}

void av1_iht8x16_128_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                           const TxfmParam *txfm_param) {
  const TX_TYPE tx_type = txfm_param->tx_type;
#if CONFIG_MRC_TX
  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
#endif  // CONFIG_MRC_TX
#if CONFIG_DCT_ONLY
  assert(tx_type == DCT_DCT);
#endif
  static const transform_2d IHT_8x16[] = {
    { aom_idct16_c, aom_idct8_c },    // DCT_DCT
    { aom_iadst16_c, aom_idct8_c },   // ADST_DCT
    { aom_idct16_c, aom_iadst8_c },   // DCT_ADST
    { aom_iadst16_c, aom_iadst8_c },  // ADST_ADST
#if CONFIG_EXT_TX
    { aom_iadst16_c, aom_idct8_c },   // FLIPADST_DCT
    { aom_idct16_c, aom_iadst8_c },   // DCT_FLIPADST
    { aom_iadst16_c, aom_iadst8_c },  // FLIPADST_FLIPADST
    { aom_iadst16_c, aom_iadst8_c },  // ADST_FLIPADST
    { aom_iadst16_c, aom_iadst8_c },  // FLIPADST_ADST
    { iidtx16_c, iidtx8_c },          // IDTX
    { aom_idct16_c, iidtx8_c },       // V_DCT
    { iidtx16_c, aom_idct8_c },       // H_DCT
    { aom_iadst16_c, iidtx8_c },      // V_ADST
    { iidtx16_c, aom_iadst8_c },      // H_ADST
    { aom_iadst16_c, iidtx8_c },      // V_FLIPADST
    { iidtx16_c, aom_iadst8_c },      // H_FLIPADST
#endif
  };

  const int n = 8;
  const int n2 = 16;
  int i, j;
  tran_low_t out[8][16], tmp[8][16], outtmp[8];
  tran_low_t *outp = &out[0][0];
  int outstride = n2;

#if CONFIG_LGT
  const tran_high_t *lgtmtx_row[1];
  int use_lgt_row = get_lgt8(txfm_param, 0, lgtmtx_row);
#endif

  // inverse transform row vectors and transpose
  for (i = 0; i < n2; ++i) {
#if CONFIG_LGT
    if (use_lgt_row)
      ilgt8(input, outtmp, lgtmtx_row[0]);
    else
#endif
      IHT_8x16[tx_type].rows(input, outtmp);
    for (j = 0; j < n; ++j)
      tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
    input += n;
  }

  // inverse transform column vectors
  for (i = 0; i < n; ++i) {
    IHT_8x16[tx_type].cols(tmp[i], out[i]);
  }

#if CONFIG_EXT_TX
  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n2, n);
#endif

  // Sum with the destination
  for (i = 0; i < n2; ++i) {
    for (j = 0; j < n; ++j) {
      int d = i * stride + j;
      int s = j * outstride + i;
      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
    }
  }
}

void av1_iht16x8_128_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                           const TxfmParam *txfm_param) {
  const TX_TYPE tx_type = txfm_param->tx_type;
#if CONFIG_MRC_TX
  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
#endif  // CONFIG_MRC_TX
#if CONFIG_DCT_ONLY
  assert(tx_type == DCT_DCT);
#endif
  static const transform_2d IHT_16x8[] = {
    { aom_idct8_c, aom_idct16_c },    // DCT_DCT
    { aom_iadst8_c, aom_idct16_c },   // ADST_DCT
    { aom_idct8_c, aom_iadst16_c },   // DCT_ADST
    { aom_iadst8_c, aom_iadst16_c },  // ADST_ADST
#if CONFIG_EXT_TX
    { aom_iadst8_c, aom_idct16_c },   // FLIPADST_DCT
    { aom_idct8_c, aom_iadst16_c },   // DCT_FLIPADST
    { aom_iadst8_c, aom_iadst16_c },  // FLIPADST_FLIPADST
    { aom_iadst8_c, aom_iadst16_c },  // ADST_FLIPADST
    { aom_iadst8_c, aom_iadst16_c },  // FLIPADST_ADST
    { iidtx8_c, iidtx16_c },          // IDTX
    { aom_idct8_c, iidtx16_c },       // V_DCT
    { iidtx8_c, aom_idct16_c },       // H_DCT
    { aom_iadst8_c, iidtx16_c },      // V_ADST
    { iidtx8_c, aom_iadst16_c },      // H_ADST
    { aom_iadst8_c, iidtx16_c },      // V_FLIPADST
    { iidtx8_c, aom_iadst16_c },      // H_FLIPADST
#endif
  };

  const int n = 8;
  const int n2 = 16;

  int i, j;
  tran_low_t out[16][8], tmp[16][8], outtmp[16];
  tran_low_t *outp = &out[0][0];
  int outstride = n;

#if CONFIG_LGT
  const tran_high_t *lgtmtx_col[1];
  int use_lgt_col = get_lgt8(txfm_param, 1, lgtmtx_col);
#endif

  // inverse transform row vectors and transpose
  for (i = 0; i < n; ++i) {
    IHT_16x8[tx_type].rows(input, outtmp);
    for (j = 0; j < n2; ++j)
      tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
    input += n2;
  }

  // inverse transform column vectors
  for (i = 0; i < n2; ++i) {
#if CONFIG_LGT
    if (use_lgt_col)
      ilgt8(tmp[i], out[i], lgtmtx_col[0]);
    else
#endif
      IHT_16x8[tx_type].cols(tmp[i], out[i]);
  }

#if CONFIG_EXT_TX
  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n2);
#endif

  // Sum with the destination
  for (i = 0; i < n; ++i) {
    for (j = 0; j < n2; ++j) {
      int d = i * stride + j;
      int s = j * outstride + i;
      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
    }
  }
}

void av1_iht8x32_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                           const TxfmParam *txfm_param) {
  const TX_TYPE tx_type = txfm_param->tx_type;
#if CONFIG_MRC_TX
  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
#endif  // CONFIG_MRC_TX
#if CONFIG_DCT_ONLY
  assert(tx_type == DCT_DCT);
#endif
  static const transform_2d IHT_8x32[] = {
    { aom_idct32_c, aom_idct8_c },     // DCT_DCT
    { ihalfright32_c, aom_idct8_c },   // ADST_DCT
    { aom_idct32_c, aom_iadst8_c },    // DCT_ADST
    { ihalfright32_c, aom_iadst8_c },  // ADST_ADST
#if CONFIG_EXT_TX
    { ihalfright32_c, aom_idct8_c },   // FLIPADST_DCT
    { aom_idct32_c, aom_iadst8_c },    // DCT_FLIPADST
    { ihalfright32_c, aom_iadst8_c },  // FLIPADST_FLIPADST
    { ihalfright32_c, aom_iadst8_c },  // ADST_FLIPADST
    { ihalfright32_c, aom_iadst8_c },  // FLIPADST_ADST
    { iidtx32_c, iidtx8_c },           // IDTX
    { aom_idct32_c, iidtx8_c },        // V_DCT
    { iidtx32_c, aom_idct8_c },        // H_DCT
    { ihalfright32_c, iidtx8_c },      // V_ADST
    { iidtx32_c, aom_iadst8_c },       // H_ADST
    { ihalfright32_c, iidtx8_c },      // V_FLIPADST
    { iidtx32_c, aom_iadst8_c },       // H_FLIPADST
#endif
  };

  const int n = 8;
  const int n4 = 32;
  int i, j;
  tran_low_t out[8][32], tmp[8][32], outtmp[8];
  tran_low_t *outp = &out[0][0];
  int outstride = n4;

#if CONFIG_LGT
  const tran_high_t *lgtmtx_row[1];
  int use_lgt_row = get_lgt8(txfm_param, 0, lgtmtx_row);
#endif

  // inverse transform row vectors and transpose
  for (i = 0; i < n4; ++i) {
#if CONFIG_LGT
    if (use_lgt_row)
      ilgt8(input, outtmp, lgtmtx_row[0]);
    else
#endif
      IHT_8x32[tx_type].rows(input, outtmp);
    for (j = 0; j < n; ++j) tmp[j][i] = outtmp[j];
    input += n;
  }

  // inverse transform column vectors
  for (i = 0; i < n; ++i) {
    IHT_8x32[tx_type].cols(tmp[i], out[i]);
  }

#if CONFIG_EXT_TX
  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n4, n);
#endif

  // Sum with the destination
  for (i = 0; i < n4; ++i) {
    for (j = 0; j < n; ++j) {
      int d = i * stride + j;
      int s = j * outstride + i;
      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
    }
  }
}

void av1_iht32x8_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                           const TxfmParam *txfm_param) {
  const TX_TYPE tx_type = txfm_param->tx_type;
#if CONFIG_MRC_TX
  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
#endif  // CONFIG_MRC_TX
#if CONFIG_DCT_ONLY
  assert(tx_type == DCT_DCT);
#endif
  static const transform_2d IHT_32x8[] = {
    { aom_idct8_c, aom_idct32_c },     // DCT_DCT
    { aom_iadst8_c, aom_idct32_c },    // ADST_DCT
    { aom_idct8_c, ihalfright32_c },   // DCT_ADST
    { aom_iadst8_c, ihalfright32_c },  // ADST_ADST
#if CONFIG_EXT_TX
    { aom_iadst8_c, aom_idct32_c },    // FLIPADST_DCT
    { aom_idct8_c, ihalfright32_c },   // DCT_FLIPADST
    { aom_iadst8_c, ihalfright32_c },  // FLIPADST_FLIPADST
    { aom_iadst8_c, ihalfright32_c },  // ADST_FLIPADST
    { aom_iadst8_c, ihalfright32_c },  // FLIPADST_ADST
    { iidtx8_c, iidtx32_c },           // IDTX
    { aom_idct8_c, iidtx32_c },        // V_DCT
    { iidtx8_c, aom_idct32_c },        // H_DCT
    { aom_iadst8_c, iidtx32_c },       // V_ADST
    { iidtx8_c, ihalfright32_c },      // H_ADST
    { aom_iadst8_c, iidtx32_c },       // V_FLIPADST
    { iidtx8_c, ihalfright32_c },      // H_FLIPADST
#endif
  };

  const int n = 8;
  const int n4 = 32;

  int i, j;
  tran_low_t out[32][8], tmp[32][8], outtmp[32];
  tran_low_t *outp = &out[0][0];
  int outstride = n;

#if CONFIG_LGT
  const tran_high_t *lgtmtx_col[1];
  // The columns of a 32x8 block are 8 pixels tall, so fetch a length-8 LGT
  // matrix here, matching the ilgt8 call below.
  int use_lgt_col = get_lgt8(txfm_param, 1, lgtmtx_col);
#endif

  // inverse transform row vectors and transpose
  for (i = 0; i < n; ++i) {
    IHT_32x8[tx_type].rows(input, outtmp);
    for (j = 0; j < n4; ++j) tmp[j][i] = outtmp[j];
    input += n4;
  }

  // inverse transform column vectors
  for (i = 0; i < n4; ++i) {
#if CONFIG_LGT
    if (use_lgt_col)
      ilgt8(tmp[i], out[i], lgtmtx_col[0]);
    else
#endif
      IHT_32x8[tx_type].cols(tmp[i], out[i]);
  }

#if CONFIG_EXT_TX
  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n4);
#endif

  // Sum with the destination
  for (i = 0; i < n; ++i) {
    for (j = 0; j < n4; ++j) {
      int d = i * stride + j;
      int s = j * outstride + i;
      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
    }
  }
}

void av1_iht16x32_512_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                            const TxfmParam *txfm_param) {
  const TX_TYPE tx_type = txfm_param->tx_type;
#if CONFIG_MRC_TX
  assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
#endif  // CONFIG_MRC_TX
#if CONFIG_DCT_ONLY
  assert(tx_type == DCT_DCT);
#endif
  static const transform_2d IHT_16x32[] = {
    { aom_idct32_c, aom_idct16_c },     // DCT_DCT
    { ihalfright32_c, aom_idct16_c },   // ADST_DCT
    { aom_idct32_c, aom_iadst16_c },    // DCT_ADST
    { ihalfright32_c, aom_iadst16_c },  // ADST_ADST
#if CONFIG_EXT_TX
    { ihalfright32_c, aom_idct16_c },   // FLIPADST_DCT
    { aom_idct32_c, aom_iadst16_c },    // DCT_FLIPADST
    { ihalfright32_c, aom_iadst16_c },  // FLIPADST_FLIPADST
    { ihalfright32_c, aom_iadst16_c },  // ADST_FLIPADST
    { ihalfright32_c, aom_iadst16_c },  // FLIPADST_ADST
    { iidtx32_c, iidtx16_c },           // IDTX
    { aom_idct32_c, iidtx16_c },        // V_DCT
    { iidtx32_c, aom_idct16_c },        // H_DCT
    { ihalfright32_c, iidtx16_c },      // V_ADST
    { iidtx32_c, aom_iadst16_c },       // H_ADST
    { ihalfright32_c, iidtx16_c },      // V_FLIPADST
    { iidtx32_c, aom_iadst16_c },       // H_FLIPADST
#endif
  };

  const int n = 16;
  const int n2 = 32;
  int i, j;
  tran_low_t out[16][32], tmp[16][32], outtmp[16];
  tran_low_t *outp = &out[0][0];
  int outstride = n2;

  // inverse transform row vectors and transpose
  for (i = 0; i < n2; ++i) {
    IHT_16x32[tx_type].rows(input, outtmp);
    for (j = 0; j < n; ++j)
      tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
    input += n;
  }

  // inverse transform column vectors
  for (i = 0; i < n; ++i) IHT_16x32[tx_type].cols(tmp[i], out[i]);

#if CONFIG_EXT_TX
  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n2, n);
#endif

  // Sum with the destination
  for (i = 0; i < n2; ++i) {
    for (j = 0; j < n; ++j) {
      int d = i * stride + j;
      int s = j * outstride + i;
      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
    }
  }
}
1499 
1500 void av1_iht32x16_512_add_c(const tran_low_t *input, uint8_t *dest, int stride,
1501                             const TxfmParam *txfm_param) {
1502   const TX_TYPE tx_type = txfm_param->tx_type;
1503 #if CONFIG_MRC_TX
1504   assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
1505 #endif  // CONFIG_MRC_TX
1506 #if CONFIG_DCT_ONLY
1507   assert(tx_type == DCT_DCT);
1508 #endif
1509   static const transform_2d IHT_32x16[] = {
1510     { aom_idct16_c, aom_idct32_c },     // DCT_DCT
1511     { aom_iadst16_c, aom_idct32_c },    // ADST_DCT
1512     { aom_idct16_c, ihalfright32_c },   // DCT_ADST
1513     { aom_iadst16_c, ihalfright32_c },  // ADST_ADST
1514 #if CONFIG_EXT_TX
1515     { aom_iadst16_c, aom_idct32_c },    // FLIPADST_DCT
1516     { aom_idct16_c, ihalfright32_c },   // DCT_FLIPADST
1517     { aom_iadst16_c, ihalfright32_c },  // FLIPADST_FLIPADST
1518     { aom_iadst16_c, ihalfright32_c },  // ADST_FLIPADST
1519     { aom_iadst16_c, ihalfright32_c },  // FLIPADST_ADST
1520     { iidtx16_c, iidtx32_c },           // IDTX
1521     { aom_idct16_c, iidtx32_c },        // V_DCT
1522     { iidtx16_c, aom_idct32_c },        // H_DCT
1523     { aom_iadst16_c, iidtx32_c },       // V_ADST
1524     { iidtx16_c, ihalfright32_c },      // H_ADST
1525     { aom_iadst16_c, iidtx32_c },       // V_FLIPADST
1526     { iidtx16_c, ihalfright32_c },      // H_FLIPADST
1527 #endif
1528   };
1529   const int n = 16;
1530   const int n2 = 32;
1531 
1532   int i, j;
1533   tran_low_t out[32][16], tmp[32][16], outtmp[32];
1534   tran_low_t *outp = &out[0][0];
1535   int outstride = n;
1536 
1537   // inverse transform row vectors and transpose
1538   for (i = 0; i < n; ++i) {
1539     IHT_32x16[tx_type].rows(input, outtmp);
1540     for (j = 0; j < n2; ++j)
1541       tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
1542     input += n2;
1543   }
1544 
1545   // inverse transform column vectors
1546   for (i = 0; i < n2; ++i) IHT_32x16[tx_type].cols(tmp[i], out[i]);
1547 
1548 #if CONFIG_EXT_TX
1549   maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n2);
1550 #endif
1551 
1552   // Sum with the destination
1553   for (i = 0; i < n; ++i) {
1554     for (j = 0; j < n2; ++j) {
1555       int d = i * stride + j;
1556       int s = j * outstride + i;
1557       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
1558     }
1559   }
1560 }
1561 
1562 void av1_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
1563                          const TxfmParam *txfm_param) {
1564   const TX_TYPE tx_type = txfm_param->tx_type;
1565 #if CONFIG_MRC_TX
1566   assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
1567 #endif  // CONFIG_MRC_TX
1568 #if CONFIG_DCT_ONLY
1569   assert(tx_type == DCT_DCT);
1570 #endif
1571   static const transform_2d IHT_8[] = {
1572 #if CONFIG_DAALA_DCT8
1573     { daala_idct8, daala_idct8 },  // DCT_DCT  = 0
1574     { daala_idst8, daala_idct8 },  // ADST_DCT = 1
1575     { daala_idct8, daala_idst8 },  // DCT_ADST = 2
1576     { daala_idst8, daala_idst8 },  // ADST_ADST = 3
1577 #if CONFIG_EXT_TX
1578     { daala_idst8, daala_idct8 },  // FLIPADST_DCT
1579     { daala_idct8, daala_idst8 },  // DCT_FLIPADST
1580     { daala_idst8, daala_idst8 },  // FLIPADST_FLIPADST
1581     { daala_idst8, daala_idst8 },  // ADST_FLIPADST
1582     { daala_idst8, daala_idst8 },  // FLIPADST_ADST
1583     { daala_idtx8, daala_idtx8 },  // IDTX
1584     { daala_idct8, daala_idtx8 },  // V_DCT
1585     { daala_idtx8, daala_idct8 },  // H_DCT
1586     { daala_idst8, daala_idtx8 },  // V_ADST
1587     { daala_idtx8, daala_idst8 },  // H_ADST
1588     { daala_idst8, daala_idtx8 },  // V_FLIPADST
1589     { daala_idtx8, daala_idst8 },  // H_FLIPADST
1590 #endif
1591 #else
1592     { aom_idct8_c, aom_idct8_c },    // DCT_DCT  = 0
1593     { aom_iadst8_c, aom_idct8_c },   // ADST_DCT = 1
1594     { aom_idct8_c, aom_iadst8_c },   // DCT_ADST = 2
1595     { aom_iadst8_c, aom_iadst8_c },  // ADST_ADST = 3
1596 #if CONFIG_EXT_TX
1597     { aom_iadst8_c, aom_idct8_c },   // FLIPADST_DCT
1598     { aom_idct8_c, aom_iadst8_c },   // DCT_FLIPADST
1599     { aom_iadst8_c, aom_iadst8_c },  // FLIPADST_FLIPADST
1600     { aom_iadst8_c, aom_iadst8_c },  // ADST_FLIPADST
1601     { aom_iadst8_c, aom_iadst8_c },  // FLIPADST_ADST
1602     { iidtx8_c, iidtx8_c },          // IDTX
1603     { aom_idct8_c, iidtx8_c },       // V_DCT
1604     { iidtx8_c, aom_idct8_c },       // H_DCT
1605     { aom_iadst8_c, iidtx8_c },      // V_ADST
1606     { iidtx8_c, aom_iadst8_c },      // H_ADST
1607     { aom_iadst8_c, iidtx8_c },      // V_FLIPADST
1608     { iidtx8_c, aom_iadst8_c },      // H_FLIPADST
1609 #endif
1610 #endif
1611   };
1612 
1613   int i, j;
1614   tran_low_t tmp[8][8];
1615   tran_low_t out[8][8];
1616   tran_low_t *outp = &out[0][0];
1617   int outstride = 8;
1618 
1619 #if CONFIG_LGT
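  // The second argument of get_lgt8() selects the column (1) or row (0)
  // matrix; when an LGT applies, it replaces the corresponding 8-point 1-D
  // transform below.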
1620   const tran_high_t *lgtmtx_col[1];
1621   const tran_high_t *lgtmtx_row[1];
1622   int use_lgt_col = get_lgt8(txfm_param, 1, lgtmtx_col);
1623   int use_lgt_row = get_lgt8(txfm_param, 0, lgtmtx_row);
1624 #endif
1625 
1626   // inverse transform row vectors
1627   for (i = 0; i < 8; ++i) {
1628 #if CONFIG_DAALA_DCT8
1629     tran_low_t temp_in[8];
1630     for (j = 0; j < 8; j++) temp_in[j] = input[j] * 2;
1631     IHT_8[tx_type].rows(temp_in, out[i]);
1632 #else
1633 #if CONFIG_LGT
1634     if (use_lgt_row)
1635       ilgt8(input, out[i], lgtmtx_row[0]);
1636     else
1637 #endif
1638       IHT_8[tx_type].rows(input, out[i]);
1639 #endif
1640     input += 8;
1641   }
1642 
1643   // transpose
1644   for (i = 0; i < 8; i++) {
1645     for (j = 0; j < 8; j++) {
1646       tmp[j][i] = out[i][j];
1647     }
1648   }
1649 
1650   // inverse transform column vectors
1651   for (i = 0; i < 8; ++i) {
1652 #if CONFIG_LGT
1653     if (use_lgt_col)
1654       ilgt8(tmp[i], out[i], lgtmtx_col[0]);
1655     else
1656 #endif
1657       IHT_8[tx_type].cols(tmp[i], out[i]);
1658   }
1659 
1660 #if CONFIG_EXT_TX
1661   maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 8, 8);
1662 #endif
1663 
1664   // Sum with the destination
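  // ROUND_POWER_OF_TWO(v, n) is a round-to-nearest shift,
  // (v + (1 << (n - 1))) >> n. The Daala path uses a smaller final shift
  // because the Daala transforms carry different internal scaling (note the
  // row inputs were pre-scaled by 2 above).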
1665   for (i = 0; i < 8; ++i) {
1666     for (j = 0; j < 8; ++j) {
1667       int d = i * stride + j;
1668       int s = j * outstride + i;
1669 #if CONFIG_DAALA_DCT8
1670       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
1671 #else
1672       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
1673 #endif
1674     }
1675   }
1676 }
1677 
1678 void av1_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
1679                             const TxfmParam *txfm_param) {
1680   const TX_TYPE tx_type = txfm_param->tx_type;
1681 #if CONFIG_MRC_TX
1682   assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
1683 #endif  // CONFIG_MRC_TX
1684 #if CONFIG_DCT_ONLY
1685   assert(tx_type == DCT_DCT);
1686 #endif
1687   static const transform_2d IHT_16[] = {
1688 #if CONFIG_DAALA_DCT16
1689     { daala_idct16, daala_idct16 },  // DCT_DCT  = 0
1690     { daala_idst16, daala_idct16 },  // ADST_DCT = 1
1691     { daala_idct16, daala_idst16 },  // DCT_ADST = 2
1692     { daala_idst16, daala_idst16 },  // ADST_ADST = 3
1693 #if CONFIG_EXT_TX
1694     { daala_idst16, daala_idct16 },  // FLIPADST_DCT
1695     { daala_idct16, daala_idst16 },  // DCT_FLIPADST
1696     { daala_idst16, daala_idst16 },  // FLIPADST_FLIPADST
1697     { daala_idst16, daala_idst16 },  // ADST_FLIPADST
1698     { daala_idst16, daala_idst16 },  // FLIPADST_ADST
1699     { daala_idtx16, daala_idtx16 },  // IDTX
1700     { daala_idct16, daala_idtx16 },  // V_DCT
1701     { daala_idtx16, daala_idct16 },  // H_DCT
1702     { daala_idst16, daala_idtx16 },  // V_ADST
1703     { daala_idtx16, daala_idst16 },  // H_ADST
1704     { daala_idst16, daala_idtx16 },  // V_FLIPADST
1705     { daala_idtx16, daala_idst16 },  // H_FLIPADST
1706 #endif
1707 #else
1708     { aom_idct16_c, aom_idct16_c },    // DCT_DCT  = 0
1709     { aom_iadst16_c, aom_idct16_c },   // ADST_DCT = 1
1710     { aom_idct16_c, aom_iadst16_c },   // DCT_ADST = 2
1711     { aom_iadst16_c, aom_iadst16_c },  // ADST_ADST = 3
1712 #if CONFIG_EXT_TX
1713     { aom_iadst16_c, aom_idct16_c },   // FLIPADST_DCT
1714     { aom_idct16_c, aom_iadst16_c },   // DCT_FLIPADST
1715     { aom_iadst16_c, aom_iadst16_c },  // FLIPADST_FLIPADST
1716     { aom_iadst16_c, aom_iadst16_c },  // ADST_FLIPADST
1717     { aom_iadst16_c, aom_iadst16_c },  // FLIPADST_ADST
1718     { iidtx16_c, iidtx16_c },          // IDTX
1719     { aom_idct16_c, iidtx16_c },       // V_DCT
1720     { iidtx16_c, aom_idct16_c },       // H_DCT
1721     { aom_iadst16_c, iidtx16_c },      // V_ADST
1722     { iidtx16_c, aom_iadst16_c },      // H_ADST
1723     { aom_iadst16_c, iidtx16_c },      // V_FLIPADST
1724     { iidtx16_c, aom_iadst16_c },      // H_FLIPADST
1725 #endif
1726 #endif
1727   };
1728 
1729   int i, j;
1730   tran_low_t tmp[16][16];
1731   tran_low_t out[16][16];
1732   tran_low_t *outp = &out[0][0];
1733   int outstride = 16;
1734 
1735   // inverse transform row vectors
1736   for (i = 0; i < 16; ++i) {
1737 #if CONFIG_DAALA_DCT16
1738     tran_low_t temp_in[16];
1739     for (j = 0; j < 16; j++) temp_in[j] = input[j] * 2;
1740     IHT_16[tx_type].rows(temp_in, out[i]);
1741 #else
1742     IHT_16[tx_type].rows(input, out[i]);
1743 #endif
1744     input += 16;
1745   }
1746 
1747   // transpose
1748   for (i = 0; i < 16; i++) {
1749     for (j = 0; j < 16; j++) {
1750       tmp[j][i] = out[i][j];
1751     }
1752   }
1753 
1754   // inverse transform column vectors
1755   for (i = 0; i < 16; ++i) IHT_16[tx_type].cols(tmp[i], out[i]);
1756 
1757 #if CONFIG_EXT_TX
1758   maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 16, 16);
1759 #endif
1760 
1761   // Sum with the destination
1762   for (i = 0; i < 16; ++i) {
1763     for (j = 0; j < 16; ++j) {
1764       int d = i * stride + j;
1765       int s = j * outstride + i;
1766 #if CONFIG_DAALA_DCT16
1767       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
1768 #else
1769       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
1770 #endif
1771     }
1772   }
1773 }
1774 
1775 #if CONFIG_EXT_TX || CONFIG_DAALA_DCT32
1776 void av1_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride,
1777                              const TxfmParam *txfm_param) {
1778   const TX_TYPE tx_type = txfm_param->tx_type;
1779 #if CONFIG_DCT_ONLY
1780   assert(tx_type == DCT_DCT);
1781 #endif
1782   static const transform_2d IHT_32[] = {
1783 #if CONFIG_DAALA_DCT32
1784     { daala_idct32, daala_idct32 },  // DCT_DCT
1785 #if CONFIG_EXT_TX
1786     { daala_idst32, daala_idct32 },  // ADST_DCT
1787     { daala_idct32, daala_idst32 },  // DCT_ADST
1788     { daala_idst32, daala_idst32 },  // ADST_ADST
1789     { daala_idst32, daala_idct32 },  // FLIPADST_DCT
1790     { daala_idct32, daala_idst32 },  // DCT_FLIPADST
1791     { daala_idst32, daala_idst32 },  // FLIPADST_FLIPADST
1792     { daala_idst32, daala_idst32 },  // ADST_FLIPADST
1793     { daala_idst32, daala_idst32 },  // FLIPADST_ADST
1794     { daala_idtx32, daala_idtx32 },  // IDTX
1795     { daala_idct32, daala_idtx32 },  // V_DCT
1796     { daala_idtx32, daala_idct32 },  // H_DCT
1797     { daala_idst32, daala_idtx32 },  // V_ADST
1798     { daala_idtx32, daala_idst32 },  // H_ADST
1799     { daala_idst32, daala_idtx32 },  // V_FLIPADST
1800     { daala_idtx32, daala_idst32 },  // H_FLIPADST
1801 #endif
1802 #else
1803     { aom_idct32_c, aom_idct32_c },      // DCT_DCT
1804 #if CONFIG_EXT_TX
1805     { ihalfright32_c, aom_idct32_c },    // ADST_DCT
1806     { aom_idct32_c, ihalfright32_c },    // DCT_ADST
1807     { ihalfright32_c, ihalfright32_c },  // ADST_ADST
1808     { ihalfright32_c, aom_idct32_c },    // FLIPADST_DCT
1809     { aom_idct32_c, ihalfright32_c },    // DCT_FLIPADST
1810     { ihalfright32_c, ihalfright32_c },  // FLIPADST_FLIPADST
1811     { ihalfright32_c, ihalfright32_c },  // ADST_FLIPADST
1812     { ihalfright32_c, ihalfright32_c },  // FLIPADST_ADST
1813     { iidtx32_c, iidtx32_c },            // IDTX
1814     { aom_idct32_c, iidtx32_c },         // V_DCT
1815     { iidtx32_c, aom_idct32_c },         // H_DCT
1816     { ihalfright32_c, iidtx32_c },       // V_ADST
1817     { iidtx32_c, ihalfright32_c },       // H_ADST
1818     { ihalfright32_c, iidtx32_c },       // V_FLIPADST
1819     { iidtx32_c, ihalfright32_c },       // H_FLIPADST
1820 #endif
1821 #endif
1822   };
1823 
1824   int i, j;
1825   tran_low_t tmp[32][32];
1826   tran_low_t out[32][32];
1827   tran_low_t *outp = &out[0][0];
1828   int outstride = 32;
1829 
1830   // inverse transform row vectors
1831   for (i = 0; i < 32; ++i) {
1832 #if CONFIG_DAALA_DCT32
1833     tran_low_t temp_in[32];
1834     for (j = 0; j < 32; j++) temp_in[j] = input[j] * 2;
1835     IHT_32[tx_type].rows(temp_in, out[i]);
1836 #else
1837     IHT_32[tx_type].rows(input, out[i]);
1838 #endif
1839     input += 32;
1840   }
1841 
1842   // transpose
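  // Note: the Daala path scales the intermediate up by 4 between the two
  // passes to preserve precision; the difference in overall gain is absorbed
  // by the smaller final shift below (5 instead of 6).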
1843   for (i = 0; i < 32; i++) {
1844     for (j = 0; j < 32; j++) {
1845 #if CONFIG_DAALA_DCT32
1846       tmp[j][i] = out[i][j] * 4;
1847 #else
1848       tmp[j][i] = out[i][j];
1849 #endif
1850     }
1851   }
1852 
1853   // inverse transform column vectors
1854   for (i = 0; i < 32; ++i) IHT_32[tx_type].cols(tmp[i], out[i]);
1855 
1856   maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 32, 32);
1857 
1858   // Sum with the destination
1859   for (i = 0; i < 32; ++i) {
1860     for (j = 0; j < 32; ++j) {
1861       int d = i * stride + j;
1862       int s = j * outstride + i;
1863 #if CONFIG_DAALA_DCT32
1864       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
1865 #else
1866       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
1867 #endif
1868     }
1869   }
1870 }
1871 #endif  // CONFIG_EXT_TX || CONFIG_DAALA_DCT32
1872 
1873 #if CONFIG_TX64X64
1874 void av1_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest, int stride,
1875                              const TxfmParam *txfm_param) {
1876   const TX_TYPE tx_type = txfm_param->tx_type;
1877 #if CONFIG_MRC_TX
1878   assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
1879 #endif  // CONFIG_MRC_TX
1880 #if CONFIG_DCT_ONLY
1881   assert(tx_type == DCT_DCT);
1882 #endif
1883   static const transform_2d IHT_64[] = {
1884 #if CONFIG_DAALA_DCT64
1885     { daala_idct64, daala_idct64 },  // DCT_DCT
1886     { daala_idst64, daala_idct64 },  // ADST_DCT
1887     { daala_idct64, daala_idst64 },  // DCT_ADST
1888     { daala_idst64, daala_idst64 },  // ADST_ADST
1889 #if CONFIG_EXT_TX
1890     { daala_idst64, daala_idct64 },  // FLIPADST_DCT
1891     { daala_idct64, daala_idst64 },  // DCT_FLIPADST
1892     { daala_idst64, daala_idst64 },  // FLIPADST_FLIPADST
1893     { daala_idst64, daala_idst64 },  // ADST_FLIPADST
1894     { daala_idst64, daala_idst64 },  // FLIPADST_ADST
1895     { daala_idtx64, daala_idtx64 },  // IDTX
1896     { daala_idct64, daala_idtx64 },  // V_DCT
1897     { daala_idtx64, daala_idct64 },  // H_DCT
1898     { daala_idst64, daala_idtx64 },  // V_ADST
1899     { daala_idtx64, daala_idst64 },  // H_ADST
1900     { daala_idst64, daala_idtx64 },  // V_FLIPADST
1901     { daala_idtx64, daala_idst64 },  // H_FLIPADST
1902 #endif
1903 #else
1904     { idct64_col_c, idct64_row_c },      // DCT_DCT
1905     { ihalfright64_c, idct64_row_c },    // ADST_DCT
1906     { idct64_col_c, ihalfright64_c },    // DCT_ADST
1907     { ihalfright64_c, ihalfright64_c },  // ADST_ADST
1908 #if CONFIG_EXT_TX
1909     { ihalfright64_c, idct64_row_c },    // FLIPADST_DCT
1910     { idct64_col_c, ihalfright64_c },    // DCT_FLIPADST
1911     { ihalfright64_c, ihalfright64_c },  // FLIPADST_FLIPADST
1912     { ihalfright64_c, ihalfright64_c },  // ADST_FLIPADST
1913     { ihalfright64_c, ihalfright64_c },  // FLIPADST_ADST
1914     { iidtx64_c, iidtx64_c },            // IDTX
1915     { idct64_col_c, iidtx64_c },         // V_DCT
1916     { iidtx64_c, idct64_row_c },         // H_DCT
1917     { ihalfright64_c, iidtx64_c },       // V_ADST
1918     { iidtx64_c, ihalfright64_c },       // H_ADST
1919     { ihalfright64_c, iidtx64_c },       // V_FLIPADST
1920     { iidtx64_c, ihalfright64_c },       // H_FLIPADST
1921 #endif
1922 #endif
1923   };
1924 
1925   int i, j;
1926   tran_low_t tmp[64][64];
1927   tran_low_t out[64][64];
1928   tran_low_t *outp = &out[0][0];
1929   int outstride = 64;
1930 
1931   // inverse transform row vectors
1932   for (i = 0; i < 64; ++i) {
1933 #if CONFIG_DAALA_DCT64
1934     tran_low_t temp_in[64];
1935     for (j = 0; j < 64; j++) temp_in[j] = input[j] * 2;
1936     IHT_64[tx_type].rows(temp_in, out[i]);
1937 // Do not rescale intermediate for Daala
1938 #else
1939     IHT_64[tx_type].rows(input, out[i]);
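    // Scale the row output down by 1 bit to limit the intermediate dynamic
    // range going into the column pass.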
1940     for (j = 0; j < 64; ++j) out[i][j] = ROUND_POWER_OF_TWO(out[i][j], 1);
1941 #endif
1942     input += 64;
1943   }
1944 
1945   // transpose
1946   for (i = 0; i < 64; i++) {
1947     for (j = 0; j < 64; j++) {
1948       tmp[j][i] = out[i][j];
1949     }
1950   }
1951 
1952   // inverse transform column vectors
1953   for (i = 0; i < 64; ++i) IHT_64[tx_type].cols(tmp[i], out[i]);
1954 
1955 #if CONFIG_EXT_TX
1956   maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 64, 64);
1957 #endif  // CONFIG_EXT_TX
1958 
1959   // Sum with the destination
1960   for (i = 0; i < 64; ++i) {
1961     for (j = 0; j < 64; ++j) {
1962       int d = i * stride + j;
1963       int s = j * outstride + i;
1964 #if CONFIG_DAALA_DCT64
1965       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 2));
1966 #else
1967       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
1968 #endif
1969     }
1970   }
1971 }
1972 
1973 void av1_iht64x32_2048_add_c(const tran_low_t *input, uint8_t *dest, int stride,
1974                              const TxfmParam *txfm_param) {
1975   const TX_TYPE tx_type = txfm_param->tx_type;
1976 #if CONFIG_MRC_TX
1977   assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
1978 #endif  // CONFIG_MRC_TX
1979 #if CONFIG_DCT_ONLY
1980   assert(tx_type == DCT_DCT);
1981 #endif
1982   static const transform_2d IHT_64x32[] = {
1983     { aom_idct32_c, idct64_row_c },      // DCT_DCT
1984     { ihalfright32_c, idct64_row_c },    // ADST_DCT
1985     { aom_idct32_c, ihalfright64_c },    // DCT_ADST
1986     { ihalfright32_c, ihalfright64_c },  // ADST_ADST
1987 #if CONFIG_EXT_TX
1988     { ihalfright32_c, idct64_row_c },    // FLIPADST_DCT
1989     { aom_idct32_c, ihalfright64_c },    // DCT_FLIPADST
1990     { ihalfright32_c, ihalfright64_c },  // FLIPADST_FLIPADST
1991     { ihalfright32_c, ihalfright64_c },  // ADST_FLIPADST
1992     { ihalfright32_c, ihalfright64_c },  // FLIPADST_ADST
1993     { iidtx32_c, iidtx64_c },            // IDTX
1994     { aom_idct32_c, iidtx64_c },         // V_DCT
1995     { iidtx32_c, idct64_row_c },         // H_DCT
1996     { ihalfright32_c, iidtx64_c },       // V_ADST
1997     { iidtx32_c, ihalfright64_c },       // H_ADST
1998     { ihalfright32_c, iidtx64_c },       // V_FLIPADST
1999     { iidtx32_c, ihalfright64_c },       // H_FLIPADST
2000 #endif
2001   };
2002   const int n = 32;
2003   const int n2 = 64;
2004 
2005   int i, j;
2006   tran_low_t out[64][32], tmp[64][32], outtmp[64];
2007   tran_low_t *outp = &out[0][0];
2008   int outstride = n;
2009 
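  // Note: rows are rescaled by 1/sqrt(2) so that the combined 2-D scaling of
  // this rectangular transform stays a power of two (cf. the Sqrt2 factor
  // used for the 16x32 and 32x16 sizes above).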
2010   // inverse transform row vectors and transpose
2011   for (i = 0; i < n; ++i) {
2012     IHT_64x32[tx_type].rows(input, outtmp);
2013     for (j = 0; j < n2; ++j)
2014       tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * InvSqrt2);
2015     input += n2;
2016   }
2017 
2018   // inverse transform column vectors
2019   for (i = 0; i < n2; ++i) IHT_64x32[tx_type].cols(tmp[i], out[i]);
2020 
2021 #if CONFIG_EXT_TX
2022   maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n2);
2023 #endif
2024 
2025   // Sum with the destination
2026   for (i = 0; i < n; ++i) {
2027     for (j = 0; j < n2; ++j) {
2028       int d = i * stride + j;
2029       int s = j * outstride + i;
2030       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
2031     }
2032   }
2033 }
2034 
2035 void av1_iht32x64_2048_add_c(const tran_low_t *input, uint8_t *dest, int stride,
2036                              const TxfmParam *txfm_param) {
2037   const TX_TYPE tx_type = txfm_param->tx_type;
2038 #if CONFIG_MRC_TX
2039   assert(tx_type != MRC_DCT && "Invalid tx type for tx size");
2040 #endif  // CONFIG_MRC_TX
2041 #if CONFIG_DCT_ONLY
2042   assert(tx_type == DCT_DCT);
2043 #endif
2044   static const transform_2d IHT_32x64[] = {
2045     { idct64_col_c, aom_idct32_c },      // DCT_DCT
2046     { ihalfright64_c, aom_idct32_c },    // ADST_DCT
2047     { idct64_col_c, ihalfright32_c },    // DCT_ADST
2048     { ihalfright64_c, ihalfright32_c },  // ADST_ADST
2049 #if CONFIG_EXT_TX
2050     { ihalfright64_c, aom_idct32_c },    // FLIPADST_DCT
2051     { idct64_col_c, ihalfright32_c },    // DCT_FLIPADST
2052     { ihalfright64_c, ihalfright32_c },  // FLIPADST_FLIPADST
2053     { ihalfright64_c, ihalfright32_c },  // ADST_FLIPADST
2054     { ihalfright64_c, ihalfright32_c },  // FLIPADST_ADST
2055     { iidtx64_c, iidtx32_c },            // IDTX
2056     { idct64_col_c, iidtx32_c },         // V_DCT
2057     { iidtx64_c, aom_idct32_c },         // H_DCT
2058     { ihalfright64_c, iidtx32_c },       // V_ADST
2059     { iidtx64_c, ihalfright32_c },       // H_ADST
2060     { ihalfright64_c, iidtx32_c },       // V_FLIPADST
2061     { iidtx64_c, ihalfright32_c },       // H_FLIPADST
2062 #endif
2063   };
2064 
2065   const int n = 32;
2066   const int n2 = 64;
2067   int i, j;
2068   tran_low_t out[32][64], tmp[32][64], outtmp[32];
2069   tran_low_t *outp = &out[0][0];
2070   int outstride = n2;
2071 
2072   // inverse transform row vectors and transpose
2073   for (i = 0; i < n2; ++i) {
2074     IHT_32x64[tx_type].rows(input, outtmp);
2075     for (j = 0; j < n; ++j)
2076       tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * InvSqrt2);
2077     input += n;
2078   }
2079 
2080   // inverse transform column vectors
2081   for (i = 0; i < n; ++i) IHT_32x64[tx_type].cols(tmp[i], out[i]);
2082 
2083 #if CONFIG_EXT_TX
2084   maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n2, n);
2085 #endif
2086 
2087   // Sum with the destination
2088   for (i = 0; i < n2; ++i) {
2089     for (j = 0; j < n; ++j) {
2090       int d = i * stride + j;
2091       int s = j * outstride + i;
2092       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
2093     }
2094   }
2095 }
2096 
2097 #endif  // CONFIG_TX64X64
2098 
2099 // idct
2100 void av1_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
2101                      const TxfmParam *txfm_param) {
2102   const int eob = txfm_param->eob;
2103   if (eob > 1)
2104     av1_iht4x4_16_add(input, dest, stride, txfm_param);
2105   else
2106     aom_idct4x4_1_add(input, dest, stride);
2107 }
2108 
2109 void av1_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
2110                      const TxfmParam *txfm_param) {
2111   const int eob = txfm_param->eob;
2112   if (eob > 1)
2113     aom_iwht4x4_16_add(input, dest, stride);
2114   else
2115     aom_iwht4x4_1_add(input, dest, stride);
2116 }
2117 
2118 #if !CONFIG_DAALA_DCT8
2119 static void idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
2120                         const TxfmParam *txfm_param) {
2121 // If dc is 1, then input[0] is the reconstructed value and does not need
2122 // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >= 1.
2123 
2124 // The calculation can be simplified if there are not many non-zero DCT
2125 // coefficients. Use eobs to decide what to do.
2126 // TODO(yunqingwang): "eobs = 1" case is also handled in av1_short_idct8x8_c.
2127 // Combine that with code here.
2128 #if CONFIG_ADAPT_SCAN
2129   const int16_t half = txfm_param->eob_threshold[0];
2130 #else
2131   const int16_t half = 12;
2132 #endif
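  // An eob of at most 'half' means the nonzero coefficients are confined to
  // the low-frequency region handled by the reduced 12-coefficient path
  // below; with CONFIG_ADAPT_SCAN the threshold follows the adaptive scan.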
2133 
2134   const int eob = txfm_param->eob;
2135   if (eob == 1)
2136     // DC only DCT coefficient
2137     aom_idct8x8_1_add(input, dest, stride);
2138   else if (eob <= half)
2139     aom_idct8x8_12_add(input, dest, stride);
2140   else
2141     aom_idct8x8_64_add(input, dest, stride);
2142 }
2143 #endif
2144 
2145 #if !CONFIG_DAALA_DCT16
2146 static void idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
2147                           const TxfmParam *txfm_param) {
2148 // The calculation can be simplified if there are not many non-zero DCT
2149 // coefficients. Use eobs to separate different cases.
2150 #if CONFIG_ADAPT_SCAN
2151   const int16_t half = txfm_param->eob_threshold[0];
2152   const int16_t quarter = txfm_param->eob_threshold[1];
2153 #else
2154   const int16_t half = 38;
2155   const int16_t quarter = 10;
2156 #endif
2157 
2158   const int eob = txfm_param->eob;
2159   if (eob == 1) /* DC only DCT coefficient. */
2160     aom_idct16x16_1_add(input, dest, stride);
2161   else if (eob <= quarter)
2162     aom_idct16x16_10_add(input, dest, stride);
2163   else if (eob <= half)
2164     aom_idct16x16_38_add(input, dest, stride);
2165   else
2166     aom_idct16x16_256_add(input, dest, stride);
2167 }
2168 #endif
2169 
2170 #if CONFIG_MRC_TX
2171 static void imrc32x32_add_c(const tran_low_t *input, uint8_t *dest, int stride,
2172                             const TxfmParam *txfm_param) {
2173 #if CONFIG_ADAPT_SCAN
2174   const int16_t half = txfm_param->eob_threshold[0];
2175   const int16_t quarter = txfm_param->eob_threshold[1];
2176 #else
2177   const int16_t half = 135;
2178   const int16_t quarter = 34;
2179 #endif
2180 
2181   const int eob = txfm_param->eob;
2182   int n_masked_vals = 0;
2183   uint8_t *mask;
2184   uint8_t mask_tmp[32 * 32];
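  // The MRC mask is either signalled explicitly (and carried in
  // txfm_param->mask) or re-derived here from the reconstructed prediction,
  // so that encoder and decoder stay in sync.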
2185   if (eob == 1) {
2186     aom_idct32x32_1_add_c(input, dest, stride);
2187   } else {
2188     if ((txfm_param->is_inter && SIGNAL_MRC_MASK_INTER) ||
2189         (!txfm_param->is_inter && SIGNAL_MRC_MASK_INTRA)) {
2190       mask = txfm_param->mask;
2191     } else {
2192       n_masked_vals =
2193           get_mrc_pred_mask(txfm_param->dst, txfm_param->stride, mask_tmp, 32,
2194                             32, 32, txfm_param->is_inter);
2195       if (!is_valid_mrc_mask(n_masked_vals, 32, 32))
2196         assert(0 && "Invalid MRC mask");
2197       mask = mask_tmp;
2198     }
2199     if (eob <= quarter)
2200       // non-zero coeff only in upper-left 8x8
2201       aom_imrc32x32_34_add_c(input, dest, stride, mask);
2202     else if (eob <= half)
2203       // non-zero coeff only in upper-left 16x16
2204       aom_imrc32x32_135_add_c(input, dest, stride, mask);
2205     else
2206       aom_imrc32x32_1024_add_c(input, dest, stride, mask);
2207   }
2208 }
2209 #endif  // CONFIG_MRC_TX
2210 
2211 #if !CONFIG_DAALA_DCT32
2212 static void idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
2213                           const TxfmParam *txfm_param) {
2214 #if CONFIG_ADAPT_SCAN
2215   const int16_t half = txfm_param->eob_threshold[0];
2216   const int16_t quarter = txfm_param->eob_threshold[1];
2217 #else
2218   const int16_t half = 135;
2219   const int16_t quarter = 34;
2220 #endif
2221 
2222   const int eob = txfm_param->eob;
2223   if (eob == 1)
2224     aom_idct32x32_1_add(input, dest, stride);
2225   else if (eob <= quarter)
2226     // non-zero coeff only in upper-left 8x8
2227     aom_idct32x32_34_add(input, dest, stride);
2228   else if (eob <= half)
2229     // non-zero coeff only in upper-left 16x16
2230     aom_idct32x32_135_add(input, dest, stride);
2231   else
2232     aom_idct32x32_1024_add(input, dest, stride);
2233 }
2234 #endif
2235 
2236 #if CONFIG_TX64X64 && !CONFIG_DAALA_DCT64
2237 static void idct64x64_add(const tran_low_t *input, uint8_t *dest, int stride,
2238                           const TxfmParam *txfm_param) {
2240   av1_iht64x64_4096_add(input, dest, stride, txfm_param);
2241 }
2242 #endif  // CONFIG_TX64X64 && !CONFIG_DAALA_DCT64
2243 
2244 #if CONFIG_CHROMA_2X2
2245 static void inv_txfm_add_2x2(const tran_low_t *input, uint8_t *dest, int stride,
2246                              const TxfmParam *txfm_param) {
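  // A 2x2 Walsh-Hadamard-style butterfly: sums and differences between the
  // two rows, then between the two columns, with a final >> 2 to normalize.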
2247   tran_high_t a1 = input[0] >> UNIT_QUANT_SHIFT;
2248   tran_high_t b1 = input[1] >> UNIT_QUANT_SHIFT;
2249   tran_high_t c1 = input[2] >> UNIT_QUANT_SHIFT;
2250   tran_high_t d1 = input[3] >> UNIT_QUANT_SHIFT;
2251 
2252   tran_high_t a2 = a1 + c1;
2253   tran_high_t b2 = b1 + d1;
2254   tran_high_t c2 = a1 - c1;
2255   tran_high_t d2 = b1 - d1;
2256 
2257   (void)txfm_param;
2258 
2259   a1 = (a2 + b2) >> 2;
2260   b1 = (a2 - b2) >> 2;
2261   c1 = (c2 + d2) >> 2;
2262   d1 = (c2 - d2) >> 2;
2263 
2264   dest[0] = clip_pixel_add(dest[0], WRAPLOW(a1));
2265   dest[1] = clip_pixel_add(dest[1], WRAPLOW(b1));
2266   dest[stride] = clip_pixel_add(dest[stride], WRAPLOW(c1));
2267   dest[stride + 1] = clip_pixel_add(dest[stride + 1], WRAPLOW(d1));
2268 }
2269 #endif
2270 
2271 static void inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest, int stride,
2272                              const TxfmParam *txfm_param) {
2273   const TX_TYPE tx_type = txfm_param->tx_type;
2274   if (txfm_param->lossless) {
2275     assert(tx_type == DCT_DCT);
2276     av1_iwht4x4_add(input, dest, stride, txfm_param);
2277     return;
2278   }
2279 
2280   switch (tx_type) {
2281 #if !CONFIG_DAALA_DCT4
2282     case DCT_DCT: av1_idct4x4_add(input, dest, stride, txfm_param); break;
2283 #else
2284     case DCT_DCT:
2285 #endif
2286     case ADST_DCT:
2287     case DCT_ADST:
2288     case ADST_ADST:
2289 #if CONFIG_LGT || CONFIG_DAALA_DCT4
2290       // LGT only exists in the C version
2291       av1_iht4x4_16_add_c(input, dest, stride, txfm_param);
2292       break;
2293 #else
2294       av1_iht4x4_16_add(input, dest, stride, txfm_param);
2295       break;
2296 #endif
2297 #if CONFIG_EXT_TX
2298     case FLIPADST_DCT:
2299     case DCT_FLIPADST:
2300     case FLIPADST_FLIPADST:
2301     case ADST_FLIPADST:
2302     case FLIPADST_ADST:
2303 #if CONFIG_LGT || CONFIG_DAALA_DCT4
2304       av1_iht4x4_16_add_c(input, dest, stride, txfm_param);
2305       break;
2306 #else
2307       av1_iht4x4_16_add(input, dest, stride, txfm_param);
2308       break;
2309 #endif
2310     case V_DCT:
2311     case H_DCT:
2312     case V_ADST:
2313     case H_ADST:
2314     case V_FLIPADST:
2315     case H_FLIPADST:
2316       // Use the C version since DST only exists in C code
2317       av1_iht4x4_16_add_c(input, dest, stride, txfm_param);
2318       break;
2319     case IDTX: inv_idtx_add_c(input, dest, stride, 4, 4, tx_type); break;
2320 #endif  // CONFIG_EXT_TX
2321     default: assert(0); break;
2322   }
2323 }
2324 
2325 static void inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest, int stride,
2326                              const TxfmParam *txfm_param) {
2327 #if CONFIG_LGT
2328   av1_iht4x8_32_add_c(input, dest, stride, txfm_param);
2329 #else
2330   av1_iht4x8_32_add(input, dest, stride, txfm_param);
2331 #endif
2332 }
2333 
2334 static void inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest, int stride,
2335                              const TxfmParam *txfm_param) {
2336 #if CONFIG_LGT
2337   av1_iht8x4_32_add_c(input, dest, stride, txfm_param);
2338 #else
2339   av1_iht8x4_32_add(input, dest, stride, txfm_param);
2340 #endif
2341 }
2342 
2343 // These will be used by the masked-tx experiment in the future.
2344 #if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)
2345 static void inv_txfm_add_4x16(const tran_low_t *input, uint8_t *dest,
2346                               int stride, const TxfmParam *txfm_param) {
2347 #if CONFIG_LGT
2348   av1_iht4x16_64_add_c(input, dest, stride, txfm_param);
2349 #else
2350   av1_iht4x16_64_add(input, dest, stride, txfm_param);
2351 #endif
2352 }
2353 
2354 static void inv_txfm_add_16x4(const tran_low_t *input, uint8_t *dest,
2355                               int stride, const TxfmParam *txfm_param) {
2356 #if CONFIG_LGT
2357   av1_iht16x4_64_add_c(input, dest, stride, txfm_param);
2358 #else
2359   av1_iht16x4_64_add(input, dest, stride, txfm_param);
2360 #endif
2361 }
2362 
2363 static void inv_txfm_add_8x32(const tran_low_t *input, uint8_t *dest,
2364                               int stride, const TxfmParam *txfm_param) {
2365 #if CONFIG_LGT
2366   av1_iht8x32_256_add_c(input, dest, stride, txfm_param);
2367 #else
2368   av1_iht8x32_256_add(input, dest, stride, txfm_param);
2369 #endif
2370 }
2371 
2372 static void inv_txfm_add_32x8(const tran_low_t *input, uint8_t *dest,
2373                               int stride, const TxfmParam *txfm_param) {
2374 #if CONFIG_LGT
2375   av1_iht32x8_256_add_c(input, dest, stride, txfm_param);
2376 #else
2377   av1_iht32x8_256_add(input, dest, stride, txfm_param);
2378 #endif
2379 }
2380 #endif
2381 
2382 static void inv_txfm_add_8x16(const tran_low_t *input, uint8_t *dest,
2383                               int stride, const TxfmParam *txfm_param) {
2384 #if CONFIG_LGT
2385   av1_iht8x16_128_add_c(input, dest, stride, txfm_param);
2386 #else
2387   av1_iht8x16_128_add(input, dest, stride, txfm_param);
2388 #endif
2389 }
2390 
2391 static void inv_txfm_add_16x8(const tran_low_t *input, uint8_t *dest,
2392                               int stride, const TxfmParam *txfm_param) {
2393 #if CONFIG_LGT
2394   av1_iht16x8_128_add_c(input, dest, stride, txfm_param);
2395 #else
2396   av1_iht16x8_128_add(input, dest, stride, txfm_param);
2397 #endif
2398 }
2399 
2400 static void inv_txfm_add_16x32(const tran_low_t *input, uint8_t *dest,
2401                                int stride, const TxfmParam *txfm_param) {
2402   av1_iht16x32_512_add(input, dest, stride, txfm_param);
2403 }
2404 
2405 static void inv_txfm_add_32x16(const tran_low_t *input, uint8_t *dest,
2406                                int stride, const TxfmParam *txfm_param) {
2407   av1_iht32x16_512_add(input, dest, stride, txfm_param);
2408 }
2409 
2410 #if CONFIG_TX64X64
2411 static void inv_txfm_add_32x64(const tran_low_t *input, uint8_t *dest,
2412                                int stride, const TxfmParam *txfm_param) {
2413   av1_iht32x64_2048_add(input, dest, stride, txfm_param);
2414 }
2415 
2416 static void inv_txfm_add_64x32(const tran_low_t *input, uint8_t *dest,
2417                                int stride, const TxfmParam *txfm_param) {
2418   av1_iht64x32_2048_add(input, dest, stride, txfm_param);
2419 }
2420 #endif  // CONFIG_TX64X64
2421 
2422 static void inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest, int stride,
2423                              const TxfmParam *txfm_param) {
2424   const TX_TYPE tx_type = txfm_param->tx_type;
2425   switch (tx_type) {
2426 #if !CONFIG_DAALA_DCT8
2427     case DCT_DCT: idct8x8_add(input, dest, stride, txfm_param); break;
2428 #else
2429     case DCT_DCT:
2430 #endif
2431     case ADST_DCT:
2432     case DCT_ADST:
2433     case ADST_ADST:
2434 #if CONFIG_LGT || CONFIG_DAALA_DCT8
2435       av1_iht8x8_64_add_c(input, dest, stride, txfm_param);
2436       break;
2437 #else
2438       av1_iht8x8_64_add(input, dest, stride, txfm_param);
2439       break;
2440 #endif
2441 #if CONFIG_EXT_TX
2442     case FLIPADST_DCT:
2443     case DCT_FLIPADST:
2444     case FLIPADST_FLIPADST:
2445     case ADST_FLIPADST:
2446     case FLIPADST_ADST:
2447 #if CONFIG_LGT || CONFIG_DAALA_DCT8
2448       av1_iht8x8_64_add_c(input, dest, stride, txfm_param);
2449       break;
2450 #else
2451       av1_iht8x8_64_add(input, dest, stride, txfm_param);
2452       break;
2453 #endif
2454     case V_DCT:
2455     case H_DCT:
2456     case V_ADST:
2457     case H_ADST:
2458     case V_FLIPADST:
2459     case H_FLIPADST:
2460       // Use the C version since DST only exists in C code
2461       av1_iht8x8_64_add_c(input, dest, stride, txfm_param);
2462       break;
2463     case IDTX: inv_idtx_add_c(input, dest, stride, 8, 8, tx_type); break;
2464 #endif  // CONFIG_EXT_TX
2465     default: assert(0); break;
2466   }
2467 }
2468 
2469 static void inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest,
2470                                int stride, const TxfmParam *txfm_param) {
2471   const TX_TYPE tx_type = txfm_param->tx_type;
2472   switch (tx_type) {
2473 #if !CONFIG_DAALA_DCT16
2474     case DCT_DCT: idct16x16_add(input, dest, stride, txfm_param); break;
2475 #else
2476     case DCT_DCT:
2477 #endif
2478     case ADST_DCT:
2479     case DCT_ADST:
2480     case ADST_ADST:
2481 #if CONFIG_DAALA_DCT16
2482       av1_iht16x16_256_add_c(input, dest, stride, txfm_param);
2483 #else
2484       av1_iht16x16_256_add(input, dest, stride, txfm_param);
2485 #endif  // CONFIG_DAALA_DCT16
2486       break;
2487 #if CONFIG_EXT_TX
2488     case FLIPADST_DCT:
2489     case DCT_FLIPADST:
2490     case FLIPADST_FLIPADST:
2491     case ADST_FLIPADST:
2492     case FLIPADST_ADST:
2493     case V_DCT:
2494     case H_DCT:
2495     case V_ADST:
2496     case H_ADST:
2497     case V_FLIPADST:
2498     case H_FLIPADST:
2499 #if CONFIG_DAALA_DCT16
2500       av1_iht16x16_256_add_c(input, dest, stride, txfm_param);
2501 #else
2502       av1_iht16x16_256_add(input, dest, stride, txfm_param);
2503 #endif  // CONFIG_DAALA_DCT16
2504       break;
2505     case IDTX: inv_idtx_add_c(input, dest, stride, 16, 16, tx_type); break;
2506 #endif  // CONFIG_EXT_TX
2507 #if CONFIG_MRC_TX
2508     case MRC_DCT: assert(0 && "Invalid tx type for tx size"); break;
2509 #endif  // CONFIG_MRC_TX
2510     default: assert(0); break;
2511   }
2512 }
2513 
2514 static void inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest,
2515                                int stride, const TxfmParam *txfm_param) {
2516   const TX_TYPE tx_type = txfm_param->tx_type;
2517   switch (tx_type) {
2518 #if !CONFIG_DAALA_DCT32
2519     case DCT_DCT: idct32x32_add(input, dest, stride, txfm_param); break;
2520 #else
2521     case DCT_DCT:
2522       av1_iht32x32_1024_add_c(input, dest, stride, txfm_param);
2523       break;
2524 #endif
2525 #if CONFIG_EXT_TX
2526     case ADST_DCT:
2527     case DCT_ADST:
2528     case ADST_ADST:
2529     case FLIPADST_DCT:
2530     case DCT_FLIPADST:
2531     case FLIPADST_FLIPADST:
2532     case ADST_FLIPADST:
2533     case FLIPADST_ADST:
2534     case V_DCT:
2535     case H_DCT:
2536     case V_ADST:
2537     case H_ADST:
2538     case V_FLIPADST:
2539     case H_FLIPADST:
2540       av1_iht32x32_1024_add_c(input, dest, stride, txfm_param);
2541       break;
2542     case IDTX: inv_idtx_add_c(input, dest, stride, 32, 32, tx_type); break;
2543 #endif  // CONFIG_EXT_TX
2544 #if CONFIG_MRC_TX
2545     case MRC_DCT: imrc32x32_add_c(input, dest, stride, txfm_param); break;
2546 #endif  // CONFIG_MRC_TX
2547     default: assert(0); break;
2548   }
2549 }
2550 
2551 #if CONFIG_TX64X64
2552 static void inv_txfm_add_64x64(const tran_low_t *input, uint8_t *dest,
2553                                int stride, const TxfmParam *txfm_param) {
2554   const TX_TYPE tx_type = txfm_param->tx_type;
2555   assert(tx_type == DCT_DCT);
2556   switch (tx_type) {
2557 #if !CONFIG_DAALA_DCT64
2558     case DCT_DCT: idct64x64_add(input, dest, stride, txfm_param); break;
2559 #else
2560     case DCT_DCT:
2561 #endif
2562 #if CONFIG_EXT_TX
2563     case ADST_DCT:
2564     case DCT_ADST:
2565     case ADST_ADST:
2566     case FLIPADST_DCT:
2567     case DCT_FLIPADST:
2568     case FLIPADST_FLIPADST:
2569     case ADST_FLIPADST:
2570     case FLIPADST_ADST:
2571     case V_DCT:
2572     case H_DCT:
2573     case V_ADST:
2574     case H_ADST:
2575     case V_FLIPADST:
2576     case H_FLIPADST:
2577       av1_iht64x64_4096_add_c(input, dest, stride, txfm_param);
2578       break;
2579     case IDTX: inv_idtx_add_c(input, dest, stride, 64, 64, tx_type); break;
2580 #endif  // CONFIG_EXT_TX
2581 #if CONFIG_MRC_TX
2582     case MRC_DCT: assert(0 && "Invalid tx type for tx size"); break;
2583 #endif  // CONFIG_MRC_TX
2584     default: assert(0); break;
2585   }
2586 }
2587 #endif  // CONFIG_TX64X64
2588 
2589 void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
2590                             int eob, int bd) {
2591   if (eob > 1)
2592     aom_highbd_iwht4x4_16_add(input, dest, stride, bd);
2593   else
2594     aom_highbd_iwht4x4_1_add(input, dest, stride, bd);
2595 }
2596 
2597 #if CONFIG_CHROMA_2X2
2598 static void highbd_inv_txfm_add_2x2(const tran_low_t *input, uint8_t *dest,
2599                                     int stride, const TxfmParam *txfm_param) {
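  // Same 2x2 butterfly as inv_txfm_add_2x2 above, but accumulating into
  // high-bitdepth pixels with bit-depth-aware clipping.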
2600   int eob = txfm_param->eob;
2601   int bd = txfm_param->bd;
2602   int lossless = txfm_param->lossless;
2603   const TX_TYPE tx_type = txfm_param->tx_type;
2604   tran_high_t a1 = input[0] >> UNIT_QUANT_SHIFT;
2605   tran_high_t b1 = input[1] >> UNIT_QUANT_SHIFT;
2606   tran_high_t c1 = input[2] >> UNIT_QUANT_SHIFT;
2607   tran_high_t d1 = input[3] >> UNIT_QUANT_SHIFT;
2608 
2609   tran_high_t a2 = a1 + c1;
2610   tran_high_t b2 = b1 + d1;
2611   tran_high_t c2 = a1 - c1;
2612   tran_high_t d2 = b1 - d1;
2613 
2614   uint16_t *dst = CONVERT_TO_SHORTPTR(dest);
2615 
2616   (void)tx_type;
2617   (void)lossless;
2618   (void)eob;
2619 
2620   a1 = (a2 + b2) >> 2;
2621   b1 = (a2 - b2) >> 2;
2622   c1 = (c2 + d2) >> 2;
2623   d1 = (c2 - d2) >> 2;
2624 
2625   dst[0] = highbd_clip_pixel_add(dst[0], a1, bd);
2626   dst[1] = highbd_clip_pixel_add(dst[1], b1, bd);
2627   dst[stride] = highbd_clip_pixel_add(dst[stride], c1, bd);
2628   dst[stride + 1] = highbd_clip_pixel_add(dst[stride + 1], d1, bd);
2629 }
2630 #endif
2631 
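// In the configurations that reach this code, tran_low_t has the same size as
// int32_t (checked by the assert below), so the cast is a safe
// reinterpretation rather than a conversion.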
2632 static const int32_t *cast_to_int32(const tran_low_t *input) {
2633   assert(sizeof(int32_t) == sizeof(tran_low_t));
2634   return (const int32_t *)input;
2635 }
2636 
2637 void av1_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
2638                                  int stride, const TxfmParam *txfm_param) {
2639   int eob = txfm_param->eob;
2640   int bd = txfm_param->bd;
2641   int lossless = txfm_param->lossless;
2642   const int32_t *src = cast_to_int32(input);
2643   const TX_TYPE tx_type = txfm_param->tx_type;
2644   if (lossless) {
2645     assert(tx_type == DCT_DCT);
2646     av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
2647     return;
2648   }
2649   switch (tx_type) {
2650     case DCT_DCT:
2651     case ADST_DCT:
2652     case DCT_ADST:
2653     case ADST_ADST:
2654       av1_inv_txfm2d_add_4x4(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
2655                              bd);
2656       break;
2657 #if CONFIG_EXT_TX
2658     case FLIPADST_DCT:
2659     case DCT_FLIPADST:
2660     case FLIPADST_FLIPADST:
2661     case ADST_FLIPADST:
2662     case FLIPADST_ADST:
2663       av1_inv_txfm2d_add_4x4(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
2664                              bd);
2665       break;
2666     // Use the C version for anything, including identity, for now
2667     case V_DCT:
2668     case H_DCT:
2669     case V_ADST:
2670     case H_ADST:
2671     case V_FLIPADST:
2672     case H_FLIPADST:
2673     case IDTX:
2674       av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
2675                                bd);
2676       break;
2677 #endif  // CONFIG_EXT_TX
2678     default: assert(0); break;
2679   }
2680 }
2681 
2682 void av1_highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest,
2683                                  int stride, const TxfmParam *txfm_param) {
2684   const int32_t *src = cast_to_int32(input);
2685   av1_inv_txfm2d_add_4x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
2686                            txfm_param->tx_type, txfm_param->bd);
2687 }
2688 
2689 void av1_highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest,
2690                                  int stride, const TxfmParam *txfm_param) {
2691   const int32_t *src = cast_to_int32(input);
2692   av1_inv_txfm2d_add_8x4_c(src, CONVERT_TO_SHORTPTR(dest), stride,
2693                            txfm_param->tx_type, txfm_param->bd);
2694 }
2695 
2696 static void highbd_inv_txfm_add_8x16(const tran_low_t *input, uint8_t *dest,
2697                                      int stride, const TxfmParam *txfm_param) {
2698   const int32_t *src = cast_to_int32(input);
2699   av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
2700                             txfm_param->tx_type, txfm_param->bd);
2701 }
2702 
2703 static void highbd_inv_txfm_add_16x8(const tran_low_t *input, uint8_t *dest,
2704                                      int stride, const TxfmParam *txfm_param) {
2705   const int32_t *src = cast_to_int32(input);
2706   av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
2707                             txfm_param->tx_type, txfm_param->bd);
2708 }
2709 
2710 static void highbd_inv_txfm_add_16x32(const tran_low_t *input, uint8_t *dest,
2711                                       int stride, const TxfmParam *txfm_param) {
2712   const int32_t *src = cast_to_int32(input);
2713   av1_inv_txfm2d_add_16x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
2714                              txfm_param->tx_type, txfm_param->bd);
2715 }
2716 
2717 static void highbd_inv_txfm_add_32x16(const tran_low_t *input, uint8_t *dest,
2718                                       int stride, const TxfmParam *txfm_param) {
2719   const int32_t *src = cast_to_int32(input);
2720   av1_inv_txfm2d_add_32x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
2721                              txfm_param->tx_type, txfm_param->bd);
2722 }
2723 
2724 #if CONFIG_TX64X64
2725 static void highbd_inv_txfm_add_32x64(const tran_low_t *input, uint8_t *dest,
2726                                       int stride, const TxfmParam *txfm_param) {
2727   const int32_t *src = cast_to_int32(input);
2728   av1_inv_txfm2d_add_32x64_c(src, CONVERT_TO_SHORTPTR(dest), stride,
2729                              txfm_param->tx_type, txfm_param->bd);
2730 }
2731 
2732 static void highbd_inv_txfm_add_64x32(const tran_low_t *input, uint8_t *dest,
2733                                       int stride, const TxfmParam *txfm_param) {
2734   const int32_t *src = cast_to_int32(input);
2735   av1_inv_txfm2d_add_64x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
2736                              txfm_param->tx_type, txfm_param->bd);
2737 }
2738 #endif  // CONFIG_TX64X64
2739 
2740 static void highbd_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest,
2741                                     int stride, const TxfmParam *txfm_param) {
2742   int bd = txfm_param->bd;
2743   const TX_TYPE tx_type = txfm_param->tx_type;
2744   const int32_t *src = cast_to_int32(input);
2745   switch (tx_type) {
2746     case DCT_DCT:
2747     case ADST_DCT:
2748     case DCT_ADST:
2749     case ADST_ADST:
2750       av1_inv_txfm2d_add_8x8(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
2751                              bd);
2752       break;
2753 #if CONFIG_EXT_TX
2754     case FLIPADST_DCT:
2755     case DCT_FLIPADST:
2756     case FLIPADST_FLIPADST:
2757     case ADST_FLIPADST:
2758     case FLIPADST_ADST:
2759       av1_inv_txfm2d_add_8x8(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
2760                              bd);
2761       break;
2762     // Use the C version for anything, including identity, for now
2763     case V_DCT:
2764     case H_DCT:
2765     case V_ADST:
2766     case H_ADST:
2767     case V_FLIPADST:
2768     case H_FLIPADST:
2769     case IDTX:
2770       av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
2771                                bd);
2772       break;
2773 #endif  // CONFIG_EXT_TX
2774     default: assert(0);
2775   }
2776 }
2777 
2778 static void highbd_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest,
2779                                       int stride, const TxfmParam *txfm_param) {
2780   int bd = txfm_param->bd;
2781   const TX_TYPE tx_type = txfm_param->tx_type;
2782   const int32_t *src = cast_to_int32(input);
2783   switch (tx_type) {
2784     case DCT_DCT:
2785     case ADST_DCT:
2786     case DCT_ADST:
2787     case ADST_ADST:
2788       av1_inv_txfm2d_add_16x16(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
2789                                bd);
2790       break;
2791 #if CONFIG_EXT_TX
2792     case FLIPADST_DCT:
2793     case DCT_FLIPADST:
2794     case FLIPADST_FLIPADST:
2795     case ADST_FLIPADST:
2796     case FLIPADST_ADST:
2797       av1_inv_txfm2d_add_16x16(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
2798                                bd);
2799       break;
2800     // Use the C version for anything, including identity, for now
2801     case V_DCT:
2802     case H_DCT:
2803     case V_ADST:
2804     case H_ADST:
2805     case V_FLIPADST:
2806     case H_FLIPADST:
2807     case IDTX:
2808       av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
2809                                  tx_type, bd);
2810       break;
2811 #endif  // CONFIG_EXT_TX
2812     default: assert(0);
2813   }
2814 }
2815 
2816 static void highbd_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest,
2817                                       int stride, const TxfmParam *txfm_param) {
2818   int bd = txfm_param->bd;
2819   const TX_TYPE tx_type = txfm_param->tx_type;
2820   const int32_t *src = cast_to_int32(input);
2821   switch (tx_type) {
2822     case DCT_DCT:
2823       av1_inv_txfm2d_add_32x32(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
2824                                bd);
2825       break;
2826 
2827     // The optimised version only supports DCT_DCT, so force use of
2828     // the C version for all other transform types.
2829     case ADST_DCT:
2830     case DCT_ADST:
2831     case ADST_ADST:
2832 #if CONFIG_EXT_TX
2833     case FLIPADST_DCT:
2834     case DCT_FLIPADST:
2835     case FLIPADST_FLIPADST:
2836     case ADST_FLIPADST:
2837     case FLIPADST_ADST:
2838     case IDTX:
2839     case V_DCT:
2840     case H_DCT:
2841     case V_ADST:
2842     case H_ADST:
2843     case V_FLIPADST:
2844     case H_FLIPADST:
2845 #endif  // CONFIG_EXT_TX
2846       av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
2847                                  tx_type, bd);
2848       break;
2849 
2850     default: assert(0);
2851   }
2852 }
2853 
2854 #if CONFIG_TX64X64
2855 static void highbd_inv_txfm_add_64x64(const tran_low_t *input, uint8_t *dest,
2856                                       int stride, const TxfmParam *txfm_param) {
2857   int bd = txfm_param->bd;
2858   const TX_TYPE tx_type = txfm_param->tx_type;
2859   const int32_t *src = cast_to_int32(input);
2860   switch (tx_type) {
2861     case DCT_DCT:
2862       av1_inv_txfm2d_add_64x64(src, CONVERT_TO_SHORTPTR(dest), stride, DCT_DCT,
2863                                bd);
2864       break;
2865 #if CONFIG_EXT_TX
2866     case ADST_DCT:
2867     case DCT_ADST:
2868     case ADST_ADST:
2869     case FLIPADST_DCT:
2870     case DCT_FLIPADST:
2871     case FLIPADST_FLIPADST:
2872     case ADST_FLIPADST:
2873     case FLIPADST_ADST:
2874     case V_DCT:
2875     case H_DCT:
2876     case V_ADST:
2877     case H_ADST:
2878     case V_FLIPADST:
2879     case H_FLIPADST:
2880       // TODO(sarahparker)
2881       // I've deleted the 64x64 implementations that existed in lieu
2882       // of adst, flipadst and identity for simplicity, but will bring them
2883       // back in a later change. This shouldn't impact performance, since
2884       // DCT_DCT is the only extended type currently allowed for 64x64,
2885       // as dictated by get_ext_tx_set_type in blockd.h.
2886       av1_inv_txfm2d_add_64x64_c(src, CONVERT_TO_SHORTPTR(dest), stride,
2887                                  DCT_DCT, bd);
2888       break;
2889     case IDTX:
2890       highbd_inv_idtx_add_c(input, dest, stride, 64, 64, tx_type, bd);
2891       break;
2892 #endif  // CONFIG_EXT_TX
2893     default: assert(0); break;
2894   }
2895 }
2896 #endif  // CONFIG_TX64X64
2897 
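// Top-level low-bitdepth entry point: dispatches on txfm_param->tx_size to
// the per-size inverse transform and add routines (or to the
// LGT-from-prediction path when that experiment is enabled).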
void av1_inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride,
                      TxfmParam *txfm_param) {
  const TX_SIZE tx_size = txfm_param->tx_size;
#if CONFIG_LGT_FROM_PRED
  if (txfm_param->use_lgt) {
    assert(is_lgt_allowed(txfm_param->mode, tx_size));
    ilgt2d_from_pred_add(input, dest, stride, txfm_param);
    return;
  }
#endif  // CONFIG_LGT_FROM_PRED
  switch (tx_size) {
#if CONFIG_TX64X64
    case TX_64X64: inv_txfm_add_64x64(input, dest, stride, txfm_param); break;
#endif  // CONFIG_TX64X64
    case TX_32X32: inv_txfm_add_32x32(input, dest, stride, txfm_param); break;
    case TX_16X16: inv_txfm_add_16x16(input, dest, stride, txfm_param); break;
    case TX_8X8: inv_txfm_add_8x8(input, dest, stride, txfm_param); break;
    case TX_4X8: inv_txfm_add_4x8(input, dest, stride, txfm_param); break;
    case TX_8X4: inv_txfm_add_8x4(input, dest, stride, txfm_param); break;
    case TX_8X16: inv_txfm_add_8x16(input, dest, stride, txfm_param); break;
    case TX_16X8: inv_txfm_add_16x8(input, dest, stride, txfm_param); break;
    case TX_16X32: inv_txfm_add_16x32(input, dest, stride, txfm_param); break;
    case TX_32X16: inv_txfm_add_32x16(input, dest, stride, txfm_param); break;
#if CONFIG_TX64X64
    case TX_64X32: inv_txfm_add_64x32(input, dest, stride, txfm_param); break;
    case TX_32X64: inv_txfm_add_32x64(input, dest, stride, txfm_param); break;
#endif  // CONFIG_TX64X64
    case TX_4X4:
      // This is like av1_short_idct4x4, but has a special case around
      // eob <= 1 which is significant (not just an optimization) for the
      // lossless case.
      inv_txfm_add_4x4(input, dest, stride, txfm_param);
      break;
#if CONFIG_CHROMA_2X2
    case TX_2X2: inv_txfm_add_2x2(input, dest, stride, txfm_param); break;
#endif
#if CONFIG_RECT_TX_EXT && (CONFIG_EXT_TX || CONFIG_VAR_TX)
    case TX_32X8: inv_txfm_add_32x8(input, dest, stride, txfm_param); break;
    case TX_8X32: inv_txfm_add_8x32(input, dest, stride, txfm_param); break;
    case TX_16X4: inv_txfm_add_16x4(input, dest, stride, txfm_param); break;
    case TX_4X16: inv_txfm_add_4x16(input, dest, stride, txfm_param); break;
#endif
    default: assert(0 && "Invalid transform size"); break;
  }
}

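// Gathers the per-block transform parameters from the decoder context into a
// TxfmParam for the inverse transform dispatch routines.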
static void init_txfm_param(const MACROBLOCKD *xd, TX_SIZE tx_size,
                            TX_TYPE tx_type, int eob, TxfmParam *txfm_param) {
  txfm_param->tx_type = tx_type;
  txfm_param->tx_size = tx_size;
  txfm_param->eob = eob;
  txfm_param->lossless = xd->lossless[xd->mi[0]->mbmi.segment_id];
  txfm_param->bd = xd->bd;
#if CONFIG_LGT
  txfm_param->is_inter = is_inter_block(&xd->mi[0]->mbmi);
#endif
#if CONFIG_LGT_FROM_PRED
  txfm_param->use_lgt = xd->mi[0]->mbmi.use_lgt;
#endif
#if CONFIG_ADAPT_SCAN
  txfm_param->eob_threshold =
      (const int16_t *)&xd->eob_threshold_md[tx_size][tx_type][0];
#endif
}

#if !CONFIG_TXMG
typedef void (*InvTxfmFunc)(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
                            TxfmParam *txfm_param);

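// Indexed by get_bitdepth_data_path_index(): 0 selects the low-bitdepth
// path, 1 the high-bitdepth path.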
static InvTxfmFunc inv_txfm_func[2] = { av1_inv_txfm_add,
                                        av1_highbd_inv_txfm_add };
#endif

void av1_inverse_transform_block(const MACROBLOCKD *xd,
                                 const tran_low_t *dqcoeff,
#if CONFIG_LGT_FROM_PRED
                                 PREDICTION_MODE mode,
#endif
#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
                                 uint8_t *mrc_mask,
#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
                                 TX_TYPE tx_type, TX_SIZE tx_size, uint8_t *dst,
                                 int stride, int eob) {
  if (!eob) return;
#if CONFIG_PVQ
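  // Under PVQ the dequantized coefficients already represent the full
  // reconstruction (prediction is handled in the transform domain), so the
  // destination block is cleared first and the inverse-transform "add" below
  // then acts as a plain store.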
  const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
  const int txb_width = block_size_wide[tx_bsize];
  const int txb_height = block_size_high[tx_bsize];
  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
    for (int r = 0; r < txb_height; r++)
      for (int c = 0; c < txb_width; c++)
        CONVERT_TO_SHORTPTR(dst)[r * stride + c] = 0;
  } else {
    for (int r = 0; r < txb_height; r++)
      for (int c = 0; c < txb_width; c++) dst[r * stride + c] = 0;
  }
#endif  // CONFIG_PVQ
  TxfmParam txfm_param;
  init_txfm_param(xd, tx_size, tx_type, eob, &txfm_param);
#if CONFIG_LGT || CONFIG_MRC_TX
  txfm_param.is_inter = is_inter_block(&xd->mi[0]->mbmi);
#endif  // CONFIG_LGT || CONFIG_MRC_TX
#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
  txfm_param.mask = mrc_mask;
#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
#if CONFIG_LGT_FROM_PRED || CONFIG_MRC_TX
  txfm_param.dst = dst;
  txfm_param.stride = stride;
#if CONFIG_LGT_FROM_PRED
  txfm_param.mode = mode;
#endif  // CONFIG_LGT_FROM_PRED
#endif  // CONFIG_LGT_FROM_PRED || CONFIG_MRC_TX

  const int is_hbd = get_bitdepth_data_path_index(xd);
#if CONFIG_TXMG
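  // With CONFIG_TXMG, both bitdepth paths share the high-bitdepth transform
  // implementation: the low-bitdepth path stages its 8-bit pixels through a
  // 16-bit buffer and copies the result back after the transform.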
  if (is_hbd) {
    av1_highbd_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
  } else {
    DECLARE_ALIGNED(16, uint16_t, tmp[MAX_TX_SQUARE]);
    int tmp_stride = MAX_TX_SIZE;
    int w = tx_size_wide[tx_size];
    int h = tx_size_high[tx_size];
    for (int r = 0; r < h; ++r) {
      for (int c = 0; c < w; ++c) {
        tmp[r * tmp_stride + c] = dst[r * stride + c];
      }
    }

    av1_highbd_inv_txfm_add(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride,
                            &txfm_param);

    for (int r = 0; r < h; ++r) {
      for (int c = 0; c < w; ++c) {
        dst[r * stride + c] = (uint8_t)tmp[r * tmp_stride + c];
      }
    }
  }
#else  // CONFIG_TXMG
  inv_txfm_func[is_hbd](dqcoeff, dst, stride, &txfm_param);
#endif  // CONFIG_TXMG
}

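// Convenience wrapper: derives the dequantized coefficients, transform size
// and type, and destination pointer from the plane state in xd, then calls
// av1_inverse_transform_block.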
void av1_inverse_transform_block_facade(MACROBLOCKD *xd, int plane, int block,
                                        int blk_row, int blk_col, int eob) {
  struct macroblockd_plane *const pd = &xd->plane[plane];
  tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
  uint8_t *mrc_mask = BLOCK_OFFSET(xd->mrc_mask, block);
#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
  const PLANE_TYPE plane_type = get_plane_type(plane);
  const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
  const TX_TYPE tx_type =
      av1_get_tx_type(plane_type, xd, blk_row, blk_col, block, tx_size);
  const int dst_stride = pd->dst.stride;
  uint8_t *dst =
      &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
  av1_inverse_transform_block(xd, dqcoeff,
#if CONFIG_LGT_FROM_PRED
                              xd->mi[0]->mbmi.mode,
#endif  // CONFIG_LGT_FROM_PRED
#if CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
                              mrc_mask,
#endif  // CONFIG_MRC_TX && SIGNAL_ANY_MRC_MASK
                              tx_type, tx_size, dst, dst_stride, eob);
}

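// Top-level high-bitdepth entry point, mirroring av1_inv_txfm_add above.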
void av1_highbd_inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride,
                             TxfmParam *txfm_param) {
  const TX_SIZE tx_size = txfm_param->tx_size;
  switch (tx_size) {
#if CONFIG_TX64X64
    case TX_64X64:
      highbd_inv_txfm_add_64x64(input, dest, stride, txfm_param);
      break;
#endif  // CONFIG_TX64X64
    case TX_32X32:
      highbd_inv_txfm_add_32x32(input, dest, stride, txfm_param);
      break;
    case TX_16X16:
      highbd_inv_txfm_add_16x16(input, dest, stride, txfm_param);
      break;
    case TX_8X8:
      highbd_inv_txfm_add_8x8(input, dest, stride, txfm_param);
      break;
    case TX_4X8:
      av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param);
      break;
    case TX_8X4:
      av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param);
      break;
    case TX_8X16:
      highbd_inv_txfm_add_8x16(input, dest, stride, txfm_param);
      break;
    case TX_16X8:
      highbd_inv_txfm_add_16x8(input, dest, stride, txfm_param);
      break;
    case TX_16X32:
      highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param);
      break;
    case TX_32X16:
      highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param);
      break;
#if CONFIG_TX64X64
    case TX_64X32:
      highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param);
      break;
    case TX_32X64:
      highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param);
      break;
#endif  // CONFIG_TX64X64
    case TX_4X4:
      // This is like av1_short_idct4x4, but has a special case around
      // eob <= 1 which is significant (not just an optimization) for the
      // lossless case.
      av1_highbd_inv_txfm_add_4x4(input, dest, stride, txfm_param);
      break;
#if CONFIG_CHROMA_2X2
    case TX_2X2:
      highbd_inv_txfm_add_2x2(input, dest, stride, txfm_param);
      break;
#endif
    default: assert(0 && "Invalid transform size"); break;
  }
}