/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_DSP_MIPS_INV_TXFM_MSA_H_
#define AOM_DSP_MIPS_INV_TXFM_MSA_H_

#include "aom_dsp/mips/macros_msa.h"
#include "aom_dsp/mips/txfm_macros_msa.h"
#include "aom_dsp/txfm_common.h"

/*
 * General notes for every macro in this header:
 *  - All of them are function-like macros, not functions: arguments are
 *    pasted textually and several are expanded more than once, so pass plain
 *    variables (no expressions with side effects).
 *  - Locals declared inside the macro bodies use an `_m` / `_madd` suffix;
 *    caller variables with the same names would be shadowed.
 *  - Helper macros such as SPLATI_H2_SH, ILVEV_H2_SH, ILVRL_H2_SH,
 *    DOTP_SH*_SW, SRARI_W4_SW, PCKEV_H2_*, BUTTERFLY_4/8, MADD_BF,
 *    MADD_SHORT, DOT_ADD_SUB_SRARI_PCK and DOT_SHIFT_RIGHT_PCK_H are not
 *    defined here; presumably they come from the two MSA headers included
 *    above. Likewise the cospi_*_64 / sinpi_*_9 constants and
 *    DCT_CONST_BITS are defined elsewhere (presumably aom_dsp/txfm_common.h).
 */

/*
 * AOM_ADST8: 8-point ADST-style transform stage on eight v8i16 vectors.
 *
 *   in0..in7   input vectors. NOTE: they are also passed in what appear to
 *              be the output positions of DOT_ADD_SUB_SRARI_PCK and
 *              BUTTERFLY_4, i.e. they are used as scratch and should be
 *              treated as clobbered after the macro runs.
 *   out0..out7 result vectors; every one of them is assigned.
 *
 * The coefficient tables are built from the odd/even cospi_*_64 constants;
 * out1, out3, out5 and out7 are negated before being returned.
 */
#define AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
                  out3, out4, out5, out6, out7) \
  { \
    v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \
    v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m; \
    v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64, \
                       cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \
    v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, -cospi_16_64, \
                       cospi_24_64, -cospi_24_64, 0, 0 }; \
    \
    /* constant pairs from coeff0_m lanes 0/7 and 4/3, for in0/in7, in4/in3 */ \
    SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m); \
    cnst2_m = -cnst0_m; \
    ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \
    SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m); \
    cnst4_m = -cnst2_m; \
    ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \
    \
    ILVRL_H2_SH(in0, in7, vec1_m, vec0_m); \
    ILVRL_H2_SH(in4, in3, vec3_m, vec2_m); \
    DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \
                          cnst2_m, cnst3_m, in7, in0, in4, in3); \
    \
    /* same pattern with coeff0_m lanes 2/5 and 6/1, for in2/in5, in6/in1 */ \
    SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m); \
    cnst2_m = -cnst0_m; \
    ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \
    SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m); \
    cnst4_m = -cnst2_m; \
    ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \
    \
    ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \
    ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \
    \
    DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \
                          cnst2_m, cnst3_m, in5, in2, in6, in1); \
    BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \
    out7 = -s0_m; \
    out0 = s1_m; \
    \
    /* second table (coeff1_m): +/-cospi_8_64 and +/-cospi_24_64 pairs */ \
    SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, cnst0_m, cnst1_m, cnst2_m, cnst3_m); \
    \
    ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m); \
    cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
    cnst1_m = cnst0_m; \
    \
    ILVRL_H2_SH(in4, in3, vec1_m, vec0_m); \
    ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \
    DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst2_m, \
                          cnst3_m, cnst1_m, out1, out6, s0_m, s1_m); \
    \
    /* final stage uses coeff1_m lanes 2/3 (+/-cospi_16_64) */ \
    SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \
    cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
    \
    ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \
    ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m); \
    out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
    out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \
    out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \
    out5 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \
    \
    out1 = -out1; \
    out3 = -out3; \
    out5 = -out5; \
  }

/*
 * AOM_SET_COSPI_PAIR: expression macro (GNU statement expression) that
 * evaluates to a v8i16 constant vector built from two scalar halfwords.
 *
 * c0_h and c1_h are each broadcast with __msa_fill_h and then combined with
 * __msa_ilvev_h(r1_m, r0_m); per the MSA ilvev.h definition this should give
 * the lane pattern c0_h, c1_h, c0_h, c1_h, ... which is the layout the
 * pairwise dot-product helpers in this file are fed with.
 */
#define AOM_SET_COSPI_PAIR(c0_h, c1_h) \
  ({ \
    v8i16 out0_m, r0_m, r1_m; \
    \
    r0_m = __msa_fill_h(c0_h); \
    r1_m = __msa_fill_h(c1_h); \
    out0_m = __msa_ilvev_h(r1_m, r0_m); \
    \
    out0_m; \
  })

/*
 * AOM_ADDBLK_ST8x4_UB: add four v8i16 residual rows to an 8x4 block of
 * bytes at dst and store the result back in place.
 *
 *   dst        destination pointer; evaluated once and cast to uint8_t *.
 *   dst_stride row stride passed to the load and store helpers (expanded
 *              twice).
 *   in0..in3   v8i16 rows added to the four loaded destination rows.
 *
 * Destination bytes are interleaved with a zero vector before the add, the
 * sums go through CLIP_SH4_0_255, and the even bytes are packed for the
 * ST8x4_UB store.
 */
#define AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3) \
  { \
    uint8_t *dst_m = (uint8_t *)(dst); \
    v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \
    v16i8 tmp0_m, tmp1_m; \
    v16i8 zero_m = { 0 }; \
    v8i16 res0_m, res1_m, res2_m, res3_m; \
    \
    LD_UB4(dst_m, dst_stride, dst0_m, dst1_m, dst2_m, dst3_m); \
    ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m, zero_m, dst3_m, \
               res0_m, res1_m, res2_m, res3_m); \
    ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3, res0_m, res1_m, \
         res2_m, res3_m); \
    CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m); \
    PCKEV_B2_SB(res1_m, res0_m, res3_m, res2_m, tmp0_m, tmp1_m); \
    ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride); \
  }

/*
 * AOM_IDCT4x4: 4-point inverse-DCT stage on four v8i16 vectors.
 *
 *   in0..in3   inputs; only read. Just the halfwords selected by
 *              __msa_ilvr_h (in0 with in2, in1 with in3) take part.
 *   out0..out3 results, produced by the final BUTTERFLY_4.
 *
 * Products are computed in 32 bits, shifted with SRARI_W4_SW by
 * DCT_CONST_BITS and packed back to halfwords before the butterfly.
 */
#define AOM_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3) \
  { \
    v8i16 c0_m, c1_m, c2_m, c3_m; \
    v8i16 step0_m, step1_m; \
    v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
    \
    /* even part: in0/in2 against (c16, c16) and (c16, -c16) */ \
    c0_m = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \
    c1_m = AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \
    step0_m = __msa_ilvr_h(in2, in0); \
    DOTP_SH2_SW(step0_m, step0_m, c0_m, c1_m, tmp0_m, tmp1_m); \
    \
    /* odd part: in1/in3 against (c24, -c8) and (c8, c24) */ \
    c2_m = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
    c3_m = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
    step1_m = __msa_ilvr_h(in3, in1); \
    DOTP_SH2_SW(step1_m, step1_m, c2_m, c3_m, tmp2_m, tmp3_m); \
    SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
    \
    PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp0_m, tmp2_m); \
    SLDI_B2_0_SW(tmp0_m, tmp2_m, tmp1_m, tmp3_m, 8); \
    BUTTERFLY_4((v8i16)tmp0_m, (v8i16)tmp1_m, (v8i16)tmp2_m, (v8i16)tmp3_m, \
                out0, out1, out2, out3); \
  }

/*
 * AOM_IADST4x4: 4-point inverse-ADST stage on four v8i16 vectors.
 *
 *   in0..in3   inputs; only read (via right-half interleaves).
 *   out0..out3 results, packed from the 32-bit sums int0_m..int3_m after a
 *              SRARI_W4_SW shift by DCT_CONST_BITS.
 *
 * mask_m holds sinpi_1_9..sinpi_4_9 in lanes 0-3 and their negations in
 * lanes 4-7; the k*_m / c*_m vectors are splats and interleaves of those
 * lanes.
 */
#define AOM_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3) \
  { \
    v8i16 res0_m, res1_m, c0_m, c1_m; \
    v8i16 k1_m, k2_m, k3_m, k4_m; \
    v8i16 zero_m = { 0 }; \
    v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
    v4i32 int0_m, int1_m, int2_m, int3_m; \
    v8i16 mask_m = { sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9, \
                     -sinpi_1_9, -sinpi_2_9, -sinpi_3_9, -sinpi_4_9 }; \
    \
    SPLATI_H4_SH(mask_m, 3, 0, 1, 2, c0_m, c1_m, k1_m, k2_m); \
    ILVEV_H2_SH(c0_m, c1_m, k1_m, k2_m, c0_m, c1_m); \
    ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \
    /* tmp2_m is kept and reused below when forming int3_m */ \
    DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp2_m, tmp1_m); \
    int0_m = tmp2_m + tmp1_m; \
    \
    SPLATI_H2_SH(mask_m, 4, 7, k4_m, k3_m); \
    ILVEV_H2_SH(k4_m, k1_m, k3_m, k2_m, c0_m, c1_m); \
    DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \
    int1_m = tmp0_m + tmp1_m; \
    \
    c0_m = __msa_splati_h(mask_m, 6); \
    ILVL_H2_SH(k2_m, c0_m, zero_m, k2_m, c0_m, c1_m); \
    /* same interleave arguments as the ILVR_H2_SH above */ \
    ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \
    DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \
    int2_m = tmp0_m + tmp1_m; \
    \
    /* c0_m was overwritten by ILVL_H2_SH, so lane 6 is splatted again */ \
    c0_m = __msa_splati_h(mask_m, 6); \
    c0_m = __msa_ilvev_h(c0_m, k1_m); \
    \
    res0_m = __msa_ilvr_h((in1), (in3)); \
    tmp0_m = __msa_dotp_s_w(res0_m, c0_m); \
    int3_m = tmp2_m + tmp0_m; \
    \
    res0_m = __msa_ilvr_h((in2), (in3)); \
    c1_m = __msa_ilvev_h(k4_m, k3_m); \
    \
    tmp2_m = __msa_dotp_s_w(res0_m, c1_m); \
    res1_m = __msa_ilvr_h((in0), (in2)); \
    c1_m = __msa_ilvev_h(k1_m, zero_m); \
    \
    tmp3_m = __msa_dotp_s_w(res1_m, c1_m); \
    int3_m += tmp2_m; \
    int3_m += tmp3_m; \
    \
    SRARI_W4_SW(int0_m, int1_m, int2_m, int3_m, DCT_CONST_BITS); \
    PCKEV_H2_SH(int0_m, int0_m, int1_m, int1_m, out0, out1); \
    PCKEV_H2_SH(int2_m, int2_m, int3_m, int3_m, out2, out3); \
  }

/*
 * AV1_SET_CONST_PAIR: expression macro (GNU statement expression) that
 * evaluates to a v8i16 built from two lanes of an existing constant vector.
 *
 *   mask_h  v8i16 holding the constants.
 *   idx1_h  lane index splatted into c0_m.
 *   idx2_h  lane index splatted into c1_m.
 *
 * The two splats are combined with __msa_ilvev_h(c1_m, c0_m), the same
 * combination AOM_SET_COSPI_PAIR applies to its two scalars.
 */
#define AV1_SET_CONST_PAIR(mask_h, idx1_h, idx2_h) \
  ({ \
    v8i16 c0_m, c1_m; \
    \
    SPLATI_H2_SH(mask_h, idx1_h, idx2_h, c0_m, c1_m); \
    c0_m = __msa_ilvev_h(c1_m, c0_m); \
    \
    c0_m; \
  })

/*
 * multiply and add macro
 *
 *   inp0, inp1  first input pair, combined with cst0 -> out0, cst1 -> out1.
 *   inp2, inp3  second input pair, combined with cst2 -> out2, cst3 -> out3.
 *
 * Both input pairs are interleaved into locals before any output is
 * written, so the outputs may name the same variables as the inputs (this
 * is how AOM_IDCT8x8_1D calls it). cst2/cst3 are still read after out0/out1
 * have been written, so those must not alias.
 * 32-bit dot products are shifted with SRARI_W4_SW by DCT_CONST_BITS and
 * packed to halfwords.
 */
#define AV1_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, out0, out1, \
                 out2, out3) \
  { \
    v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \
    v4i32 tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd; \
    \
    ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m); \
    ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m); \
    DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m, cst0, cst0, cst1, \
                cst1, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd); \
    SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS); \
    PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out0, out1); \
    DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m, cst2, cst2, cst3, \
                cst3, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd); \
    SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS); \
    PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out2, out3); \
  }

/*
 * idct 8x8 macro
 *
 *   in0..in7   input vectors. NOTE: all eight are overwritten, because both
 *              AV1_MADD invocations write their results back into the
 *              in* arguments.
 *   out0..out7 results, produced by the final BUTTERFLY_8.
 *
 * The odd inputs (in1, in7, in3, in5) are processed first into
 * tp4_m..tp7_m, then the even inputs (in0, in4, in2, in6) into
 * tp0_m..tp3_m.
 */
#define AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                       out2, out3, out4, out5, out6, out7) \
  { \
    v8i16 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m; \
    v8i16 k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m; \
    v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
    v8i16 mask_m = { cospi_28_64, cospi_4_64, cospi_20_64, cospi_12_64, \
                     cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 }; \
    \
    k0_m = AV1_SET_CONST_PAIR(mask_m, 0, 5); \
    k1_m = AV1_SET_CONST_PAIR(mask_m, 1, 0); \
    k2_m = AV1_SET_CONST_PAIR(mask_m, 6, 3); \
    k3_m = AV1_SET_CONST_PAIR(mask_m, 3, 2); \
    AV1_MADD(in1, in7, in3, in5, k0_m, k1_m, k2_m, k3_m, in1, in7, in3, in5); \
    SUB2(in1, in3, in7, in5, res0_m, res1_m); \
    k0_m = AV1_SET_CONST_PAIR(mask_m, 4, 7); \
    /* k1_m is a plain splat of lane 4 (cospi_16_64), not an interleaved pair */ \
    k1_m = __msa_splati_h(mask_m, 4); \
    \
    ILVRL_H2_SH(res0_m, res1_m, res2_m, res3_m); \
    DOTP_SH4_SW(res2_m, res3_m, res2_m, res3_m, k0_m, k0_m, k1_m, k1_m, \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
    SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
    tp4_m = in1 + in3; \
    PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tp5_m, tp6_m); \
    tp7_m = in7 + in5; \
    k2_m = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
    k3_m = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
    AV1_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m, in0, in4, in2, in6); \
    BUTTERFLY_4(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m); \
    BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m, out0, \
                out1, out2, out3, out4, out5, out6, out7); \
  }

/*
 * AV1_IADST8x8_1D: 8-point inverse-ADST stage on eight v8i16 vectors.
 *
 *   in0..in7   input vectors. NOTE: in1, in2, in3, in4, in5 and in7 are
 *              reused as scratch and overwritten; in0 and in6 are only read.
 *   out0..out7 results. out0, out2, out4 and out6 are written directly by
 *              helper macros; out1, out3, out5 and out7 are the negations
 *              of the scratch values left in in1, in3, in5 and in7.
 *
 * Constant pairs are taken from three lane tables (mask1_m..mask3_m) via
 * AV1_SET_CONST_PAIR. Intermediate sums are kept in 32 bits (r*_m, m*_m)
 * and shifted with SRARI_W4_SW by DCT_CONST_BITS before packing.
 */
#define AV1_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                        out2, out3, out4, out5, out6, out7) \
  { \
    v4i32 r0_m, r1_m, r2_m, r3_m, r4_m, r5_m, r6_m, r7_m; \
    v4i32 m0_m, m1_m, m2_m, m3_m, t0_m, t1_m; \
    v8i16 res0_m, res1_m, res2_m, res3_m, k0_m, k1_m, in_s0, in_s1; \
    v8i16 mask1_m = { cospi_2_64, cospi_30_64, -cospi_2_64, cospi_10_64, \
                      cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 }; \
    v8i16 mask2_m = { cospi_14_64, -cospi_18_64, cospi_26_64, cospi_6_64, \
                      -cospi_26_64, cospi_8_64, cospi_24_64, -cospi_8_64 }; \
    v8i16 mask3_m = { \
      -cospi_24_64, cospi_8_64, cospi_16_64, -cospi_16_64, 0, 0, 0, 0 \
    }; \
    \
    /* in0/in1 and in4/in5: sums -> res0_m/res1_m, differences -> t0_m/t1_m */ \
    k0_m = AV1_SET_CONST_PAIR(mask1_m, 0, 1); \
    k1_m = AV1_SET_CONST_PAIR(mask1_m, 1, 2); \
    ILVRL_H2_SH(in1, in0, in_s1, in_s0); \
    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \
                r1_m, r2_m, r3_m); \
    k0_m = AV1_SET_CONST_PAIR(mask1_m, 6, 7); \
    k1_m = AV1_SET_CONST_PAIR(mask2_m, 0, 1); \
    ILVRL_H2_SH(in5, in4, in_s1, in_s0); \
    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m, \
                r5_m, r6_m, r7_m); \
    ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \
         m3_m); \
    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res0_m, res1_m); \
    SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \
         m3_m); \
    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
    PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, t0_m, t1_m); \
    /* in2/in3 and in6/in7: sums -> res2_m/res3_m, differences -> r2_m/r3_m */ \
    k0_m = AV1_SET_CONST_PAIR(mask1_m, 3, 4); \
    k1_m = AV1_SET_CONST_PAIR(mask1_m, 4, 5); \
    ILVRL_H2_SH(in3, in2, in_s1, in_s0); \
    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \
                r1_m, r2_m, r3_m); \
    k0_m = AV1_SET_CONST_PAIR(mask2_m, 2, 3); \
    k1_m = AV1_SET_CONST_PAIR(mask2_m, 3, 4); \
    ILVRL_H2_SH(in7, in6, in_s1, in_s0); \
    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m, \
                r5_m, r6_m, r7_m); \
    ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \
         m3_m); \
    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res2_m, res3_m); \
    SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \
         m3_m); \
    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
    PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, r2_m, r3_m); \
    ILVRL_H2_SW(r3_m, r2_m, m2_m, m3_m); \
    /* from here on in7, in4 and in3 hold intermediates, not caller data */ \
    BUTTERFLY_4(res0_m, res1_m, res3_m, res2_m, out0, in7, in4, in3); \
    k0_m = AV1_SET_CONST_PAIR(mask2_m, 5, 6); \
    k1_m = AV1_SET_CONST_PAIR(mask2_m, 6, 7); \
    ILVRL_H2_SH(t1_m, t0_m, in_s1, in_s0); \
    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \
                r1_m, r2_m, r3_m); \
    k1_m = AV1_SET_CONST_PAIR(mask3_m, 0, 1); \
    DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, r4_m, r5_m, \
                r6_m, r7_m); \
    ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m, \
         m3_m); \
    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in1, out6); \
    SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m, \
         m3_m); \
    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in2, in5); \
    /* last stage: +/-cospi_16_64 pairs from mask3_m lanes 2 and 3 */ \
    k0_m = AV1_SET_CONST_PAIR(mask3_m, 2, 2); \
    k1_m = AV1_SET_CONST_PAIR(mask3_m, 2, 3); \
    ILVRL_H2_SH(in4, in3, in_s1, in_s0); \
    DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, m0_m, \
                m1_m, m2_m, m3_m); \
    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in3, out4); \
    ILVRL_H2_SW(in5, in2, m2_m, m3_m); \
    DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, m0_m, m1_m, \
                m2_m, m3_m); \
    SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \
    PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, out2, in5); \
    \
    out1 = -in1; \
    out3 = -in3; \
    out5 = -in5; \
    out7 = -in7; \
  }

/*
 * AOM_IADST8x16_1D: 16-point inverse-ADST stage on sixteen v8i16 vectors,
 * organised as four stages.
 *
 *   r0..r15     input vectors; within this macro they are only passed in
 *               the leading argument positions of MADD_BF in stage 1.
 *   out0..out15 results; every one of them is assigned. Several outputs
 *               (out6/out7, out10/out11, out14/out15) are written in an
 *               earlier stage and then updated in place by MADD_SHORT in
 *               stage 4, so they must be distinct variables.
 *
 * g*_m hold stage-1 results, h*_m hold stage-2/3 intermediates, and k*_m
 * are the cosine constant pairs for the current step.
 */
#define AOM_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, \
                         r12, r13, r14, r15, out0, out1, out2, out3, out4, \
                         out5, out6, out7, out8, out9, out10, out11, out12, \
                         out13, out14, out15) \
  { \
    v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m; \
    v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m; \
    v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m; \
    v8i16 h8_m, h9_m, h10_m, h11_m; \
    v8i16 k0_m, k1_m, k2_m, k3_m; \
    \
    /* stage 1: odd-index cosine pairs applied to the r* inputs -> g*_m */ \
    k0_m = AOM_SET_COSPI_PAIR(cospi_1_64, cospi_31_64); \
    k1_m = AOM_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); \
    k2_m = AOM_SET_COSPI_PAIR(cospi_17_64, cospi_15_64); \
    k3_m = AOM_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); \
    MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m, g0_m, g1_m, g2_m, g3_m); \
    k0_m = AOM_SET_COSPI_PAIR(cospi_5_64, cospi_27_64); \
    k1_m = AOM_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); \
    k2_m = AOM_SET_COSPI_PAIR(cospi_21_64, cospi_11_64); \
    k3_m = AOM_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); \
    MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m, g4_m, g5_m, g6_m, g7_m); \
    k0_m = AOM_SET_COSPI_PAIR(cospi_9_64, cospi_23_64); \
    k1_m = AOM_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); \
    k2_m = AOM_SET_COSPI_PAIR(cospi_25_64, cospi_7_64); \
    k3_m = AOM_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); \
    MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m, g8_m, g9_m, g10_m, \
            g11_m); \
    k0_m = AOM_SET_COSPI_PAIR(cospi_13_64, cospi_19_64); \
    k1_m = AOM_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); \
    k2_m = AOM_SET_COSPI_PAIR(cospi_29_64, cospi_3_64); \
    k3_m = AOM_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); \
    MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m, g12_m, g13_m, g14_m, \
            g15_m); \
    \
    /* stage 2: odd g*_m through MADD_BF, even g*_m through BUTTERFLY_8 */ \
    k0_m = AOM_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); \
    k1_m = AOM_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); \
    k2_m = AOM_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); \
    MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m, h0_m, h1_m, h2_m, \
            h3_m); \
    k0_m = AOM_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); \
    k1_m = AOM_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); \
    k2_m = AOM_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); \
    MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m, h4_m, h5_m, \
            h6_m, h7_m); \
    BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10); \
    /* h6_m, h4_m, h2_m, h0_m are passed again here in trailing positions */ \
    BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m, h8_m, h9_m, \
                h10_m, h11_m, h6_m, h4_m, h2_m, h0_m); \
    \
    /* stage 3 */ \
    BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m); \
    k0_m = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
    k1_m = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
    k2_m = AOM_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); \
    MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m, out4, out6, out5, \
            out7); \
    MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m, out12, out14, \
            out13, out15); \
    \
    /* stage 4: +/-cospi_16_64 pairs; three output pairs updated in place */ \
    k0_m = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \
    k1_m = AOM_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); \
    k2_m = AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \
    k3_m = AOM_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); \
    MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3); \
    MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7); \
    MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11); \
    MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15); \
  }

/*
 * 16-point 1-D inverse transform entry points (IDCT and IADST variants).
 * Only the declarations live here; the definitions are in other translation
 * units. Going by the names and signatures, the *_columns_addblk_* variants
 * take coefficients in `input` and a destination pixel buffer `dst` with row
 * stride `dst_stride`, while the *_rows_* variants read `input` and write
 * `output` -- confirm the exact buffer layouts against the implementations.
 */
void aom_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
                                      int32_t dst_stride);
void aom_idct16_1d_rows_msa(const int16_t *input, int16_t *output);
void aom_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst,
                                       int32_t dst_stride);
void aom_iadst16_1d_rows_msa(const int16_t *input, int16_t *output);
#endif  // AOM_DSP_MIPS_INV_TXFM_MSA_H_