1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include "config/av1_rtcd.h"
13
14 #include "aom_dsp/mips/macros_msa.h"
15
temporal_filter_apply_8size_msa(uint8_t * frm1_ptr,uint32_t stride,uint8_t * frm2_ptr,int32_t filt_sth,int32_t filt_wgt,uint32_t * acc,uint16_t * cnt)16 static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr, uint32_t stride,
17 uint8_t *frm2_ptr, int32_t filt_sth,
18 int32_t filt_wgt, uint32_t *acc,
19 uint16_t *cnt) {
20 uint32_t row;
21 uint64_t f0, f1, f2, f3;
22 v16i8 frm2, frm1 = { 0 };
23 v16i8 frm4, frm3 = { 0 };
24 v16u8 frm_r, frm_l;
25 v8i16 frm2_r, frm2_l;
26 v8i16 diff0, diff1, mod0_h, mod1_h;
27 v4i32 cnst3, cnst16, filt_wt, strength;
28 v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
29 v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
30 v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
31 v4i32 acc0, acc1, acc2, acc3;
32 v8i16 cnt0, cnt1;
33
34 filt_wt = __msa_fill_w(filt_wgt);
35 strength = __msa_fill_w(filt_sth);
36 cnst3 = __msa_ldi_w(3);
37 cnst16 = __msa_ldi_w(16);
38
39 for (row = 2; row--;) {
40 LD4(frm1_ptr, stride, f0, f1, f2, f3);
41 frm1_ptr += (4 * stride);
42
43 LD_SB2(frm2_ptr, 16, frm2, frm4);
44 frm2_ptr += 32;
45
46 LD_SW2(acc, 4, acc0, acc1);
47 LD_SW2(acc + 8, 4, acc2, acc3);
48 LD_SH2(cnt, 8, cnt0, cnt1);
49
50 INSERT_D2_SB(f0, f1, frm1);
51 INSERT_D2_SB(f2, f3, frm3);
52 ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
53 HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
54 UNPCK_SH_SW(diff0, diff0_r, diff0_l);
55 UNPCK_SH_SW(diff1, diff1_r, diff1_l);
56 MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
57 mod0_w, mod1_w, mod2_w, mod3_w);
58 MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
59 mod1_w, mod2_w, mod3_w);
60 SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
61
62 diff0_r = (mod0_w < cnst16);
63 diff0_l = (mod1_w < cnst16);
64 diff1_r = (mod2_w < cnst16);
65 diff1_l = (mod3_w < cnst16);
66
67 SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
68 mod1_w, mod2_w, mod3_w);
69
70 mod0_w = diff0_r & mod0_w;
71 mod1_w = diff0_l & mod1_w;
72 mod2_w = diff1_r & mod2_w;
73 mod3_w = diff1_l & mod3_w;
74
75 MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
76 mod0_w, mod1_w, mod2_w, mod3_w);
77 PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
78 ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
79 ST_SH2(mod0_h, mod1_h, cnt, 8);
80 cnt += 16;
81
82 UNPCK_UB_SH(frm2, frm2_r, frm2_l);
83 UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
84 UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
85 MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
86 mod0_w, mod1_w, mod2_w, mod3_w);
87 ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
88 mod2_w, mod3_w);
89
90 ST_SW2(mod0_w, mod1_w, acc, 4);
91 acc += 8;
92 ST_SW2(mod2_w, mod3_w, acc, 4);
93 acc += 8;
94
95 LD_SW2(acc, 4, acc0, acc1);
96 LD_SW2(acc + 8, 4, acc2, acc3);
97 LD_SH2(cnt, 8, cnt0, cnt1);
98
99 ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
100 HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
101 UNPCK_SH_SW(diff0, diff0_r, diff0_l);
102 UNPCK_SH_SW(diff1, diff1_r, diff1_l);
103 MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
104 mod0_w, mod1_w, mod2_w, mod3_w);
105 MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
106 mod1_w, mod2_w, mod3_w);
107 SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
108
109 diff0_r = (mod0_w < cnst16);
110 diff0_l = (mod1_w < cnst16);
111 diff1_r = (mod2_w < cnst16);
112 diff1_l = (mod3_w < cnst16);
113
114 SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
115 mod1_w, mod2_w, mod3_w);
116
117 mod0_w = diff0_r & mod0_w;
118 mod1_w = diff0_l & mod1_w;
119 mod2_w = diff1_r & mod2_w;
120 mod3_w = diff1_l & mod3_w;
121
122 MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
123 mod0_w, mod1_w, mod2_w, mod3_w);
124 PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
125 ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
126 ST_SH2(mod0_h, mod1_h, cnt, 8);
127 cnt += 16;
128 UNPCK_UB_SH(frm4, frm2_r, frm2_l);
129 UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
130 UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
131 MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
132 mod0_w, mod1_w, mod2_w, mod3_w);
133 ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
134 mod2_w, mod3_w);
135
136 ST_SW2(mod0_w, mod1_w, acc, 4);
137 acc += 8;
138 ST_SW2(mod2_w, mod3_w, acc, 4);
139 acc += 8;
140 }
141 }
142
temporal_filter_apply_16size_msa(uint8_t * frm1_ptr,uint32_t stride,uint8_t * frm2_ptr,int32_t filt_sth,int32_t filt_wgt,uint32_t * acc,uint16_t * cnt)143 static void temporal_filter_apply_16size_msa(uint8_t *frm1_ptr, uint32_t stride,
144 uint8_t *frm2_ptr,
145 int32_t filt_sth, int32_t filt_wgt,
146 uint32_t *acc, uint16_t *cnt) {
147 uint32_t row;
148 v16i8 frm1, frm2, frm3, frm4;
149 v16u8 frm_r, frm_l;
150 v16i8 zero = { 0 };
151 v8u16 frm2_r, frm2_l;
152 v8i16 diff0, diff1, mod0_h, mod1_h;
153 v4i32 cnst3, cnst16, filt_wt, strength;
154 v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
155 v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
156 v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
157 v4i32 acc0, acc1, acc2, acc3;
158 v8i16 cnt0, cnt1;
159
160 filt_wt = __msa_fill_w(filt_wgt);
161 strength = __msa_fill_w(filt_sth);
162 cnst3 = __msa_ldi_w(3);
163 cnst16 = __msa_ldi_w(16);
164
165 for (row = 8; row--;) {
166 LD_SB2(frm1_ptr, stride, frm1, frm3);
167 frm1_ptr += stride;
168
169 LD_SB2(frm2_ptr, 16, frm2, frm4);
170 frm2_ptr += 16;
171
172 LD_SW2(acc, 4, acc0, acc1);
173 LD_SW2(acc, 4, acc2, acc3);
174 LD_SH2(cnt, 8, cnt0, cnt1);
175
176 ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
177 HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
178 UNPCK_SH_SW(diff0, diff0_r, diff0_l);
179 UNPCK_SH_SW(diff1, diff1_r, diff1_l);
180 MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
181 mod0_w, mod1_w, mod2_w, mod3_w);
182 MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
183 mod1_w, mod2_w, mod3_w);
184 SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
185
186 diff0_r = (mod0_w < cnst16);
187 diff0_l = (mod1_w < cnst16);
188 diff1_r = (mod2_w < cnst16);
189 diff1_l = (mod3_w < cnst16);
190
191 SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
192 mod1_w, mod2_w, mod3_w);
193
194 mod0_w = diff0_r & mod0_w;
195 mod1_w = diff0_l & mod1_w;
196 mod2_w = diff1_r & mod2_w;
197 mod3_w = diff1_l & mod3_w;
198
199 MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
200 mod0_w, mod1_w, mod2_w, mod3_w);
201 PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
202 ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
203 ST_SH2(mod0_h, mod1_h, cnt, 8);
204 cnt += 16;
205
206 ILVRL_B2_UH(zero, frm2, frm2_r, frm2_l);
207 UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
208 UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
209 MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
210 mod0_w, mod1_w, mod2_w, mod3_w);
211 ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
212 mod2_w, mod3_w);
213
214 ST_SW2(mod0_w, mod1_w, acc, 4);
215 acc += 8;
216 ST_SW2(mod2_w, mod3_w, acc, 4);
217 acc += 8;
218
219 LD_SW2(acc, 4, acc0, acc1);
220 LD_SW2(acc + 8, 4, acc2, acc3);
221 LD_SH2(cnt, 8, cnt0, cnt1);
222
223 ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
224 HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
225 UNPCK_SH_SW(diff0, diff0_r, diff0_l);
226 UNPCK_SH_SW(diff1, diff1_r, diff1_l);
227 MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
228 mod0_w, mod1_w, mod2_w, mod3_w);
229 MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
230 mod1_w, mod2_w, mod3_w);
231 SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
232
233 diff0_r = (mod0_w < cnst16);
234 diff0_l = (mod1_w < cnst16);
235 diff1_r = (mod2_w < cnst16);
236 diff1_l = (mod3_w < cnst16);
237
238 SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
239 mod1_w, mod2_w, mod3_w);
240
241 mod0_w = diff0_r & mod0_w;
242 mod1_w = diff0_l & mod1_w;
243 mod2_w = diff1_r & mod2_w;
244 mod3_w = diff1_l & mod3_w;
245
246 MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
247 mod0_w, mod1_w, mod2_w, mod3_w);
248 PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
249 ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
250 ST_SH2(mod0_h, mod1_h, cnt, 8);
251 cnt += 16;
252
253 ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l);
254 UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
255 UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
256 MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
257 mod0_w, mod1_w, mod2_w, mod3_w);
258 ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
259 mod2_w, mod3_w);
260 ST_SW2(mod0_w, mod1_w, acc, 4);
261 acc += 8;
262 ST_SW2(mod2_w, mod3_w, acc, 4);
263 acc += 8;
264
265 frm1_ptr += stride;
266 frm2_ptr += 16;
267 }
268 }
269
270 // TODO(yunqing) The following optimization is not used since c code changes.
/*
 * Applies the temporal filter to one block: the MSA kernels handle the two
 * block shapes they are written for, everything else goes to
 * av1_temporal_filter_apply_c().
 *
 * The 8-size kernel always processes 8 rows of 8 pixels and the 16-size
 * kernel 16 rows of 16 pixels, reading and writing 64 / 256 entries of accu
 * and cnt.  They are therefore selected on the block dimensions themselves;
 * comparing blk_w * blk_h against 8 or 16 would only match blocks far smaller
 * than what the kernels touch.
 *
 * NOTE(review): the TODO on this function says the optimization is unused
 * since the C code changed - confirm the kernels still produce the same
 * result as av1_temporal_filter_apply_c() before relying on this path.
 */
void av1_temporal_filter_apply_msa(uint8_t *frame1_ptr, uint32_t stride,
                                   uint8_t *frame2_ptr, uint32_t blk_w,
                                   uint32_t blk_h, int32_t strength,
                                   int32_t filt_wgt, uint32_t *accu,
                                   uint16_t *cnt) {
  if (8 == blk_w && 8 == blk_h) {
    temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr, strength,
                                    filt_wgt, accu, cnt);
  } else if (16 == blk_w && 16 == blk_h) {
    temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr, strength,
                                     filt_wgt, accu, cnt);
  } else {
    av1_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h,
                                strength, filt_wgt, accu, cnt);
  }
}
287