/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11 
12 #include "config/av1_rtcd.h"
13 
14 #include "aom_dsp/mips/macros_msa.h"
15 
/* Applies the temporal filter to one 8x8 block using MIPS MSA intrinsics.
 *
 * Per-pixel math (matches the generic C filter, as shown by the vector ops
 * below): diff = frm1 - frm2; mod = round_shift(3 * diff * diff, filt_sth);
 * mod = (mod < 16) ? (16 - mod) : 0; mod *= filt_wgt; then
 * cnt[pixel] += mod and acc[pixel] += mod * frm2[pixel].
 *
 * frm1_ptr: source frame pixels, rows 'stride' bytes apart (8 used per row).
 * frm2_ptr: predictor block, stored contiguously (8 bytes per row).
 * filt_sth: filter strength — rounding right-shift applied to 3*diff^2.
 * filt_wgt: filter weight multiplier.
 * acc/cnt:  per-pixel 32-bit weighted-sum accumulator and 16-bit weight
 *           count, both updated in place.
 */
static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr, uint32_t stride,
                                            uint8_t *frm2_ptr, int32_t filt_sth,
                                            int32_t filt_wgt, uint32_t *acc,
                                            uint16_t *cnt) {
  uint32_t row;
  uint64_t f0, f1, f2, f3;
  v16i8 frm2, frm1 = { 0 };
  v16i8 frm4, frm3 = { 0 };
  v16u8 frm_r, frm_l;
  v8i16 frm2_r, frm2_l;
  v8i16 diff0, diff1, mod0_h, mod1_h;
  v4i32 cnst3, cnst16, filt_wt, strength;
  v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
  v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
  v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
  v4i32 acc0, acc1, acc2, acc3;
  v8i16 cnt0, cnt1;

  /* Broadcast the scalar filter parameters into vector registers. */
  filt_wt = __msa_fill_w(filt_wgt);
  strength = __msa_fill_w(filt_sth);
  cnst3 = __msa_ldi_w(3);
  cnst16 = __msa_ldi_w(16);

  /* Two iterations, each covering 4 rows (2 x 4 = 8 rows total). */
  for (row = 2; row--;) {
    /* 4 source rows of 8 bytes each. */
    LD4(frm1_ptr, stride, f0, f1, f2, f3);
    frm1_ptr += (4 * stride);

    /* 4 predictor rows (contiguous, 8 bytes each = 32 bytes). */
    LD_SB2(frm2_ptr, 16, frm2, frm4);
    frm2_ptr += 32;

    /* Current accumulator/count state for the first 2 rows (16 pixels). */
    LD_SW2(acc, 4, acc0, acc1);
    LD_SW2(acc + 8, 4, acc2, acc3);
    LD_SH2(cnt, 8, cnt0, cnt1);

    /* Pack the 4 source rows into two 16-byte vectors. */
    INSERT_D2_SB(f0, f1, frm1);
    INSERT_D2_SB(f2, f3, frm3);
    /* Interleave source/predictor bytes, then horizontal-subtract to get
     * per-pixel signed differences (sign is irrelevant — squared next). */
    ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
    /* mod = 3 * diff^2, then rounding arithmetic shift by strength. */
    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
         mod0_w, mod1_w, mod2_w, mod3_w);
    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
         mod1_w, mod2_w, mod3_w);
    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);

    /* Per-lane masks: all-ones where mod < 16, zero elsewhere. */
    diff0_r = (mod0_w < cnst16);
    diff0_l = (mod1_w < cnst16);
    diff1_r = (mod2_w < cnst16);
    diff1_l = (mod3_w < cnst16);

    /* mod = 16 - mod ... */
    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
         mod1_w, mod2_w, mod3_w);

    /* ... but clamped to 0 where the original mod was >= 16. */
    mod0_w = diff0_r & mod0_w;
    mod1_w = diff0_l & mod1_w;
    mod2_w = diff1_r & mod2_w;
    mod3_w = diff1_l & mod3_w;

    /* Scale by the filter weight, then fold into the running counts. */
    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
         mod0_w, mod1_w, mod2_w, mod3_w);
    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
    ST_SH2(mod0_h, mod1_h, cnt, 8);
    cnt += 16;

    /* acc += mod * predictor (widened to 32-bit per pixel). */
    UNPCK_UB_SH(frm2, frm2_r, frm2_l);
    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
         mod0_w, mod1_w, mod2_w, mod3_w);
    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
         mod2_w, mod3_w);

    ST_SW2(mod0_w, mod1_w, acc, 4);
    acc += 8;
    ST_SW2(mod2_w, mod3_w, acc, 4);
    acc += 8;

    /* Second half of this iteration: rows 3-4, held in frm3/frm4. */
    LD_SW2(acc, 4, acc0, acc1);
    LD_SW2(acc + 8, 4, acc2, acc3);
    LD_SH2(cnt, 8, cnt0, cnt1);

    ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
         mod0_w, mod1_w, mod2_w, mod3_w);
    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
         mod1_w, mod2_w, mod3_w);
    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);

    diff0_r = (mod0_w < cnst16);
    diff0_l = (mod1_w < cnst16);
    diff1_r = (mod2_w < cnst16);
    diff1_l = (mod3_w < cnst16);

    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
         mod1_w, mod2_w, mod3_w);

    mod0_w = diff0_r & mod0_w;
    mod1_w = diff0_l & mod1_w;
    mod2_w = diff1_r & mod2_w;
    mod3_w = diff1_l & mod3_w;

    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
         mod0_w, mod1_w, mod2_w, mod3_w);
    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
    ST_SH2(mod0_h, mod1_h, cnt, 8);
    cnt += 16;
    UNPCK_UB_SH(frm4, frm2_r, frm2_l);
    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
         mod0_w, mod1_w, mod2_w, mod3_w);
    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
         mod2_w, mod3_w);

    ST_SW2(mod0_w, mod1_w, acc, 4);
    acc += 8;
    ST_SW2(mod2_w, mod3_w, acc, 4);
    acc += 8;
  }
}
142 
temporal_filter_apply_16size_msa(uint8_t * frm1_ptr,uint32_t stride,uint8_t * frm2_ptr,int32_t filt_sth,int32_t filt_wgt,uint32_t * acc,uint16_t * cnt)143 static void temporal_filter_apply_16size_msa(uint8_t *frm1_ptr, uint32_t stride,
144                                              uint8_t *frm2_ptr,
145                                              int32_t filt_sth, int32_t filt_wgt,
146                                              uint32_t *acc, uint16_t *cnt) {
147   uint32_t row;
148   v16i8 frm1, frm2, frm3, frm4;
149   v16u8 frm_r, frm_l;
150   v16i8 zero = { 0 };
151   v8u16 frm2_r, frm2_l;
152   v8i16 diff0, diff1, mod0_h, mod1_h;
153   v4i32 cnst3, cnst16, filt_wt, strength;
154   v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
155   v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
156   v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
157   v4i32 acc0, acc1, acc2, acc3;
158   v8i16 cnt0, cnt1;
159 
160   filt_wt = __msa_fill_w(filt_wgt);
161   strength = __msa_fill_w(filt_sth);
162   cnst3 = __msa_ldi_w(3);
163   cnst16 = __msa_ldi_w(16);
164 
165   for (row = 8; row--;) {
166     LD_SB2(frm1_ptr, stride, frm1, frm3);
167     frm1_ptr += stride;
168 
169     LD_SB2(frm2_ptr, 16, frm2, frm4);
170     frm2_ptr += 16;
171 
172     LD_SW2(acc, 4, acc0, acc1);
173     LD_SW2(acc, 4, acc2, acc3);
174     LD_SH2(cnt, 8, cnt0, cnt1);
175 
176     ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
177     HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
178     UNPCK_SH_SW(diff0, diff0_r, diff0_l);
179     UNPCK_SH_SW(diff1, diff1_r, diff1_l);
180     MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
181          mod0_w, mod1_w, mod2_w, mod3_w);
182     MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
183          mod1_w, mod2_w, mod3_w);
184     SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
185 
186     diff0_r = (mod0_w < cnst16);
187     diff0_l = (mod1_w < cnst16);
188     diff1_r = (mod2_w < cnst16);
189     diff1_l = (mod3_w < cnst16);
190 
191     SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
192          mod1_w, mod2_w, mod3_w);
193 
194     mod0_w = diff0_r & mod0_w;
195     mod1_w = diff0_l & mod1_w;
196     mod2_w = diff1_r & mod2_w;
197     mod3_w = diff1_l & mod3_w;
198 
199     MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
200          mod0_w, mod1_w, mod2_w, mod3_w);
201     PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
202     ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
203     ST_SH2(mod0_h, mod1_h, cnt, 8);
204     cnt += 16;
205 
206     ILVRL_B2_UH(zero, frm2, frm2_r, frm2_l);
207     UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
208     UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
209     MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
210          mod0_w, mod1_w, mod2_w, mod3_w);
211     ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
212          mod2_w, mod3_w);
213 
214     ST_SW2(mod0_w, mod1_w, acc, 4);
215     acc += 8;
216     ST_SW2(mod2_w, mod3_w, acc, 4);
217     acc += 8;
218 
219     LD_SW2(acc, 4, acc0, acc1);
220     LD_SW2(acc + 8, 4, acc2, acc3);
221     LD_SH2(cnt, 8, cnt0, cnt1);
222 
223     ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
224     HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
225     UNPCK_SH_SW(diff0, diff0_r, diff0_l);
226     UNPCK_SH_SW(diff1, diff1_r, diff1_l);
227     MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
228          mod0_w, mod1_w, mod2_w, mod3_w);
229     MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
230          mod1_w, mod2_w, mod3_w);
231     SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
232 
233     diff0_r = (mod0_w < cnst16);
234     diff0_l = (mod1_w < cnst16);
235     diff1_r = (mod2_w < cnst16);
236     diff1_l = (mod3_w < cnst16);
237 
238     SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
239          mod1_w, mod2_w, mod3_w);
240 
241     mod0_w = diff0_r & mod0_w;
242     mod1_w = diff0_l & mod1_w;
243     mod2_w = diff1_r & mod2_w;
244     mod3_w = diff1_l & mod3_w;
245 
246     MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
247          mod0_w, mod1_w, mod2_w, mod3_w);
248     PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
249     ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
250     ST_SH2(mod0_h, mod1_h, cnt, 8);
251     cnt += 16;
252 
253     ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l);
254     UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
255     UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
256     MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
257          mod0_w, mod1_w, mod2_w, mod3_w);
258     ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
259          mod2_w, mod3_w);
260     ST_SW2(mod0_w, mod1_w, acc, 4);
261     acc += 8;
262     ST_SW2(mod2_w, mod3_w, acc, 4);
263     acc += 8;
264 
265     frm1_ptr += stride;
266     frm2_ptr += 16;
267   }
268 }
269 
// TODO(yunqing): The following optimization is not used since the C code
// changed.
/* Dispatches the temporal filter to the matching MSA kernel, falling back to
 * the generic C implementation for unsupported block geometries.
 *
 * NOTE(review): dispatching on blk_w * blk_h == 8 / 16 looks dimensionally
 * odd (an 8x8 block has 64 pixels), but this routine is marked unused above;
 * behavior is preserved as-is — confirm against callers before enabling.
 */
void av1_temporal_filter_apply_msa(uint8_t *frame1_ptr, uint32_t stride,
                                   uint8_t *frame2_ptr, uint32_t blk_w,
                                   uint32_t blk_h, int32_t strength,
                                   int32_t filt_wgt, uint32_t *accu,
                                   uint16_t *cnt) {
  const uint32_t num_pels = blk_w * blk_h;

  switch (num_pels) {
    case 8:
      temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr, strength,
                                      filt_wgt, accu, cnt);
      break;
    case 16:
      temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr, strength,
                                       filt_wgt, accu, cnt);
      break;
    default:
      av1_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h,
                                  strength, filt_wgt, accu, cnt);
      break;
  }
}
287