1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include "aom_ports/mem.h"
13 #include "aom_dsp/mips/loopfilter_msa.h"
14
/* Horizontal loop filter, 16 pixels wide: first stage (4-tap + 8-tap).
 *
 * Loads 8 rows (p3..p0 above the edge, q0..q3 below), builds the filter
 * mask / high-edge-variance mask, applies the narrow 4-tap filter, and,
 * where the "flat" mask is set, computes the 8-tap filter as well.
 *
 * Returns 1 (early exit) when no pixel is flat — the 4-tap results are
 * stored straight back to src and the caller can skip the 16-tap stage.
 * Returns 0 otherwise — the six candidate output rows (p2..q2) plus the
 * flat mask are stashed into filter48 (7 x 16 bytes) for the caller's
 * aom_hz_lpf_t16_16w() pass, and nothing is written to src yet. */
int32_t aom_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48,
                                 const uint8_t *b_limit_ptr,
                                 const uint8_t *limit_ptr,
                                 const uint8_t *thresh_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  /* _r / _l: right/left halves widened to 16 bits for the 8-tap math. */
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16u8 zero = { 0 };

  /* load vector elements */
  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  /* Broadcast the scalar thresholds across all 16 lanes. */
  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  if (__msa_test_bz_v(flat)) {
    /* No flat pixels anywhere: the 4-tap result is final. */
    ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);

    return 1;
  } else {
    /* Widen bytes to halfwords (right half), run the 8-tap filter. */
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    /* Same for the left half of the 16 columns. */
    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
    AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values */
    /* Per-lane select: 8-tap result where flat, 4-tap/original otherwise. */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    /* filter48 layout: p2, p1, p0, q0, q1, q2, flat — 16 bytes each. */
    ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
    filter48 += (4 * 16);
    ST_UB2(q1_out, q2_out, filter48, 16);
    filter48 += (2 * 16);
    ST_UB(flat, filter48);

    return 0;
  }
}
80
/* Horizontal loop filter, 16 pixels wide: second stage (16-tap / "flat2").
 *
 * Reads the flat mask and intermediate 8-tap rows produced by
 * aom_hz_lpf_t4_and_t8_16w() from filter48 (layout: p2,p1,p0,q0,q1,q2,flat
 * at 16-byte strides). Where flat2 is zero everywhere, it simply commits
 * the stage-1 rows to src. Otherwise it runs the 15-tap smoothing filter
 * across p6..q6 using a sliding-window sum: tmp1_r / tmp1_l hold the
 * running 16-bit sum for the right/left 8 columns, and each output row
 * updates the window by adding one new sample and subtracting one old
 * sample (tmp0_* is the per-row delta) before rounding with srari(.,4). */
void aom_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
  v16u8 flat, flat2, filter8;
  v16i8 zero = { 0 };
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
  v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
  v8i16 l_out, r_out;

  /* flat mask stored by the first stage at offset 6 * 16. */
  flat = LD_UB(filter48 + 96);

  LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
  LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
  AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

  if (__msa_test_bz_v(flat2)) {
    /* Nothing is flat over the wide support: commit the stage-1 rows. */
    LD_UB4(filter48, 16, p2, p1, p0, q0);
    LD_UB2(filter48 + 4 * 16, 16, q1, q2);

    src -= 3 * pitch;
    ST_UB4(p2, p1, p0, q0, src, pitch);
    src += (4 * pitch);
    ST_UB2(q1, q2, src, pitch);
  } else {
    src -= 7 * pitch;

    /* Widen p7..p0 (right 8 columns) to 16-bit for the running sum. */
    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
               p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
               p2_r_in, p1_r_in, p0_r_in);

    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);

    /* Initial window for output p6: 7*p7 + 2*p6 + p5..p0 + q0. */
    tmp0_r = p7_r_in << 3;
    tmp0_r -= p7_r_in;
    tmp0_r += p6_r_in;
    tmp0_r += q0_r_in;
    tmp1_r = p6_r_in + p5_r_in;
    tmp1_r += p4_r_in;
    tmp1_r += p3_r_in;
    tmp1_r += p2_r_in;
    tmp1_r += p1_r_in;
    tmp1_r += p0_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    /* Same initial window for the left 8 columns. */
    ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
               p5_l_in, p4_l_in);
    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
               p1_l_in, p0_l_in);
    q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);

    tmp0_l = p7_l_in << 3;
    tmp0_l -= p7_l_in;
    tmp0_l += p6_l_in;
    tmp0_l += q0_l_in;
    tmp1_l = p6_l_in + p5_l_in;
    tmp1_l += p4_l_in;
    tmp1_l += p3_l_in;
    tmp1_l += p2_l_in;
    tmp1_l += p1_l_in;
    tmp1_l += p0_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    /* p6: select filtered value only where flat2 is set. */
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
    ST_UB(p6, src);
    src += pitch;

    /* p5 */
    q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
    tmp0_r = p5_r_in - p6_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
    tmp0_l = p5_l_in - p6_l_in;
    tmp0_l += q1_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
    ST_UB(p5, src);
    src += pitch;

    /* p4 */
    q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
    tmp0_r = p4_r_in - p5_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = (v8i16)__msa_srari_h((v8i16)tmp1_r, 4);

    q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
    tmp0_l = p4_l_in - p5_l_in;
    tmp0_l += q2_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
    ST_UB(p4, src);
    src += pitch;

    /* p3 */
    q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
    tmp0_r = p3_r_in - p4_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
    tmp0_l = p3_l_in - p4_l_in;
    tmp0_l += q3_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
    ST_UB(p3, src);
    src += pitch;

    /* p2: from here on the non-flat2 fallback is the stage-1 row
     * (filter8 loaded from filter48), not the raw source pixel. */
    q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
    filter8 = LD_UB(filter48);
    tmp0_r = p2_r_in - p3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
    tmp0_l = p2_l_in - p3_l_in;
    tmp0_l += q4_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* p1 */
    q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
    filter8 = LD_UB(filter48 + 16);
    tmp0_r = p1_r_in - p2_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
    tmp0_l = p1_l_in - p2_l_in;
    tmp0_l += q5_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* p0 */
    q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
    filter8 = LD_UB(filter48 + 32);
    tmp0_r = p0_r_in - p1_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
    tmp0_l = p0_l_in - p1_l_in;
    tmp0_l += q6_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q0: window pivots — new samples now come from the q side (q7). */
    q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
    filter8 = LD_UB(filter48 + 48);
    tmp0_r = q7_r_in - p0_r_in;
    tmp0_r += q0_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
    tmp0_l = q7_l_in - p0_l_in;
    tmp0_l += q0_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q1 */
    filter8 = LD_UB(filter48 + 64);
    tmp0_r = q7_r_in - q0_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p6_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q0_l_in;
    tmp0_l += q1_l_in;
    tmp0_l -= p6_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q2 */
    filter8 = LD_UB(filter48 + 80);
    tmp0_r = q7_r_in - q1_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p5_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q1_l_in;
    tmp0_l += q2_l_in;
    tmp0_l -= p5_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q3: back to raw source pixels as the non-flat2 fallback. */
    tmp0_r = q7_r_in - q2_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p4_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q2_l_in;
    tmp0_l += q3_l_in;
    tmp0_l -= p4_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
    ST_UB(q3, src);
    src += pitch;

    /* q4 */
    tmp0_r = q7_r_in - q3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p3_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q3_l_in;
    tmp0_l += q4_l_in;
    tmp0_l -= p3_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
    ST_UB(q4, src);
    src += pitch;

    /* q5 */
    tmp0_r = q7_r_in - q4_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p2_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q4_l_in;
    tmp0_l += q5_l_in;
    tmp0_l -= p2_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
    ST_UB(q5, src);
    src += pitch;

    /* q6 */
    tmp0_r = q7_r_in - q5_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p1_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q5_l_in;
    tmp0_l += q6_l_in;
    tmp0_l -= p1_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
    ST_UB(q6, src);
  }
}
406
/* 16-column horizontal macroblock-edge filter.
 *
 * Runs the combined 4/8-tap stage first; that stage reports whether any
 * pixels need the wide (16-tap) filter and, if so, leaves its intermediate
 * rows in the aligned filter48 scratch buffer for the second stage.
 * The count parameter is accepted for signature compatibility only. */
static void mb_lpf_horizontal_edge_dual(uint8_t *src, int32_t pitch,
                                        const uint8_t *b_limit_ptr,
                                        const uint8_t *limit_ptr,
                                        const uint8_t *thresh_ptr,
                                        int32_t count) {
  /* Scratch: 6 candidate rows + flat mask, 16 bytes each. */
  DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]);

  (void)count;

  if (!aom_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr,
                                limit_ptr, thresh_ptr)) {
    /* Some pixels were flat over the wide support: finish with 16-tap. */
    aom_hz_lpf_t16_16w(src, pitch, filter48);
  }
}
424
/* Horizontal macroblock-edge filter dispatcher.
 *
 * count == 1: filter 8 columns in place. Only the low 8 lanes of each
 * vector are meaningful, so the flat mask is narrowed with ilvr_d and
 * results are committed with 64-bit scalar stores (SD/SD4). The cascade
 * is: 4-tap always, 8-tap where flat, 15-tap where flat2 — each level
 * falls back to the previous level's output via bmnz selects.
 * count != 1: delegate to the 16-column dual path. */
static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch,
                                   const uint8_t *b_limit_ptr,
                                   const uint8_t *limit_ptr,
                                   const uint8_t *thresh_ptr, int32_t count) {
  if (1 == count) {
    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
    uint64_t dword0, dword1;
    v16u8 flat2, mask, hev, flat, thresh, b_limit, limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v16u8 p0_filter16, p1_filter16;
    v8i16 p2_filter8, p1_filter8, p0_filter8;
    v8i16 q0_filter8, q1_filter8, q2_filter8;
    v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
    v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
    v16i8 zero = { 0 };
    v8u16 tmp0, tmp1, tmp2;

    /* load vector elements */
    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8)__msa_fill_b(*thresh_ptr);
    b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
    limit = (v16u8)__msa_fill_b(*limit_ptr);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
                 mask, flat);
    AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    /* Keep only the low 8 lanes of the flat mask (8-column path). */
    flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

    if (__msa_test_bz_v(flat)) {
      /* No flat pixels: 4-tap result is final; store 4 x 8 bytes. */
      p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
      p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
      q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
      q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
      SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
    } else {
      /* convert 8 bit input data into 16 bit */
      ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
                 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
                 q3_r);
      AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
                  p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);

      /* convert 16 bit output data into 8 bit */
      PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero,
                  q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8);
      PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);

      /* store pixel values */
      /* Per-lane select: 8-tap where flat, 4-tap/original elsewhere. */
      p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
      p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
      p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
      q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
      q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
      q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);

      /* load 16 vector elements */
      LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
      LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);

      AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

      if (__msa_test_bz_v(flat2)) {
        /* Wide filter not needed anywhere: commit the 4/8-tap rows. */
        p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
        p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
        p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
        q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
        q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
        q2_d = __msa_copy_u_d((v2i64)q2_out, 0);

        SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
        SD(q1_d, src + pitch);
        SD(q2_d, src + 2 * pitch);
      } else {
        /* LSB(right) 8 pixel operation */
        ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4, zero, q5,
                   zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r, q4_r, q5_r, q6_r,
                   q7_r);

        /* Initial 15-tap window: 7*p7 + p6 + q0 (rest added below). */
        tmp0 = p7_r << 3;
        tmp0 -= p7_r;
        tmp0 += p6_r;
        tmp0 += q0_r;

        src -= 7 * pitch;

        /* calculation of p6 and p5 */
        tmp1 = p6_r + p5_r + p4_r + p3_r;
        tmp1 += (p2_r + p1_r + p0_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        /* Slide the window one sample: +q1 -p7, swap p5 in / p6 out. */
        tmp0 = p5_r - p6_r + q1_r - p7_r;
        tmp1 += tmp0;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of p4 and p3 */
        tmp0 = p4_r - p5_r + q2_r - p7_r;
        tmp2 = p3_r - p4_r + q3_r - p7_r;
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of p2 and p1 */
        /* Fallback here is the 8-tap output (p2_out/p1_out), not raw. */
        tmp0 = p2_r - p3_r + q4_r - p7_r;
        tmp2 = p1_r - p2_r + q5_r - p7_r;
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of p0 and q0 */
        tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
        tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of q1 and q2 */
        tmp0 = q7_r - q0_r + q1_r - p6_r;
        tmp2 = q7_r - q1_r + q2_r - p5_r;
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of q3 and q4 */
        tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
        tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of q5 and q6 */
        tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
        tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
      }
    }
  } else {
    mb_lpf_horizontal_edge_dual(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr,
                                count);
  }
}
647
/* Public entry point: 16-tap horizontal loop filter for a single
 * 8-column block (count = 1 selects the narrow scalar-store path). */
void aom_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch,
                               const uint8_t *b_limit_ptr,
                               const uint8_t *limit_ptr,
                               const uint8_t *thresh_ptr) {
  mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 1);
}
654
/* Public entry point: 16-tap horizontal loop filter across two adjacent
 * 8-column blocks at once (count = 2 selects the 16-wide vector path). */
void aom_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
                                    const uint8_t *b_limit_ptr,
                                    const uint8_t *limit_ptr,
                                    const uint8_t *thresh_ptr) {
  mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 2);
}
661
/* Transpose a 16-column x 8-row byte region (input, stride in_pitch) into
 * a 8-column x 16-row region (output, stride out_pitch), used to turn a
 * vertical edge into a horizontal one before filtering. The first 8x8
 * quadrant goes through the TRANSPOSE8x8 macro; the second is built by
 * hand with interleave/shift intrinsics. */
static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
                                   uint8_t *output, int32_t out_pitch) {
  v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
  v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;

  LD_UB8(input, in_pitch, p7_org, p6_org, p5_org, p4_org, p3_org, p2_org,
         p1_org, p0_org);
  /* 8x8 transpose */
  TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
                     p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
  /* 8x8 transpose */
  /* High halves of the source rows, interleaved progressively at byte,
   * then word granularity; the odd rows q1/q3/q5/q7 are obtained by
   * shifting each even result left by 8 bytes. */
  ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
             tmp0, tmp1, tmp2, tmp3);
  ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
  ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
  ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
  ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
  SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8);

  ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
  output += (8 * out_pitch);
  ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
}
686
/* Transpose an 8-column x 16-row byte region (input, stride in_pitch)
 * back into 16-column x 8-row form (output, stride out_pitch) — the
 * inverse reshaping of transpose_16x8_to_8x16(), used to write filtered
 * rows back across a vertical edge. */
static void transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
                                   uint8_t *output, int32_t out_pitch) {
  v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;

  /* Load the 16 rows in two groups of 8, transpose 16x8 in one macro. */
  LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0);
  LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7);
  TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
                      q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
  ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
}
698
/* Full 16x16 byte transpose from input (stride in_pitch) to output
 * (stride out_pitch). The top 16x8 half is produced by the TRANSPOSE16x8
 * macro into p7..p0; the bottom half (q0..q7) is assembled manually with
 * a cascade of interleave intrinsics. Note q4..q7 are reused as scratch
 * registers partway through before receiving their final values. */
static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output,
                            int32_t out_pitch) {
  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
  v4i32 tmp2, tmp3;

  LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
  input += (8 * in_pitch);
  LD_UB8(input, in_pitch, row8, row9, row10, row11, row12, row13, row14, row15);

  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
                      row9, row10, row11, row12, row13, row14, row15, p7, p6,
                      p5, p4, p3, p2, p1, p0);

  /* transpose 16x8 matrix into 8x16 */
  /* total 8 intermediate register and 32 instructions */
  /* Pair the high doublewords of mirrored rows (row0|row8, row1|row9, …). */
  q7 = (v16u8)__msa_ilvod_d((v2i64)row8, (v2i64)row0);
  q6 = (v16u8)__msa_ilvod_d((v2i64)row9, (v2i64)row1);
  q5 = (v16u8)__msa_ilvod_d((v2i64)row10, (v2i64)row2);
  q4 = (v16u8)__msa_ilvod_d((v2i64)row11, (v2i64)row3);
  q3 = (v16u8)__msa_ilvod_d((v2i64)row12, (v2i64)row4);
  q2 = (v16u8)__msa_ilvod_d((v2i64)row13, (v2i64)row5);
  q1 = (v16u8)__msa_ilvod_d((v2i64)row14, (v2i64)row6);
  q0 = (v16u8)__msa_ilvod_d((v2i64)row15, (v2i64)row7);

  /* Byte-level even/odd interleave of the paired rows. */
  ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1);
  tmp4 = (v8i16)__msa_ilvod_b((v16i8)q6, (v16i8)q7);
  tmp5 = (v8i16)__msa_ilvod_b((v16i8)q4, (v16i8)q5);

  /* q5/q7 are clobbered here as scratch before their final assignment. */
  ILVEV_B2_UB(q3, q2, q1, q0, q5, q7);
  tmp6 = (v8i16)__msa_ilvod_b((v16i8)q2, (v16i8)q3);
  tmp7 = (v8i16)__msa_ilvod_b((v16i8)q0, (v16i8)q1);

  /* Halfword and word interleaves complete the bottom-half transpose. */
  ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3);
  q0 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q4 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  tmp2 = (v4i32)__msa_ilvod_h(tmp1, tmp0);
  tmp3 = (v4i32)__msa_ilvod_h((v8i16)q7, (v8i16)q5);
  q2 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q6 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3);
  q1 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q5 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  tmp2 = (v4i32)__msa_ilvod_h(tmp5, tmp4);
  tmp3 = (v4i32)__msa_ilvod_h(tmp7, tmp6);
  q3 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q7 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
  output += (8 * out_pitch);
  ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
}
756
/* Vertical loop filter, 8-wide: first stage (4-tap + 8-tap) operating on
 * pre-transposed data.
 *
 * src points into a 16-byte-stride transpose buffer (hence the fixed
 * stride of 16), while src_org/pitch_org address the original image for
 * the early-exit store. On early exit (nothing flat) the 4-tap results
 * are re-interleaved into columns and written back to src_org, returning
 * 1. Otherwise the candidate rows and the flat mask are stored to
 * filter48 (same 7 x 16 layout as the horizontal variant) and 0 is
 * returned for the caller's 16-tap stage. */
int32_t aom_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
                                uint8_t *src_org, int32_t pitch_org,
                                const uint8_t *b_limit_ptr,
                                const uint8_t *limit_ptr,
                                const uint8_t *thresh_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v16i8 zero = { 0 };
  v8i16 vec0, vec1, vec2, vec3;

  /* load vector elements */
  /* Transpose buffer rows: fixed 16-byte stride. */
  LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* flat4 */
  AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  /* Keep only the low 8 lanes of the flat mask (8-wide path). */
  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

  if (__msa_test_bz_v(flat)) {
    /* Re-interleave the 4 filtered rows into 4-byte column groups and
     * store them back across the vertical edge at src_org - 2. */
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org);
    return 1;
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    /* convert 16 bit output data into 8 bit */
    p2_r = (v8u16)__msa_pckev_b((v16i8)p2_filt8_r, (v16i8)p2_filt8_r);
    p1_r = (v8u16)__msa_pckev_b((v16i8)p1_filt8_r, (v16i8)p1_filt8_r);
    p0_r = (v8u16)__msa_pckev_b((v16i8)p0_filt8_r, (v16i8)p0_filt8_r);
    q0_r = (v8u16)__msa_pckev_b((v16i8)q0_filt8_r, (v16i8)q0_filt8_r);
    q1_r = (v8u16)__msa_pckev_b((v16i8)q1_filt8_r, (v16i8)q1_filt8_r);
    q2_r = (v8u16)__msa_pckev_b((v16i8)q2_filt8_r, (v16i8)q2_filt8_r);

    /* store pixel values */
    /* Select 8-tap result where flat, 4-tap/original elsewhere. */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_r, flat);

    /* filter48 layout: p2, p1, p0, q0, q1, q2, flat — 16 bytes each. */
    ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
    filter48 += (4 * 16);
    ST_UB2(q1_out, q2_out, filter48, 16);
    filter48 += (2 * 16);
    ST_UB(flat, filter48);

    return 0;
  }
}
823
/* Second (wide / "flat2") stage of the vertical 16-tap loop filter for an
 * 8-row edge, operating on a transposed working buffer with pitch 16.
 *
 * src      - points at q0 inside the transposed buffer; p7..p0 occupy the
 *            eight 16-byte rows immediately before it, q0..q7 follow.
 * src_org  - position in the original (un-transposed) frame, used only on
 *            the early-exit path.
 * pitch    - stride of the original frame.
 * filter48 - scratch written by the preceding t4/t8 stage: six 16-byte rows
 *            holding the filter8 outputs p2, p1, p0, q0, q1, q2, followed by
 *            the 8-bit `flat` mask at offset 6 * 16.
 *
 * Returns 1 when no column needs the wide filter (the stage-1 results are
 * transposed and stored straight to src_org), otherwise 0 after writing the
 * 15-tap results back into the transposed buffer for a later transpose.
 */
int32_t aom_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch,
                          uint8_t *filter48) {
  v16i8 zero = { 0 };
  v16u8 filter8, flat, flat2;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
  v8u16 tmp0_r, tmp1_r;
  v8i16 r_out;

  /* flat mask saved by the t4/t8 stage */
  flat = LD_UB(filter48 + 6 * 16);

  LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
  LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);

  /* flat2: mask of columns where the outermost taps are also flat enough
     for the 15-tap filter */
  AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

  if (__msa_test_bz_v(flat2)) {
    /* No column needs the wide filter: transpose the six stage-1 output
       rows (p2..q2 from filter48) and store them directly to the frame,
       6 bytes per row at src_org - 3. */
    v8i16 vec0, vec1, vec2, vec3, vec4;

    LD_UB4(filter48, 16, p2, p1, p0, q0);
    LD_UB2(filter48 + 4 * 16, 16, q1, q2);

    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec3, vec4);
    vec2 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);

    src_org -= 3;
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec2, 0, (src_org + 4), pitch);
    src_org += (4 * pitch);
    ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec2, 4, (src_org + 4), pitch);

    return 1;
  } else {
    /* Wide 15-tap filter. Each output is (window sum + 8) >> 4 over a
       16-sample window, computed incrementally: tmp1_r carries the running
       sum; every step adds the tap entering the window and subtracts the
       one leaving it, then rounds with __msa_srari_h(.., 4). Only the
       right (low) 8 bytes of each vector are processed, since this variant
       filters 8 rows. Rewind src so stores run p6, p5, ..., q6. */
    src -= 7 * 16;

    /* widen p7..p0 and q0 to 16 bits (low halves) */
    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
               p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
               p2_r_in, p1_r_in, p0_r_in);
    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);

    /* p6: initial window = 7*p7 + p6 + q0 (tmp0_r) plus p6..p0 (tmp1_r) */
    tmp0_r = p7_r_in << 3;
    tmp0_r -= p7_r_in;
    tmp0_r += p6_r_in;
    tmp0_r += q0_r_in;
    tmp1_r = p6_r_in + p5_r_in;
    tmp1_r += p4_r_in;
    tmp1_r += p3_r_in;
    tmp1_r += p2_r_in;
    tmp1_r += p1_r_in;
    tmp1_r += p0_r_in;
    tmp1_r += tmp0_r;

    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    /* keep the original pixel wherever flat2 is clear */
    p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
    ST8x1_UB(p6, src);
    src += 16;

    /* p5 */
    q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
    tmp0_r = p5_r_in - p6_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
    ST8x1_UB(p5, src);
    src += 16;

    /* p4 */
    q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
    tmp0_r = p4_r_in - p5_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
    ST8x1_UB(p4, src);
    src += 16;

    /* p3 */
    q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
    tmp0_r = p3_r_in - p4_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
    ST8x1_UB(p3, src);
    src += 16;

    /* p2: from here on, blend against the stage-1 filter8 output loaded
       from filter48 rather than the raw pixel */
    q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
    filter8 = LD_UB(filter48);
    tmp0_r = p2_r_in - p3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* p1 */
    q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
    filter8 = LD_UB(filter48 + 16);
    tmp0_r = p1_r_in - p2_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* p0 */
    q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
    filter8 = LD_UB(filter48 + 32);
    tmp0_r = p0_r_in - p1_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* q0 */
    q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
    filter8 = LD_UB(filter48 + 48);
    tmp0_r = q7_r_in - p0_r_in;
    tmp0_r += q0_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* q1 */
    filter8 = LD_UB(filter48 + 64);
    tmp0_r = q7_r_in - q0_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p6_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* q2 */
    filter8 = LD_UB(filter48 + 80);
    tmp0_r = q7_r_in - q1_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p5_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* q3: back to blending against the raw pixel */
    tmp0_r = q7_r_in - q2_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p4_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
    ST8x1_UB(q3, src);
    src += 16;

    /* q4 */
    tmp0_r = q7_r_in - q3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p3_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
    ST8x1_UB(q4, src);
    src += 16;

    /* q5 */
    tmp0_r = q7_r_in - q4_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p2_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
    ST8x1_UB(q5, src);
    src += 16;

    /* q6 */
    tmp0_r = q7_r_in - q5_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p1_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
    ST8x1_UB(q6, src);

    return 0;
  }
}
1043
/* Vertical 16-tap loop filter for a single 8-row edge.
 *
 * The vertical edge is rotated into a row-major scratch buffer (pitch 16)
 * so the horizontal filter kernels can operate on it, then rotated back
 * only if the wide-filter path actually modified the buffer.
 */
void aom_lpf_vertical_16_msa(uint8_t *src, int32_t pitch,
                             const uint8_t *b_limit_ptr,
                             const uint8_t *limit_ptr,
                             const uint8_t *thresh_ptr) {
  DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
  /* rows 16..21 hold the filter8 scratch, row 22 the flat mask */
  uint8_t *const filter48 = transposed_input + 16 * 16;

  transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);

  /* Stage 1 (filter4/filter8): a nonzero return means results were stored
     straight to the frame and no wide filtering is needed. */
  if (aom_vt_lpf_t4_and_t8_8w(transposed_input + 16 * 8, filter48, src, pitch,
                              b_limit_ptr, limit_ptr, thresh_ptr)) {
    return;
  }

  /* Stage 2 (15-tap flat2 filter): nonzero return means it stored its own
     early-exit results; otherwise the scratch buffer holds the output. */
  if (aom_vt_lpf_t16_8w(transposed_input + 16 * 8, src, pitch, filter48)) {
    return;
  }

  transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
}
1067
/* First stage (filter4 + filter8) of the dual (16-row) vertical 16-tap
 * loop filter, operating on rows of the transposed working buffer.
 *
 * src      - points at q0 in the transposed buffer (pitch 16); p3..p0
 *            occupy the four rows before it, q1..q3 follow.
 * filter48 - scratch output: six 16-byte rows p2, p1, p0, q0, q1, q2,
 *            followed by the `flat` mask at offset 6 * 16.
 * src_org  - original frame position, used only on the early-exit path.
 * pitch    - stride of the original frame.
 * b_limit_ptr / limit_ptr / thresh_ptr - scalar filter thresholds,
 *            broadcast across the vector lanes.
 *
 * Returns 1 when `flat` is zero everywhere (the filter4 results were
 * already transposed and stored to src_org), otherwise 0 with the blended
 * filter8 results left in filter48 for the t16 stage.
 */
int32_t aom_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
                                 uint8_t *src_org, int32_t pitch,
                                 const uint8_t *b_limit_ptr,
                                 const uint8_t *limit_ptr,
                                 const uint8_t *thresh_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16i8 zero = { 0 };
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5;

  /* load vector elements */
  LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);

  /* broadcast scalar thresholds to all lanes */
  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* flat4 */
  AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  if (__msa_test_bz_v(flat)) {
    /* filter8 not needed anywhere: transpose the four filter4 output rows
       into 4-byte columns and store to the frame at src_org - 2 */
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec4, vec5);

    src_org -= 2;
    ST4x8_UB(vec2, vec3, src_org, pitch);
    src_org += 8 * pitch;
    ST4x8_UB(vec4, vec5, src_org, pitch);

    return 1;
  } else {
    /* widen to 16 bits: right (low) halves ... */
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
    /* ... and left (high) halves */
    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
    AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values: take the filter8 result where flat is set,
       otherwise the filter4 result (raw pixel for p2/q2, which filter4
       does not touch) */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    /* stash results plus the flat mask for the t16 stage */
    ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
    filter48 += (4 * 16);
    ST_UB2(q1_out, q2_out, filter48, 16);
    filter48 += (2 * 16);
    ST_UB(flat, filter48);

    return 0;
  }
}
1144
/* Second (wide / "flat2") stage of the dual (16-row) vertical 16-tap loop
 * filter, operating on a transposed working buffer with pitch 16.
 *
 * src      - points at q0 in the transposed buffer; p7..p0 occupy the eight
 *            rows before it, q0..q7 follow.
 * src_org  - original frame position, used only on the early-exit path.
 * pitch    - stride of the original frame.
 * filter48 - scratch from the t4/t8 stage: six 16-byte rows p2, p1, p0, q0,
 *            q1, q2 followed by the `flat` mask at offset 6 * 16.
 *
 * Unlike the 8w variant, both the right (low) and left (high) 8-byte halves
 * of each vector are filtered (suffixes _r / _l), covering 16 rows.
 *
 * Returns 1 when no column needs the wide filter (stage-1 results stored
 * straight to src_org), otherwise 0 after writing the 15-tap results back
 * into the transposed buffer.
 */
int32_t aom_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch,
                           uint8_t *filter48) {
  v16u8 flat, flat2, filter8;
  v16i8 zero = { 0 };
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
  v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
  v8i16 l_out, r_out;

  /* flat mask saved by the t4/t8 stage */
  flat = LD_UB(filter48 + 6 * 16);

  LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
  LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);

  /* flat2: mask of columns flat enough for the 15-tap filter */
  AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

  if (__msa_test_bz_v(flat2)) {
    /* No column needs the wide filter: transpose the six stage-1 output
       rows (p2..q2 from filter48) and store 6 bytes per row at
       src_org - 3, four rows at a time. */
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;

    LD_UB4(filter48, 16, p2, p1, p0, q0);
    LD_UB2(filter48 + 4 * 16, 16, q1, q2);

    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec3, vec4);
    ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec6, vec7);
    ILVRL_B2_SH(q2, q1, vec2, vec5);

    src_org -= 3;
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec2, 0, (src_org + 4), pitch);
    src_org += (4 * pitch);
    ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec2, 4, (src_org + 4), pitch);
    src_org += (4 * pitch);
    ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec5, 0, (src_org + 4), pitch);
    src_org += (4 * pitch);
    ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec5, 4, (src_org + 4), pitch);

    return 1;
  } else {
    /* Wide 15-tap filter. Each output is (window sum + 8) >> 4 over a
       16-sample window, computed incrementally: tmp1_r / tmp1_l carry
       running sums for the two halves; every step adds the tap entering
       the window and subtracts the one leaving it, then rounds with
       __msa_srari_h(.., 4). Rewind src so stores run p6, p5, ..., q6. */
    src -= 7 * 16;

    /* widen p7..p0 and q0: right (low) halves */
    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
               p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
               p2_r_in, p1_r_in, p0_r_in);
    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);

    /* p6 (right): initial window = 7*p7 + p6 + q0 plus p6..p0 */
    tmp0_r = p7_r_in << 3;
    tmp0_r -= p7_r_in;
    tmp0_r += p6_r_in;
    tmp0_r += q0_r_in;
    tmp1_r = p6_r_in + p5_r_in;
    tmp1_r += p4_r_in;
    tmp1_r += p3_r_in;
    tmp1_r += p2_r_in;
    tmp1_r += p1_r_in;
    tmp1_r += p0_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    /* widen p7..p0 and q0: left (high) halves */
    ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
               p5_l_in, p4_l_in);
    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
               p1_l_in, p0_l_in);
    q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);

    /* p6 (left) */
    tmp0_l = p7_l_in << 3;
    tmp0_l -= p7_l_in;
    tmp0_l += p6_l_in;
    tmp0_l += q0_l_in;
    tmp1_l = p6_l_in + p5_l_in;
    tmp1_l += p4_l_in;
    tmp1_l += p3_l_in;
    tmp1_l += p2_l_in;
    tmp1_l += p1_l_in;
    tmp1_l += p0_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    /* pack both halves to 8 bits; keep the original pixel wherever flat2
       is clear */
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
    ST_UB(p6, src);
    src += 16;

    /* p5 */
    q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
    tmp0_r = p5_r_in - p6_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
    tmp0_l = p5_l_in - p6_l_in;
    tmp0_l += q1_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
    ST_UB(p5, src);
    src += 16;

    /* p4 */
    q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
    tmp0_r = p4_r_in - p5_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
    tmp0_l = p4_l_in - p5_l_in;
    tmp0_l += q2_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
    ST_UB(p4, src);
    src += 16;

    /* p3 */
    q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
    tmp0_r = p3_r_in - p4_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
    tmp0_l = p3_l_in - p4_l_in;
    tmp0_l += q3_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
    ST_UB(p3, src);
    src += 16;

    /* p2: from here on, blend against the stage-1 filter8 output loaded
       from filter48 rather than the raw pixel */
    q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
    filter8 = LD_UB(filter48);
    tmp0_r = p2_r_in - p3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
    tmp0_l = p2_l_in - p3_l_in;
    tmp0_l += q4_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* p1 */
    q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
    filter8 = LD_UB(filter48 + 16);
    tmp0_r = p1_r_in - p2_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
    tmp0_l = p1_l_in - p2_l_in;
    tmp0_l += q5_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)(tmp1_l), 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* p0 */
    q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
    filter8 = LD_UB(filter48 + 32);
    tmp0_r = p0_r_in - p1_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
    tmp0_l = p0_l_in - p1_l_in;
    tmp0_l += q6_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* q0 */
    q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
    filter8 = LD_UB(filter48 + 48);
    tmp0_r = q7_r_in - p0_r_in;
    tmp0_r += q0_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
    tmp0_l = q7_l_in - p0_l_in;
    tmp0_l += q0_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* q1 */
    filter8 = LD_UB(filter48 + 64);
    tmp0_r = q7_r_in - q0_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p6_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q0_l_in;
    tmp0_l += q1_l_in;
    tmp0_l -= p6_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* q2 */
    filter8 = LD_UB(filter48 + 80);
    tmp0_r = q7_r_in - q1_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p5_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q1_l_in;
    tmp0_l += q2_l_in;
    tmp0_l -= p5_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* q3: back to blending against the raw pixel */
    tmp0_r = q7_r_in - q2_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p4_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q2_l_in;
    tmp0_l += q3_l_in;
    tmp0_l -= p4_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
    ST_UB(q3, src);
    src += 16;

    /* q4 */
    tmp0_r = q7_r_in - q3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p3_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q3_l_in;
    tmp0_l += q4_l_in;
    tmp0_l -= p3_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
    ST_UB(q4, src);
    src += 16;

    /* q5 */
    tmp0_r = q7_r_in - q4_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p2_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q4_l_in;
    tmp0_l += q5_l_in;
    tmp0_l -= p2_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
    ST_UB(q5, src);
    src += 16;

    /* q6 */
    tmp0_r = q7_r_in - q5_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p1_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q5_l_in;
    tmp0_l += q6_l_in;
    tmp0_l -= p1_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
    ST_UB(q6, src);

    return 0;
  }
}
1465
/* Vertical 16-tap loop filter for a dual (16-row) edge.
 *
 * The 16x16 pixel neighborhood around the edge is transposed into a
 * row-major scratch buffer (pitch 16), filtered with the horizontal
 * kernels, and transposed back only when the wide-filter path actually
 * modified the buffer.
 */
void aom_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch,
                                  const uint8_t *b_limit_ptr,
                                  const uint8_t *limit_ptr,
                                  const uint8_t *thresh_ptr) {
  DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
  /* rows 16..21 hold the filter8 scratch, row 22 the flat mask */
  uint8_t *const filter48 = transposed_input + 16 * 16;

  transpose_16x16(src - 8, pitch, &transposed_input[0], 16);

  /* Stage 1 (filter4/filter8): a nonzero return means results were stored
     straight to the frame and no wide filtering is needed. */
  if (aom_vt_lpf_t4_and_t8_16w(transposed_input + 16 * 8, filter48, src,
                               pitch, b_limit_ptr, limit_ptr, thresh_ptr)) {
    return;
  }

  /* Stage 2 (15-tap flat2 filter): nonzero return means it stored its own
     early-exit results; otherwise the scratch buffer holds the output. */
  if (aom_vt_lpf_t16_16w(transposed_input + 16 * 8, src, pitch, filter48)) {
    return;
  }

  transpose_16x16(transposed_input, 16, src - 8, pitch);
}
1489