1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include "aom_ports/mem.h"
13 #include "aom_dsp/mips/loopfilter_msa.h"
14
aom_hz_lpf_t4_and_t8_16w(uint8_t * src,int32_t pitch,uint8_t * filter48,const uint8_t * b_limit_ptr,const uint8_t * limit_ptr,const uint8_t * thresh_ptr)15 int32_t aom_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48,
16 const uint8_t *b_limit_ptr,
17 const uint8_t *limit_ptr,
18 const uint8_t *thresh_ptr) {
19 v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
20 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
21 v16u8 flat, mask, hev, thresh, b_limit, limit;
22 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
23 v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
24 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
25 v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
26 v16u8 zero = { 0 };
27
28 /* load vector elements */
29 LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
30
31 thresh = (v16u8)__msa_fill_b(*thresh_ptr);
32 b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
33 limit = (v16u8)__msa_fill_b(*limit_ptr);
34
35 /* mask and hev */
36 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
37 mask, flat);
38 AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
39 AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
40
41 if (__msa_test_bz_v(flat)) {
42 ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
43
44 return 1;
45 } else {
46 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
47 q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
48 AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
49 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
50
51 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
52 ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
53 AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
54 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
55
56 /* convert 16 bit output data into 8 bit */
57 PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
58 p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
59 p0_filt8_r, q0_filt8_r);
60 PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
61 q2_filt8_r);
62
63 /* store pixel values */
64 p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
65 p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
66 p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
67 q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
68 q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
69 q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
70
71 ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
72 filter48 += (4 * 16);
73 ST_UB2(q1_out, q2_out, filter48, 16);
74 filter48 += (2 * 16);
75 ST_UB(flat, filter48);
76
77 return 0;
78 }
79 }
80
aom_hz_lpf_t16_16w(uint8_t * src,int32_t pitch,uint8_t * filter48)81 void aom_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
82 v16u8 flat, flat2, filter8;
83 v16i8 zero = { 0 };
84 v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
85 v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
86 v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
87 v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
88 v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
89 v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
90 v8i16 l_out, r_out;
91
92 flat = LD_UB(filter48 + 96);
93
94 LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
95 LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
96 AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
97
98 if (__msa_test_bz_v(flat2)) {
99 LD_UB4(filter48, 16, p2, p1, p0, q0);
100 LD_UB2(filter48 + 4 * 16, 16, q1, q2);
101
102 src -= 3 * pitch;
103 ST_UB4(p2, p1, p0, q0, src, pitch);
104 src += (4 * pitch);
105 ST_UB2(q1, q2, src, pitch);
106 } else {
107 src -= 7 * pitch;
108
109 ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
110 p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
111 p2_r_in, p1_r_in, p0_r_in);
112
113 q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
114
115 tmp0_r = p7_r_in << 3;
116 tmp0_r -= p7_r_in;
117 tmp0_r += p6_r_in;
118 tmp0_r += q0_r_in;
119 tmp1_r = p6_r_in + p5_r_in;
120 tmp1_r += p4_r_in;
121 tmp1_r += p3_r_in;
122 tmp1_r += p2_r_in;
123 tmp1_r += p1_r_in;
124 tmp1_r += p0_r_in;
125 tmp1_r += tmp0_r;
126 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
127
128 ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
129 p5_l_in, p4_l_in);
130 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
131 p1_l_in, p0_l_in);
132 q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);
133
134 tmp0_l = p7_l_in << 3;
135 tmp0_l -= p7_l_in;
136 tmp0_l += p6_l_in;
137 tmp0_l += q0_l_in;
138 tmp1_l = p6_l_in + p5_l_in;
139 tmp1_l += p4_l_in;
140 tmp1_l += p3_l_in;
141 tmp1_l += p2_l_in;
142 tmp1_l += p1_l_in;
143 tmp1_l += p0_l_in;
144 tmp1_l += tmp0_l;
145 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
146
147 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
148 p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
149 ST_UB(p6, src);
150 src += pitch;
151
152 /* p5 */
153 q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
154 tmp0_r = p5_r_in - p6_r_in;
155 tmp0_r += q1_r_in;
156 tmp0_r -= p7_r_in;
157 tmp1_r += tmp0_r;
158 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
159
160 q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
161 tmp0_l = p5_l_in - p6_l_in;
162 tmp0_l += q1_l_in;
163 tmp0_l -= p7_l_in;
164 tmp1_l += tmp0_l;
165 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
166
167 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
168 p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
169 ST_UB(p5, src);
170 src += pitch;
171
172 /* p4 */
173 q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
174 tmp0_r = p4_r_in - p5_r_in;
175 tmp0_r += q2_r_in;
176 tmp0_r -= p7_r_in;
177 tmp1_r += tmp0_r;
178 r_out = (v8i16)__msa_srari_h((v8i16)tmp1_r, 4);
179
180 q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
181 tmp0_l = p4_l_in - p5_l_in;
182 tmp0_l += q2_l_in;
183 tmp0_l -= p7_l_in;
184 tmp1_l += tmp0_l;
185 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
186
187 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
188 p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
189 ST_UB(p4, src);
190 src += pitch;
191
192 /* p3 */
193 q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
194 tmp0_r = p3_r_in - p4_r_in;
195 tmp0_r += q3_r_in;
196 tmp0_r -= p7_r_in;
197 tmp1_r += tmp0_r;
198 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
199
200 q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
201 tmp0_l = p3_l_in - p4_l_in;
202 tmp0_l += q3_l_in;
203 tmp0_l -= p7_l_in;
204 tmp1_l += tmp0_l;
205 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
206
207 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
208 p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
209 ST_UB(p3, src);
210 src += pitch;
211
212 /* p2 */
213 q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
214 filter8 = LD_UB(filter48);
215 tmp0_r = p2_r_in - p3_r_in;
216 tmp0_r += q4_r_in;
217 tmp0_r -= p7_r_in;
218 tmp1_r += tmp0_r;
219 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
220
221 q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
222 tmp0_l = p2_l_in - p3_l_in;
223 tmp0_l += q4_l_in;
224 tmp0_l -= p7_l_in;
225 tmp1_l += tmp0_l;
226 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
227
228 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
229 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
230 ST_UB(filter8, src);
231 src += pitch;
232
233 /* p1 */
234 q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
235 filter8 = LD_UB(filter48 + 16);
236 tmp0_r = p1_r_in - p2_r_in;
237 tmp0_r += q5_r_in;
238 tmp0_r -= p7_r_in;
239 tmp1_r += tmp0_r;
240 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
241
242 q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
243 tmp0_l = p1_l_in - p2_l_in;
244 tmp0_l += q5_l_in;
245 tmp0_l -= p7_l_in;
246 tmp1_l += tmp0_l;
247 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
248
249 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
250 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
251 ST_UB(filter8, src);
252 src += pitch;
253
254 /* p0 */
255 q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
256 filter8 = LD_UB(filter48 + 32);
257 tmp0_r = p0_r_in - p1_r_in;
258 tmp0_r += q6_r_in;
259 tmp0_r -= p7_r_in;
260 tmp1_r += tmp0_r;
261 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
262
263 q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
264 tmp0_l = p0_l_in - p1_l_in;
265 tmp0_l += q6_l_in;
266 tmp0_l -= p7_l_in;
267 tmp1_l += tmp0_l;
268 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
269
270 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
271 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
272 ST_UB(filter8, src);
273 src += pitch;
274
275 /* q0 */
276 q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
277 filter8 = LD_UB(filter48 + 48);
278 tmp0_r = q7_r_in - p0_r_in;
279 tmp0_r += q0_r_in;
280 tmp0_r -= p7_r_in;
281 tmp1_r += tmp0_r;
282 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
283
284 q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
285 tmp0_l = q7_l_in - p0_l_in;
286 tmp0_l += q0_l_in;
287 tmp0_l -= p7_l_in;
288 tmp1_l += tmp0_l;
289 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
290
291 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
292 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
293 ST_UB(filter8, src);
294 src += pitch;
295
296 /* q1 */
297 filter8 = LD_UB(filter48 + 64);
298 tmp0_r = q7_r_in - q0_r_in;
299 tmp0_r += q1_r_in;
300 tmp0_r -= p6_r_in;
301 tmp1_r += tmp0_r;
302 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
303
304 tmp0_l = q7_l_in - q0_l_in;
305 tmp0_l += q1_l_in;
306 tmp0_l -= p6_l_in;
307 tmp1_l += tmp0_l;
308 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
309
310 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
311 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
312 ST_UB(filter8, src);
313 src += pitch;
314
315 /* q2 */
316 filter8 = LD_UB(filter48 + 80);
317 tmp0_r = q7_r_in - q1_r_in;
318 tmp0_r += q2_r_in;
319 tmp0_r -= p5_r_in;
320 tmp1_r += tmp0_r;
321 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
322
323 tmp0_l = q7_l_in - q1_l_in;
324 tmp0_l += q2_l_in;
325 tmp0_l -= p5_l_in;
326 tmp1_l += tmp0_l;
327 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
328
329 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
330 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
331 ST_UB(filter8, src);
332 src += pitch;
333
334 /* q3 */
335 tmp0_r = q7_r_in - q2_r_in;
336 tmp0_r += q3_r_in;
337 tmp0_r -= p4_r_in;
338 tmp1_r += tmp0_r;
339 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
340
341 tmp0_l = q7_l_in - q2_l_in;
342 tmp0_l += q3_l_in;
343 tmp0_l -= p4_l_in;
344 tmp1_l += tmp0_l;
345 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
346
347 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
348 q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
349 ST_UB(q3, src);
350 src += pitch;
351
352 /* q4 */
353 tmp0_r = q7_r_in - q3_r_in;
354 tmp0_r += q4_r_in;
355 tmp0_r -= p3_r_in;
356 tmp1_r += tmp0_r;
357 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
358
359 tmp0_l = q7_l_in - q3_l_in;
360 tmp0_l += q4_l_in;
361 tmp0_l -= p3_l_in;
362 tmp1_l += tmp0_l;
363 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
364
365 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
366 q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
367 ST_UB(q4, src);
368 src += pitch;
369
370 /* q5 */
371 tmp0_r = q7_r_in - q4_r_in;
372 tmp0_r += q5_r_in;
373 tmp0_r -= p2_r_in;
374 tmp1_r += tmp0_r;
375 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
376
377 tmp0_l = q7_l_in - q4_l_in;
378 tmp0_l += q5_l_in;
379 tmp0_l -= p2_l_in;
380 tmp1_l += tmp0_l;
381 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
382
383 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
384 q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
385 ST_UB(q5, src);
386 src += pitch;
387
388 /* q6 */
389 tmp0_r = q7_r_in - q5_r_in;
390 tmp0_r += q6_r_in;
391 tmp0_r -= p1_r_in;
392 tmp1_r += tmp0_r;
393 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
394
395 tmp0_l = q7_l_in - q5_l_in;
396 tmp0_l += q6_l_in;
397 tmp0_l -= p1_l_in;
398 tmp1_l += tmp0_l;
399 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
400
401 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
402 q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
403 ST_UB(q6, src);
404 }
405 }
406
aom_lpf_horizontal_16_dual_msa(uint8_t * src,int32_t pitch,const uint8_t * b_limit_ptr,const uint8_t * limit_ptr,const uint8_t * thresh_ptr,int32_t count)407 void aom_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
408 const uint8_t *b_limit_ptr,
409 const uint8_t *limit_ptr,
410 const uint8_t *thresh_ptr, int32_t count) {
411 DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]);
412 uint8_t early_exit = 0;
413
414 (void)count;
415
416 early_exit = aom_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr,
417 limit_ptr, thresh_ptr);
418
419 if (0 == early_exit) {
420 aom_hz_lpf_t16_16w(src, pitch, filter48);
421 }
422 }
423
mb_lpf_horizontal_edge(uint8_t * src,int32_t pitch,const uint8_t * b_limit_ptr,const uint8_t * limit_ptr,const uint8_t * thresh_ptr,int32_t count)424 static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch,
425 const uint8_t *b_limit_ptr,
426 const uint8_t *limit_ptr,
427 const uint8_t *thresh_ptr, int32_t count) {
428 if (1 == count) {
429 uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
430 uint64_t dword0, dword1;
431 v16u8 flat2, mask, hev, flat, thresh, b_limit, limit;
432 v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
433 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
434 v16u8 p0_filter16, p1_filter16;
435 v8i16 p2_filter8, p1_filter8, p0_filter8;
436 v8i16 q0_filter8, q1_filter8, q2_filter8;
437 v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
438 v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
439 v16i8 zero = { 0 };
440 v8u16 tmp0, tmp1, tmp2;
441
442 /* load vector elements */
443 LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
444
445 thresh = (v16u8)__msa_fill_b(*thresh_ptr);
446 b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
447 limit = (v16u8)__msa_fill_b(*limit_ptr);
448
449 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
450 mask, flat);
451 AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
452 AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
453 q1_out);
454
455 flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
456
457 if (__msa_test_bz_v(flat)) {
458 p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
459 p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
460 q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
461 q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
462 SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
463 } else {
464 /* convert 8 bit input data into 16 bit */
465 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
466 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
467 q3_r);
468 AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
469 p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
470
471 /* convert 16 bit output data into 8 bit */
472 PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero,
473 q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8);
474 PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);
475
476 /* store pixel values */
477 p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
478 p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
479 p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
480 q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
481 q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
482 q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);
483
484 /* load 16 vector elements */
485 LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
486 LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);
487
488 AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
489
490 if (__msa_test_bz_v(flat2)) {
491 p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
492 p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
493 p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
494 q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
495 q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
496 q2_d = __msa_copy_u_d((v2i64)q2_out, 0);
497
498 SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
499 SD(q1_d, src + pitch);
500 SD(q2_d, src + 2 * pitch);
501 } else {
502 /* LSB(right) 8 pixel operation */
503 ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4, zero, q5,
504 zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r, q4_r, q5_r, q6_r,
505 q7_r);
506
507 tmp0 = p7_r << 3;
508 tmp0 -= p7_r;
509 tmp0 += p6_r;
510 tmp0 += q0_r;
511
512 src -= 7 * pitch;
513
514 /* calculation of p6 and p5 */
515 tmp1 = p6_r + p5_r + p4_r + p3_r;
516 tmp1 += (p2_r + p1_r + p0_r);
517 tmp1 += tmp0;
518 p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
519 tmp0 = p5_r - p6_r + q1_r - p7_r;
520 tmp1 += tmp0;
521 p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
522 PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
523 p1_filter16);
524 p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
525 p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
526 dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
527 dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
528 SD(dword0, src);
529 src += pitch;
530 SD(dword1, src);
531 src += pitch;
532
533 /* calculation of p4 and p3 */
534 tmp0 = p4_r - p5_r + q2_r - p7_r;
535 tmp2 = p3_r - p4_r + q3_r - p7_r;
536 tmp1 += tmp0;
537 p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
538 tmp1 += tmp2;
539 p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
540 PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
541 p1_filter16);
542 p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
543 p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
544 dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
545 dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
546 SD(dword0, src);
547 src += pitch;
548 SD(dword1, src);
549 src += pitch;
550
551 /* calculation of p2 and p1 */
552 tmp0 = p2_r - p3_r + q4_r - p7_r;
553 tmp2 = p1_r - p2_r + q5_r - p7_r;
554 tmp1 += tmp0;
555 p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
556 tmp1 += tmp2;
557 p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
558 PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
559 p1_filter16);
560 p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
561 p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
562 dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
563 dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
564 SD(dword0, src);
565 src += pitch;
566 SD(dword1, src);
567 src += pitch;
568
569 /* calculation of p0 and q0 */
570 tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
571 tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
572 tmp1 += tmp0;
573 p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
574 tmp1 += tmp2;
575 p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
576 PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
577 p1_filter16);
578 p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
579 p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
580 dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
581 dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
582 SD(dword0, src);
583 src += pitch;
584 SD(dword1, src);
585 src += pitch;
586
587 /* calculation of q1 and q2 */
588 tmp0 = q7_r - q0_r + q1_r - p6_r;
589 tmp2 = q7_r - q1_r + q2_r - p5_r;
590 tmp1 += tmp0;
591 p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
592 tmp1 += tmp2;
593 p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
594 PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
595 p1_filter16);
596 p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
597 p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
598 dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
599 dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
600 SD(dword0, src);
601 src += pitch;
602 SD(dword1, src);
603 src += pitch;
604
605 /* calculation of q3 and q4 */
606 tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
607 tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
608 tmp1 += tmp0;
609 p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
610 tmp1 += tmp2;
611 p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
612 PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
613 p1_filter16);
614 p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
615 p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
616 dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
617 dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
618 SD(dword0, src);
619 src += pitch;
620 SD(dword1, src);
621 src += pitch;
622
623 /* calculation of q5 and q6 */
624 tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
625 tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
626 tmp1 += tmp0;
627 p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
628 tmp1 += tmp2;
629 p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
630 PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
631 p1_filter16);
632 p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
633 p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
634 dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
635 dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
636 SD(dword0, src);
637 src += pitch;
638 SD(dword1, src);
639 }
640 }
641 } else {
642 aom_lpf_horizontal_16_dual_msa(src, pitch, b_limit_ptr, limit_ptr,
643 thresh_ptr, count);
644 }
645 }
646
/*
 * Wide horizontal-edge loop filter entry point, 8 columns.
 * Calls mb_lpf_horizontal_edge() with count == 1, which selects its in-place
 * 8-column path.
 */
void aom_lpf_horizontal_edge_8_msa(uint8_t *src, int32_t pitch,
                                   const uint8_t *b_limit_ptr,
                                   const uint8_t *limit_ptr,
                                   const uint8_t *thresh_ptr) {
  const int32_t count = 1;

  mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, count);
}
653
/*
 * Wide horizontal-edge loop filter entry point, 16 columns.
 * Calls mb_lpf_horizontal_edge() with count == 2; any count other than 1
 * makes it forward to aom_lpf_horizontal_16_dual_msa().
 */
void aom_lpf_horizontal_edge_16_msa(uint8_t *src, int32_t pitch,
                                    const uint8_t *b_limit_ptr,
                                    const uint8_t *limit_ptr,
                                    const uint8_t *thresh_ptr) {
  const int32_t count = 2;

  mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, count);
}
660
transpose_16x8_to_8x16(uint8_t * input,int32_t in_pitch,uint8_t * output,int32_t out_pitch)661 static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
662 uint8_t *output, int32_t out_pitch) {
663 v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
664 v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
665 v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
666
667 LD_UB8(input, in_pitch, p7_org, p6_org, p5_org, p4_org, p3_org, p2_org,
668 p1_org, p0_org);
669 /* 8x8 transpose */
670 TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
671 p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
672 /* 8x8 transpose */
673 ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
674 tmp0, tmp1, tmp2, tmp3);
675 ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
676 ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
677 ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
678 ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
679 SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8);
680
681 ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
682 output += (8 * out_pitch);
683 ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
684 }
685
transpose_8x16_to_16x8(uint8_t * input,int32_t in_pitch,uint8_t * output,int32_t out_pitch)686 static void transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
687 uint8_t *output, int32_t out_pitch) {
688 v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
689 v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
690
691 LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0);
692 LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7);
693 TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
694 q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
695 ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
696 }
697
transpose_16x16(uint8_t * input,int32_t in_pitch,uint8_t * output,int32_t out_pitch)698 static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output,
699 int32_t out_pitch) {
700 v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
701 v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
702 v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
703 v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
704 v4i32 tmp2, tmp3;
705
706 LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
707 input += (8 * in_pitch);
708 LD_UB8(input, in_pitch, row8, row9, row10, row11, row12, row13, row14, row15);
709
710 TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
711 row9, row10, row11, row12, row13, row14, row15, p7, p6,
712 p5, p4, p3, p2, p1, p0);
713
714 /* transpose 16x8 matrix into 8x16 */
715 /* total 8 intermediate register and 32 instructions */
716 q7 = (v16u8)__msa_ilvod_d((v2i64)row8, (v2i64)row0);
717 q6 = (v16u8)__msa_ilvod_d((v2i64)row9, (v2i64)row1);
718 q5 = (v16u8)__msa_ilvod_d((v2i64)row10, (v2i64)row2);
719 q4 = (v16u8)__msa_ilvod_d((v2i64)row11, (v2i64)row3);
720 q3 = (v16u8)__msa_ilvod_d((v2i64)row12, (v2i64)row4);
721 q2 = (v16u8)__msa_ilvod_d((v2i64)row13, (v2i64)row5);
722 q1 = (v16u8)__msa_ilvod_d((v2i64)row14, (v2i64)row6);
723 q0 = (v16u8)__msa_ilvod_d((v2i64)row15, (v2i64)row7);
724
725 ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1);
726 tmp4 = (v8i16)__msa_ilvod_b((v16i8)q6, (v16i8)q7);
727 tmp5 = (v8i16)__msa_ilvod_b((v16i8)q4, (v16i8)q5);
728
729 ILVEV_B2_UB(q3, q2, q1, q0, q5, q7);
730 tmp6 = (v8i16)__msa_ilvod_b((v16i8)q2, (v16i8)q3);
731 tmp7 = (v8i16)__msa_ilvod_b((v16i8)q0, (v16i8)q1);
732
733 ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3);
734 q0 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
735 q4 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
736
737 tmp2 = (v4i32)__msa_ilvod_h(tmp1, tmp0);
738 tmp3 = (v4i32)__msa_ilvod_h((v8i16)q7, (v8i16)q5);
739 q2 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
740 q6 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
741
742 ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3);
743 q1 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
744 q5 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
745
746 tmp2 = (v4i32)__msa_ilvod_h(tmp5, tmp4);
747 tmp3 = (v4i32)__msa_ilvod_h(tmp7, tmp6);
748 q3 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
749 q7 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
750
751 ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
752 output += (8 * out_pitch);
753 ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
754 }
755
aom_vt_lpf_t4_and_t8_8w(uint8_t * src,uint8_t * filter48,uint8_t * src_org,int32_t pitch_org,const uint8_t * b_limit_ptr,const uint8_t * limit_ptr,const uint8_t * thresh_ptr)756 int32_t aom_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
757 uint8_t *src_org, int32_t pitch_org,
758 const uint8_t *b_limit_ptr,
759 const uint8_t *limit_ptr,
760 const uint8_t *thresh_ptr) {
761 v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
762 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
763 v16u8 flat, mask, hev, thresh, b_limit, limit;
764 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
765 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
766 v16i8 zero = { 0 };
767 v8i16 vec0, vec1, vec2, vec3;
768
769 /* load vector elements */
770 LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
771
772 thresh = (v16u8)__msa_fill_b(*thresh_ptr);
773 b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
774 limit = (v16u8)__msa_fill_b(*limit_ptr);
775
776 /* mask and hev */
777 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
778 mask, flat);
779 /* flat4 */
780 AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
781 /* filter4 */
782 AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
783
784 flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
785
786 if (__msa_test_bz_v(flat)) {
787 ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
788 ILVRL_H2_SH(vec1, vec0, vec2, vec3);
789 ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org);
790 return 1;
791 } else {
792 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
793 q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
794 AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
795 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
796
797 /* convert 16 bit output data into 8 bit */
798 p2_r = (v8u16)__msa_pckev_b((v16i8)p2_filt8_r, (v16i8)p2_filt8_r);
799 p1_r = (v8u16)__msa_pckev_b((v16i8)p1_filt8_r, (v16i8)p1_filt8_r);
800 p0_r = (v8u16)__msa_pckev_b((v16i8)p0_filt8_r, (v16i8)p0_filt8_r);
801 q0_r = (v8u16)__msa_pckev_b((v16i8)q0_filt8_r, (v16i8)q0_filt8_r);
802 q1_r = (v8u16)__msa_pckev_b((v16i8)q1_filt8_r, (v16i8)q1_filt8_r);
803 q2_r = (v8u16)__msa_pckev_b((v16i8)q2_filt8_r, (v16i8)q2_filt8_r);
804
805 /* store pixel values */
806 p2_out = __msa_bmnz_v(p2, (v16u8)p2_r, flat);
807 p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_r, flat);
808 p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_r, flat);
809 q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_r, flat);
810 q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_r, flat);
811 q2_out = __msa_bmnz_v(q2, (v16u8)q2_r, flat);
812
813 ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
814 filter48 += (4 * 16);
815 ST_UB2(q1_out, q2_out, filter48, 16);
816 filter48 += (2 * 16);
817 ST_UB(flat, filter48);
818
819 return 0;
820 }
821 }
822
aom_vt_lpf_t16_8w(uint8_t * src,uint8_t * src_org,int32_t pitch,uint8_t * filter48)823 int32_t aom_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch,
824 uint8_t *filter48) {
825 v16i8 zero = { 0 };
826 v16u8 filter8, flat, flat2;
827 v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
828 v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
829 v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
830 v8u16 tmp0_r, tmp1_r;
831 v8i16 r_out;
832
833 flat = LD_UB(filter48 + 6 * 16);
834
835 LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
836 LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
837
838 AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
839
840 if (__msa_test_bz_v(flat2)) {
841 v8i16 vec0, vec1, vec2, vec3, vec4;
842
843 LD_UB4(filter48, 16, p2, p1, p0, q0);
844 LD_UB2(filter48 + 4 * 16, 16, q1, q2);
845
846 ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
847 ILVRL_H2_SH(vec1, vec0, vec3, vec4);
848 vec2 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);
849
850 src_org -= 3;
851 ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
852 ST2x4_UB(vec2, 0, (src_org + 4), pitch);
853 src_org += (4 * pitch);
854 ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
855 ST2x4_UB(vec2, 4, (src_org + 4), pitch);
856
857 return 1;
858 } else {
859 src -= 7 * 16;
860
861 ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
862 p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
863 p2_r_in, p1_r_in, p0_r_in);
864 q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
865
866 tmp0_r = p7_r_in << 3;
867 tmp0_r -= p7_r_in;
868 tmp0_r += p6_r_in;
869 tmp0_r += q0_r_in;
870 tmp1_r = p6_r_in + p5_r_in;
871 tmp1_r += p4_r_in;
872 tmp1_r += p3_r_in;
873 tmp1_r += p2_r_in;
874 tmp1_r += p1_r_in;
875 tmp1_r += p0_r_in;
876 tmp1_r += tmp0_r;
877
878 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
879 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
880 p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
881 ST8x1_UB(p6, src);
882 src += 16;
883
884 /* p5 */
885 q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
886 tmp0_r = p5_r_in - p6_r_in;
887 tmp0_r += q1_r_in;
888 tmp0_r -= p7_r_in;
889 tmp1_r += tmp0_r;
890 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
891 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
892 p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
893 ST8x1_UB(p5, src);
894 src += 16;
895
896 /* p4 */
897 q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
898 tmp0_r = p4_r_in - p5_r_in;
899 tmp0_r += q2_r_in;
900 tmp0_r -= p7_r_in;
901 tmp1_r += tmp0_r;
902 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
903 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
904 p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
905 ST8x1_UB(p4, src);
906 src += 16;
907
908 /* p3 */
909 q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
910 tmp0_r = p3_r_in - p4_r_in;
911 tmp0_r += q3_r_in;
912 tmp0_r -= p7_r_in;
913 tmp1_r += tmp0_r;
914 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
915 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
916 p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
917 ST8x1_UB(p3, src);
918 src += 16;
919
920 /* p2 */
921 q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
922 filter8 = LD_UB(filter48);
923 tmp0_r = p2_r_in - p3_r_in;
924 tmp0_r += q4_r_in;
925 tmp0_r -= p7_r_in;
926 tmp1_r += tmp0_r;
927 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
928 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
929 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
930 ST8x1_UB(filter8, src);
931 src += 16;
932
933 /* p1 */
934 q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
935 filter8 = LD_UB(filter48 + 16);
936 tmp0_r = p1_r_in - p2_r_in;
937 tmp0_r += q5_r_in;
938 tmp0_r -= p7_r_in;
939 tmp1_r += tmp0_r;
940 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
941 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
942 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
943 ST8x1_UB(filter8, src);
944 src += 16;
945
946 /* p0 */
947 q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
948 filter8 = LD_UB(filter48 + 32);
949 tmp0_r = p0_r_in - p1_r_in;
950 tmp0_r += q6_r_in;
951 tmp0_r -= p7_r_in;
952 tmp1_r += tmp0_r;
953 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
954 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
955 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
956 ST8x1_UB(filter8, src);
957 src += 16;
958
959 /* q0 */
960 q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
961 filter8 = LD_UB(filter48 + 48);
962 tmp0_r = q7_r_in - p0_r_in;
963 tmp0_r += q0_r_in;
964 tmp0_r -= p7_r_in;
965 tmp1_r += tmp0_r;
966 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
967 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
968 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
969 ST8x1_UB(filter8, src);
970 src += 16;
971
972 /* q1 */
973 filter8 = LD_UB(filter48 + 64);
974 tmp0_r = q7_r_in - q0_r_in;
975 tmp0_r += q1_r_in;
976 tmp0_r -= p6_r_in;
977 tmp1_r += tmp0_r;
978 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
979 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
980 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
981 ST8x1_UB(filter8, src);
982 src += 16;
983
984 /* q2 */
985 filter8 = LD_UB(filter48 + 80);
986 tmp0_r = q7_r_in - q1_r_in;
987 tmp0_r += q2_r_in;
988 tmp0_r -= p5_r_in;
989 tmp1_r += tmp0_r;
990 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
991 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
992 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
993 ST8x1_UB(filter8, src);
994 src += 16;
995
996 /* q3 */
997 tmp0_r = q7_r_in - q2_r_in;
998 tmp0_r += q3_r_in;
999 tmp0_r -= p4_r_in;
1000 tmp1_r += tmp0_r;
1001 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
1002 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
1003 q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
1004 ST8x1_UB(q3, src);
1005 src += 16;
1006
1007 /* q4 */
1008 tmp0_r = q7_r_in - q3_r_in;
1009 tmp0_r += q4_r_in;
1010 tmp0_r -= p3_r_in;
1011 tmp1_r += tmp0_r;
1012 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
1013 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
1014 q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
1015 ST8x1_UB(q4, src);
1016 src += 16;
1017
1018 /* q5 */
1019 tmp0_r = q7_r_in - q4_r_in;
1020 tmp0_r += q5_r_in;
1021 tmp0_r -= p2_r_in;
1022 tmp1_r += tmp0_r;
1023 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
1024 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
1025 q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
1026 ST8x1_UB(q5, src);
1027 src += 16;
1028
1029 /* q6 */
1030 tmp0_r = q7_r_in - q5_r_in;
1031 tmp0_r += q6_r_in;
1032 tmp0_r -= p1_r_in;
1033 tmp1_r += tmp0_r;
1034 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
1035 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
1036 q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
1037 ST8x1_UB(q6, src);
1038
1039 return 0;
1040 }
1041 }
1042
aom_lpf_vertical_16_msa(uint8_t * src,int32_t pitch,const uint8_t * b_limit_ptr,const uint8_t * limit_ptr,const uint8_t * thresh_ptr)1043 void aom_lpf_vertical_16_msa(uint8_t *src, int32_t pitch,
1044 const uint8_t *b_limit_ptr,
1045 const uint8_t *limit_ptr,
1046 const uint8_t *thresh_ptr) {
1047 uint8_t early_exit = 0;
1048 DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
1049 uint8_t *filter48 = &transposed_input[16 * 16];
1050
1051 transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);
1052
1053 early_exit =
1054 aom_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src,
1055 pitch, b_limit_ptr, limit_ptr, thresh_ptr);
1056
1057 if (0 == early_exit) {
1058 early_exit = aom_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
1059 &filter48[0]);
1060
1061 if (0 == early_exit) {
1062 transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
1063 }
1064 }
1065 }
1066
aom_vt_lpf_t4_and_t8_16w(uint8_t * src,uint8_t * filter48,uint8_t * src_org,int32_t pitch,const uint8_t * b_limit_ptr,const uint8_t * limit_ptr,const uint8_t * thresh_ptr)1067 int32_t aom_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
1068 uint8_t *src_org, int32_t pitch,
1069 const uint8_t *b_limit_ptr,
1070 const uint8_t *limit_ptr,
1071 const uint8_t *thresh_ptr) {
1072 v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1073 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
1074 v16u8 flat, mask, hev, thresh, b_limit, limit;
1075 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1076 v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
1077 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
1078 v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
1079 v16i8 zero = { 0 };
1080 v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
1081
1082 /* load vector elements */
1083 LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
1084
1085 thresh = (v16u8)__msa_fill_b(*thresh_ptr);
1086 b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
1087 limit = (v16u8)__msa_fill_b(*limit_ptr);
1088
1089 /* mask and hev */
1090 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
1091 mask, flat);
1092 /* flat4 */
1093 AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1094 /* filter4 */
1095 AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
1096
1097 if (__msa_test_bz_v(flat)) {
1098 ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1099 ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1100 ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1101 ILVRL_H2_SH(vec1, vec0, vec4, vec5);
1102
1103 src_org -= 2;
1104 ST4x8_UB(vec2, vec3, src_org, pitch);
1105 src_org += 8 * pitch;
1106 ST4x8_UB(vec4, vec5, src_org, pitch);
1107
1108 return 1;
1109 } else {
1110 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
1111 q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
1112 AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1113 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1114 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
1115 ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
1116 AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1117 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1118
1119 /* convert 16 bit output data into 8 bit */
1120 PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
1121 p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
1122 p0_filt8_r, q0_filt8_r);
1123 PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
1124 q2_filt8_r);
1125
1126 /* store pixel values */
1127 p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
1128 p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
1129 p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
1130 q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
1131 q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
1132 q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
1133
1134 ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
1135 filter48 += (4 * 16);
1136 ST_UB2(q1_out, q2_out, filter48, 16);
1137 filter48 += (2 * 16);
1138 ST_UB(flat, filter48);
1139
1140 return 0;
1141 }
1142 }
1143
aom_vt_lpf_t16_16w(uint8_t * src,uint8_t * src_org,int32_t pitch,uint8_t * filter48)1144 int32_t aom_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch,
1145 uint8_t *filter48) {
1146 v16u8 flat, flat2, filter8;
1147 v16i8 zero = { 0 };
1148 v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1149 v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
1150 v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
1151 v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
1152 v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
1153 v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
1154 v8i16 l_out, r_out;
1155
1156 flat = LD_UB(filter48 + 6 * 16);
1157
1158 LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
1159 LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
1160
1161 AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
1162
1163 if (__msa_test_bz_v(flat2)) {
1164 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1165
1166 LD_UB4(filter48, 16, p2, p1, p0, q0);
1167 LD_UB2(filter48 + 4 * 16, 16, q1, q2);
1168
1169 ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1170 ILVRL_H2_SH(vec1, vec0, vec3, vec4);
1171 ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
1172 ILVRL_H2_SH(vec1, vec0, vec6, vec7);
1173 ILVRL_B2_SH(q2, q1, vec2, vec5);
1174
1175 src_org -= 3;
1176 ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
1177 ST2x4_UB(vec2, 0, (src_org + 4), pitch);
1178 src_org += (4 * pitch);
1179 ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
1180 ST2x4_UB(vec2, 4, (src_org + 4), pitch);
1181 src_org += (4 * pitch);
1182 ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch);
1183 ST2x4_UB(vec5, 0, (src_org + 4), pitch);
1184 src_org += (4 * pitch);
1185 ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch);
1186 ST2x4_UB(vec5, 4, (src_org + 4), pitch);
1187
1188 return 1;
1189 } else {
1190 src -= 7 * 16;
1191
1192 ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
1193 p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
1194 p2_r_in, p1_r_in, p0_r_in);
1195 q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
1196
1197 tmp0_r = p7_r_in << 3;
1198 tmp0_r -= p7_r_in;
1199 tmp0_r += p6_r_in;
1200 tmp0_r += q0_r_in;
1201 tmp1_r = p6_r_in + p5_r_in;
1202 tmp1_r += p4_r_in;
1203 tmp1_r += p3_r_in;
1204 tmp1_r += p2_r_in;
1205 tmp1_r += p1_r_in;
1206 tmp1_r += p0_r_in;
1207 tmp1_r += tmp0_r;
1208 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
1209
1210 ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
1211 p5_l_in, p4_l_in);
1212 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
1213 p1_l_in, p0_l_in);
1214 q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);
1215
1216 tmp0_l = p7_l_in << 3;
1217 tmp0_l -= p7_l_in;
1218 tmp0_l += p6_l_in;
1219 tmp0_l += q0_l_in;
1220 tmp1_l = p6_l_in + p5_l_in;
1221 tmp1_l += p4_l_in;
1222 tmp1_l += p3_l_in;
1223 tmp1_l += p2_l_in;
1224 tmp1_l += p1_l_in;
1225 tmp1_l += p0_l_in;
1226 tmp1_l += tmp0_l;
1227 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
1228
1229 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
1230 p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
1231 ST_UB(p6, src);
1232 src += 16;
1233
1234 /* p5 */
1235 q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
1236 tmp0_r = p5_r_in - p6_r_in;
1237 tmp0_r += q1_r_in;
1238 tmp0_r -= p7_r_in;
1239 tmp1_r += tmp0_r;
1240 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
1241 q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
1242 tmp0_l = p5_l_in - p6_l_in;
1243 tmp0_l += q1_l_in;
1244 tmp0_l -= p7_l_in;
1245 tmp1_l += tmp0_l;
1246 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
1247 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
1248 p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
1249 ST_UB(p5, src);
1250 src += 16;
1251
1252 /* p4 */
1253 q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
1254 tmp0_r = p4_r_in - p5_r_in;
1255 tmp0_r += q2_r_in;
1256 tmp0_r -= p7_r_in;
1257 tmp1_r += tmp0_r;
1258 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
1259 q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
1260 tmp0_l = p4_l_in - p5_l_in;
1261 tmp0_l += q2_l_in;
1262 tmp0_l -= p7_l_in;
1263 tmp1_l += tmp0_l;
1264 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
1265 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
1266 p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
1267 ST_UB(p4, src);
1268 src += 16;
1269
1270 /* p3 */
1271 q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
1272 tmp0_r = p3_r_in - p4_r_in;
1273 tmp0_r += q3_r_in;
1274 tmp0_r -= p7_r_in;
1275 tmp1_r += tmp0_r;
1276 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
1277 q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
1278 tmp0_l = p3_l_in - p4_l_in;
1279 tmp0_l += q3_l_in;
1280 tmp0_l -= p7_l_in;
1281 tmp1_l += tmp0_l;
1282 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
1283 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
1284 p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
1285 ST_UB(p3, src);
1286 src += 16;
1287
1288 /* p2 */
1289 q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
1290 filter8 = LD_UB(filter48);
1291 tmp0_r = p2_r_in - p3_r_in;
1292 tmp0_r += q4_r_in;
1293 tmp0_r -= p7_r_in;
1294 tmp1_r += tmp0_r;
1295 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
1296 q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
1297 tmp0_l = p2_l_in - p3_l_in;
1298 tmp0_l += q4_l_in;
1299 tmp0_l -= p7_l_in;
1300 tmp1_l += tmp0_l;
1301 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
1302 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
1303 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
1304 ST_UB(filter8, src);
1305 src += 16;
1306
1307 /* p1 */
1308 q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
1309 filter8 = LD_UB(filter48 + 16);
1310 tmp0_r = p1_r_in - p2_r_in;
1311 tmp0_r += q5_r_in;
1312 tmp0_r -= p7_r_in;
1313 tmp1_r += tmp0_r;
1314 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
1315 q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
1316 tmp0_l = p1_l_in - p2_l_in;
1317 tmp0_l += q5_l_in;
1318 tmp0_l -= p7_l_in;
1319 tmp1_l += tmp0_l;
1320 l_out = __msa_srari_h((v8i16)(tmp1_l), 4);
1321 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
1322 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
1323 ST_UB(filter8, src);
1324 src += 16;
1325
1326 /* p0 */
1327 q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
1328 filter8 = LD_UB(filter48 + 32);
1329 tmp0_r = p0_r_in - p1_r_in;
1330 tmp0_r += q6_r_in;
1331 tmp0_r -= p7_r_in;
1332 tmp1_r += tmp0_r;
1333 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
1334 q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
1335 tmp0_l = p0_l_in - p1_l_in;
1336 tmp0_l += q6_l_in;
1337 tmp0_l -= p7_l_in;
1338 tmp1_l += tmp0_l;
1339 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
1340 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
1341 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
1342 ST_UB(filter8, src);
1343 src += 16;
1344
1345 /* q0 */
1346 q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
1347 filter8 = LD_UB(filter48 + 48);
1348 tmp0_r = q7_r_in - p0_r_in;
1349 tmp0_r += q0_r_in;
1350 tmp0_r -= p7_r_in;
1351 tmp1_r += tmp0_r;
1352 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
1353 q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
1354 tmp0_l = q7_l_in - p0_l_in;
1355 tmp0_l += q0_l_in;
1356 tmp0_l -= p7_l_in;
1357 tmp1_l += tmp0_l;
1358 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
1359 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
1360 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
1361 ST_UB(filter8, src);
1362 src += 16;
1363
1364 /* q1 */
1365 filter8 = LD_UB(filter48 + 64);
1366 tmp0_r = q7_r_in - q0_r_in;
1367 tmp0_r += q1_r_in;
1368 tmp0_r -= p6_r_in;
1369 tmp1_r += tmp0_r;
1370 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
1371 tmp0_l = q7_l_in - q0_l_in;
1372 tmp0_l += q1_l_in;
1373 tmp0_l -= p6_l_in;
1374 tmp1_l += tmp0_l;
1375 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
1376 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
1377 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
1378 ST_UB(filter8, src);
1379 src += 16;
1380
1381 /* q2 */
1382 filter8 = LD_UB(filter48 + 80);
1383 tmp0_r = q7_r_in - q1_r_in;
1384 tmp0_r += q2_r_in;
1385 tmp0_r -= p5_r_in;
1386 tmp1_r += tmp0_r;
1387 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
1388 tmp0_l = q7_l_in - q1_l_in;
1389 tmp0_l += q2_l_in;
1390 tmp0_l -= p5_l_in;
1391 tmp1_l += tmp0_l;
1392 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
1393 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
1394 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
1395 ST_UB(filter8, src);
1396 src += 16;
1397
1398 /* q3 */
1399 tmp0_r = q7_r_in - q2_r_in;
1400 tmp0_r += q3_r_in;
1401 tmp0_r -= p4_r_in;
1402 tmp1_r += tmp0_r;
1403 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
1404 tmp0_l = q7_l_in - q2_l_in;
1405 tmp0_l += q3_l_in;
1406 tmp0_l -= p4_l_in;
1407 tmp1_l += tmp0_l;
1408 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
1409 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
1410 q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
1411 ST_UB(q3, src);
1412 src += 16;
1413
1414 /* q4 */
1415 tmp0_r = q7_r_in - q3_r_in;
1416 tmp0_r += q4_r_in;
1417 tmp0_r -= p3_r_in;
1418 tmp1_r += tmp0_r;
1419 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
1420 tmp0_l = q7_l_in - q3_l_in;
1421 tmp0_l += q4_l_in;
1422 tmp0_l -= p3_l_in;
1423 tmp1_l += tmp0_l;
1424 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
1425 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
1426 q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
1427 ST_UB(q4, src);
1428 src += 16;
1429
1430 /* q5 */
1431 tmp0_r = q7_r_in - q4_r_in;
1432 tmp0_r += q5_r_in;
1433 tmp0_r -= p2_r_in;
1434 tmp1_r += tmp0_r;
1435 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
1436 tmp0_l = q7_l_in - q4_l_in;
1437 tmp0_l += q5_l_in;
1438 tmp0_l -= p2_l_in;
1439 tmp1_l += tmp0_l;
1440 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
1441 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
1442 q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
1443 ST_UB(q5, src);
1444 src += 16;
1445
1446 /* q6 */
1447 tmp0_r = q7_r_in - q5_r_in;
1448 tmp0_r += q6_r_in;
1449 tmp0_r -= p1_r_in;
1450 tmp1_r += tmp0_r;
1451 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
1452 tmp0_l = q7_l_in - q5_l_in;
1453 tmp0_l += q6_l_in;
1454 tmp0_l -= p1_l_in;
1455 tmp1_l += tmp0_l;
1456 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
1457 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
1458 q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
1459 ST_UB(q6, src);
1460
1461 return 0;
1462 }
1463 }
1464
aom_lpf_vertical_16_dual_msa(uint8_t * src,int32_t pitch,const uint8_t * b_limit_ptr,const uint8_t * limit_ptr,const uint8_t * thresh_ptr)1465 void aom_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch,
1466 const uint8_t *b_limit_ptr,
1467 const uint8_t *limit_ptr,
1468 const uint8_t *thresh_ptr) {
1469 uint8_t early_exit = 0;
1470 DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
1471 uint8_t *filter48 = &transposed_input[16 * 16];
1472
1473 transpose_16x16((src - 8), pitch, &transposed_input[0], 16);
1474
1475 early_exit =
1476 aom_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src,
1477 pitch, b_limit_ptr, limit_ptr, thresh_ptr);
1478
1479 if (0 == early_exit) {
1480 early_exit = aom_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch,
1481 &filter48[0]);
1482
1483 if (0 == early_exit) {
1484 transpose_16x16(transposed_input, 16, (src - 8), pitch);
1485 }
1486 }
1487 }
1488