1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "./vpx_dsp_rtcd.h"
12 #include "vpx_dsp/mips/loopfilter_msa.h"
13 #include "vpx_ports/mem.h"
14
/* Horizontal loop filter, 16 pixels wide: filter4 + filter8 stage.
 *
 * Applies the narrow (filter4) filter across a horizontal edge at 'src' and,
 * for columns whose 'flat' mask is set, computes the filter8 result as well.
 * The blended filter8 outputs (p2..q2) and the flat mask are spilled to the
 * scratch buffer 'filter48' (seven 16-byte vectors: p2,p1,p0,q0 at offsets
 * 0..48, q1,q2 at 64/80, flat at 96) for a subsequent filter16 pass.
 *
 * Returns 1 if no column is flat (filter4 result already stored to 'src',
 * caller may skip the wide filter), 0 otherwise.
 */
static int32_t hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch,
                                    uint8_t *filter48,
                                    const uint8_t *b_limit_ptr,
                                    const uint8_t *limit_ptr,
                                    const uint8_t *thresh_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  /* 16-bit widened copies: _r = low (right) 8 bytes, _l = high (left) 8. */
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16u8 zero = { 0 };

  /* load vector elements */
  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  /* broadcast the scalar thresholds to all 16 lanes */
  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  if (__msa_test_bz_v(flat)) {
    /* no flat columns: the narrow-filter result is final */
    ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);

    return 1;
  } else {
    /* widen to 16 bit and run filter8 on low and high halves separately */
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values: per-lane select filter8 result where flat is set,
     * otherwise keep the filter4 result (or the original p2/q2) */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    /* spill results and the flat mask for the filter16 pass */
    ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
    filter48 += (4 * 16);
    ST_UB2(q1_out, q2_out, filter48, 16);
    filter48 += (2 * 16);
    ST_UB(flat, filter48);

    return 0;
  }
}
81
/* Horizontal loop filter, 16 pixels wide: filter16 stage.
 *
 * Consumes the filter8 results and flat mask previously spilled by
 * hz_lpf_t4_and_t8_16w() into 'filter48' (p2..q0 at offsets 0..48,
 * q1/q2 at 64/80, flat at 96).  Computes flat2 from the outer pixels;
 * where flat2 is clear it simply stores the filter8 results, otherwise
 * it applies the 15-tap wide filter to p6..q6 using a running 16-bit
 * sum (tmp1) updated incrementally per output row (sliding window:
 * each step adds the incoming tap and subtracts the outgoing one).
 */
static void hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
  v16u8 flat, flat2, filter8;
  v16i8 zero = { 0 };
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  /* 16-bit widened pixels: _r_in = low 8 bytes, _l_in = high 8 bytes */
  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
  v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
  v8i16 l_out, r_out;

  /* flat mask saved by the filter4/filter8 pass */
  flat = LD_UB(filter48 + 96);

  LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
  LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

  if (__msa_test_bz_v(flat2)) {
    /* no column needs the wide filter: store the saved filter8 results */
    LD_UB4(filter48, 16, p2, p1, p0, q0);
    LD_UB2(filter48 + 4 * 16, 16, q1, q2);

    src -= 3 * pitch;
    ST_UB4(p2, p1, p0, q0, src, pitch);
    src += (4 * pitch);
    ST_UB2(q1, q2, src, pitch);
  } else {
    src -= 7 * pitch;

    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
               p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
               p2_r_in, p1_r_in, p0_r_in);

    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);

    /* p6: initial window sum = 7*p7 + 2*p6 + p5+p4+p3+p2+p1+p0 + q0 */
    tmp0_r = p7_r_in << 3;
    tmp0_r -= p7_r_in;
    tmp0_r += p6_r_in;
    tmp0_r += q0_r_in;
    tmp1_r = p6_r_in + p5_r_in;
    tmp1_r += p4_r_in;
    tmp1_r += p3_r_in;
    tmp1_r += p2_r_in;
    tmp1_r += p1_r_in;
    tmp1_r += p0_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
               p5_l_in, p4_l_in);
    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
               p1_l_in, p0_l_in);
    q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);

    tmp0_l = p7_l_in << 3;
    tmp0_l -= p7_l_in;
    tmp0_l += p6_l_in;
    tmp0_l += q0_l_in;
    tmp1_l = p6_l_in + p5_l_in;
    tmp1_l += p4_l_in;
    tmp1_l += p3_l_in;
    tmp1_l += p2_l_in;
    tmp1_l += p1_l_in;
    tmp1_l += p0_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
    ST_UB(p6, src);
    src += pitch;

    /* p5 */
    q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
    tmp0_r = p5_r_in - p6_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
    tmp0_l = p5_l_in - p6_l_in;
    tmp0_l += q1_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
    ST_UB(p5, src);
    src += pitch;

    /* p4 */
    q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
    tmp0_r = p4_r_in - p5_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = (v8i16)__msa_srari_h((v8i16)tmp1_r, 4);

    q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
    tmp0_l = p4_l_in - p5_l_in;
    tmp0_l += q2_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
    ST_UB(p4, src);
    src += pitch;

    /* p3 */
    q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
    tmp0_r = p3_r_in - p4_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
    tmp0_l = p3_l_in - p4_l_in;
    tmp0_l += q3_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
    ST_UB(p3, src);
    src += pitch;

    /* p2: from here the non-wide fallback comes from the saved filter8
     * results in filter48 rather than the raw pixels */
    q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
    filter8 = LD_UB(filter48);
    tmp0_r = p2_r_in - p3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
    tmp0_l = p2_l_in - p3_l_in;
    tmp0_l += q4_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* p1 */
    q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
    filter8 = LD_UB(filter48 + 16);
    tmp0_r = p1_r_in - p2_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
    tmp0_l = p1_l_in - p2_l_in;
    tmp0_l += q5_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* p0 */
    q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
    filter8 = LD_UB(filter48 + 32);
    tmp0_r = p0_r_in - p1_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
    tmp0_l = p0_l_in - p1_l_in;
    tmp0_l += q6_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q0 */
    q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
    filter8 = LD_UB(filter48 + 48);
    tmp0_r = q7_r_in - p0_r_in;
    tmp0_r += q0_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
    tmp0_l = q7_l_in - p0_l_in;
    tmp0_l += q0_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q1 */
    filter8 = LD_UB(filter48 + 64);
    tmp0_r = q7_r_in - q0_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p6_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q0_l_in;
    tmp0_l += q1_l_in;
    tmp0_l -= p6_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q2 */
    filter8 = LD_UB(filter48 + 80);
    tmp0_r = q7_r_in - q1_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p5_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q1_l_in;
    tmp0_l += q2_l_in;
    tmp0_l -= p5_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q3: from here the fallback is the raw pixel again */
    tmp0_r = q7_r_in - q2_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p4_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q2_l_in;
    tmp0_l += q3_l_in;
    tmp0_l -= p4_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
    ST_UB(q3, src);
    src += pitch;

    /* q4 */
    tmp0_r = q7_r_in - q3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p3_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q3_l_in;
    tmp0_l += q4_l_in;
    tmp0_l -= p3_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
    ST_UB(q4, src);
    src += pitch;

    /* q5 */
    tmp0_r = q7_r_in - q4_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p2_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q4_l_in;
    tmp0_l += q5_l_in;
    tmp0_l -= p2_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
    ST_UB(q5, src);
    src += pitch;

    /* q6 */
    tmp0_r = q7_r_in - q5_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p1_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q5_l_in;
    tmp0_l += q6_l_in;
    tmp0_l -= p1_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
    ST_UB(q6, src);
  }
}
407
/* 16-pixel-wide horizontal edge filter: run the filter4/filter8 pass and,
 * unless it reported an early exit, finish with the filter16 pass. */
static void mb_lpf_horizontal_edge_dual(uint8_t *src, int32_t pitch,
                                        const uint8_t *b_limit_ptr,
                                        const uint8_t *limit_ptr,
                                        const uint8_t *thresh_ptr,
                                        int32_t count) {
  /* Scratch area: filter8 outputs plus the flat mask (seven 16-byte rows). */
  DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]);

  (void)count; /* unused; kept for interface compatibility */

  if (hz_lpf_t4_and_t8_16w(src, pitch, filter48, b_limit_ptr, limit_ptr,
                           thresh_ptr) == 0) {
    hz_lpf_t16_16w(src, pitch, filter48);
  }
}
425
/* Horizontal 16-tap edge filter dispatcher.
 *
 * count == 1 filters an 8-pixel-wide edge in-line (only the low 8 lanes of
 * each vector are used; results are stored with 8-byte SD stores); any other
 * count is forwarded to the 16-wide dual implementation.  The 8-wide path
 * mirrors the three-tier structure: filter4, then filter8 where 'flat' is
 * set, then the 15-tap wide filter where 'flat2' is set, producing two
 * output rows per running-sum step.
 */
static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch,
                                   const uint8_t *b_limit_ptr,
                                   const uint8_t *limit_ptr,
                                   const uint8_t *thresh_ptr, int32_t count) {
  if (1 == count) {
    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
    uint64_t dword0, dword1;
    v16u8 flat2, mask, hev, flat, thresh, b_limit, limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v16u8 p0_filter16, p1_filter16;
    v8i16 p2_filter8, p1_filter8, p0_filter8;
    v8i16 q0_filter8, q1_filter8, q2_filter8;
    /* 16-bit widened pixels (low half only) for the arithmetic stages */
    v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
    v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
    v16i8 zero = { 0 };
    v8u16 tmp0, tmp1, tmp2;

    /* load vector elements */
    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    /* broadcast scalar thresholds to all lanes */
    thresh = (v16u8)__msa_fill_b(*thresh_ptr);
    b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
    limit = (v16u8)__msa_fill_b(*limit_ptr);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
                 mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    /* zero the upper 8 lanes of flat: only 8 pixels are filtered here */
    flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

    if (__msa_test_bz_v(flat)) {
      /* no flat columns: store the 8-byte filter4 results and finish */
      p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
      p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
      q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
      q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
      SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
    } else {
      /* convert 8 bit input data into 16 bit */
      ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
                 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
                 q3_r);
      VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
                  p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);

      /* convert 16 bit output data into 8 bit */
      PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero,
                  q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8);
      PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);

      /* store pixel values: filter8 result where flat, else filter4/original */
      p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
      p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
      p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
      q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
      q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
      q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);

      /* load 16 vector elements */
      LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
      LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);

      VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

      if (__msa_test_bz_v(flat2)) {
        /* wide filter not needed: store the filter8 blend */
        p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
        p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
        p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
        q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
        q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
        q2_d = __msa_copy_u_d((v2i64)q2_out, 0);

        SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
        SD(q1_d, src + pitch);
        SD(q2_d, src + 2 * pitch);
      } else {
        /* LSB(right) 8 pixel operation */
        ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4, zero, q5,
                   zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r, q4_r, q5_r, q6_r,
                   q7_r);

        /* seed: 7*p7 + p6 + q0 (the fixed part of the first window) */
        tmp0 = p7_r << 3;
        tmp0 -= p7_r;
        tmp0 += p6_r;
        tmp0 += q0_r;

        src -= 7 * pitch;

        /* calculation of p6 and p5 */
        tmp1 = p6_r + p5_r + p4_r + p3_r;
        tmp1 += (p2_r + p1_r + p0_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        /* slide the window: +q1, -p7, swap p6 for p5 weighting */
        tmp0 = p5_r - p6_r + q1_r - p7_r;
        tmp1 += tmp0;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of p4 and p3 */
        tmp0 = p4_r - p5_r + q2_r - p7_r;
        tmp2 = p3_r - p4_r + q3_r - p7_r;
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of p2 and p1 */
        tmp0 = p2_r - p3_r + q4_r - p7_r;
        tmp2 = p1_r - p2_r + q5_r - p7_r;
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        /* fallback here is the filter8 blend, not the raw pixel */
        p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of p0 and q0 */
        tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
        tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of q1 and q2 */
        tmp0 = q7_r - q0_r + q1_r - p6_r;
        tmp2 = q7_r - q1_r + q2_r - p5_r;
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of q3 and q4 */
        tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
        tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of q5 and q6 */
        tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
        tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
      }
    }
  } else {
    mb_lpf_horizontal_edge_dual(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr,
                                count);
  }
}
648
/* Public entry point: 16-tap horizontal loop filter over an 8-pixel edge
 * (count = 1 selects the single/8-wide path in mb_lpf_horizontal_edge). */
void vpx_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch,
                               const uint8_t *b_limit_ptr,
                               const uint8_t *limit_ptr,
                               const uint8_t *thresh_ptr) {
  mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 1);
}
655
/* Public entry point: 16-tap horizontal loop filter over a 16-pixel edge
 * (count = 2 selects the dual/16-wide path in mb_lpf_horizontal_edge). */
void vpx_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
                                    const uint8_t *b_limit_ptr,
                                    const uint8_t *limit_ptr,
                                    const uint8_t *thresh_ptr) {
  mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 2);
}
662
/* Transpose a 16x8 pixel region (16 columns, 8 rows at 'input') into a
 * 8x16 layout at 'output' (16 rows of 8-plus-8 vectors).  Used by the
 * vertical loop filters to turn columns into rows so the horizontal
 * filter kernels can be reused.  The upper half is transposed via the
 * TRANSPOSE8x8 macro, the lower half via explicit interleaves. */
static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
                                   uint8_t *output, int32_t out_pitch) {
  v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
  v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;

  LD_UB8(input, in_pitch, p7_org, p6_org, p5_org, p4_org, p3_org, p2_org,
         p1_org, p0_org);
  /* 8x8 transpose */
  TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
                     p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
  /* 8x8 transpose */
  ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
             tmp0, tmp1, tmp2, tmp3);
  ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
  ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
  ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
  ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
  /* odd rows are the upper halves of the even-row vectors */
  SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8);

  ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
  output += (8 * out_pitch);
  ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
}
687
/* Transpose an 8x16 region (16 rows at 'input') back into 16x8 layout at
 * 'output' — the inverse of transpose_16x8_to_8x16, used to write filtered
 * rows back as columns. */
static void transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
                                   uint8_t *output, int32_t out_pitch) {
  v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;

  LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0);
  LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7);
  TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
                      q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
  ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
}
699
/* Transpose a full 16x16 pixel block from 'input' to 'output'.  The top
 * output half (p7..p0) comes from the TRANSPOSE16x8 macro; the bottom half
 * (q0..q7) is built from the odd doublewords of the source rows via a
 * hand-written byte/halfword/word interleave network. */
static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output,
                            int32_t out_pitch) {
  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
  v4i32 tmp2, tmp3;

  LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
  input += (8 * in_pitch);
  LD_UB8(input, in_pitch, row8, row9, row10, row11, row12, row13, row14, row15);

  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
                      row9, row10, row11, row12, row13, row14, row15, p7, p6,
                      p5, p4, p3, p2, p1, p0);

  /* transpose 16x8 matrix into 8x16 */
  /* total 8 intermediate register and 32 instructions */
  q7 = (v16u8)__msa_ilvod_d((v2i64)row8, (v2i64)row0);
  q6 = (v16u8)__msa_ilvod_d((v2i64)row9, (v2i64)row1);
  q5 = (v16u8)__msa_ilvod_d((v2i64)row10, (v2i64)row2);
  q4 = (v16u8)__msa_ilvod_d((v2i64)row11, (v2i64)row3);
  q3 = (v16u8)__msa_ilvod_d((v2i64)row12, (v2i64)row4);
  q2 = (v16u8)__msa_ilvod_d((v2i64)row13, (v2i64)row5);
  q1 = (v16u8)__msa_ilvod_d((v2i64)row14, (v2i64)row6);
  q0 = (v16u8)__msa_ilvod_d((v2i64)row15, (v2i64)row7);

  ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1);
  tmp4 = (v8i16)__msa_ilvod_b((v16i8)q6, (v16i8)q7);
  tmp5 = (v8i16)__msa_ilvod_b((v16i8)q4, (v16i8)q5);

  /* NOTE: q5/q7 are reused here as scratch before being rewritten below */
  ILVEV_B2_UB(q3, q2, q1, q0, q5, q7);
  tmp6 = (v8i16)__msa_ilvod_b((v16i8)q2, (v16i8)q3);
  tmp7 = (v8i16)__msa_ilvod_b((v16i8)q0, (v16i8)q1);

  ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3);
  q0 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q4 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  tmp2 = (v4i32)__msa_ilvod_h(tmp1, tmp0);
  tmp3 = (v4i32)__msa_ilvod_h((v8i16)q7, (v8i16)q5);
  q2 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q6 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3);
  q1 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q5 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  tmp2 = (v4i32)__msa_ilvod_h(tmp5, tmp4);
  tmp3 = (v4i32)__msa_ilvod_h(tmp7, tmp6);
  q3 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q7 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
  output += (8 * out_pitch);
  ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
}
757
/* Vertical loop filter, 8 pixels: filter4 + filter8 stage.
 *
 * 'src' points into a pre-transposed 16-byte-pitch working buffer (pixels
 * are loaded with a fixed pitch of 16); 'src_org'/'pitch_org' address the
 * original, untransposed image for the early-exit store path, which writes
 * the 4 filtered columns back with narrow 4x8 stores.  Like the horizontal
 * variant, blended filter8 outputs (p2..q2) and the flat mask are spilled
 * to 'filter48' (p2..q0 at offsets 0..48, q1/q2 at 64/80, flat at 96).
 *
 * Returns 1 on early exit (no flat columns, filter4 result already stored
 * to 'src_org'), 0 otherwise.
 */
static int32_t vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
                                   uint8_t *src_org, int32_t pitch_org,
                                   const uint8_t *b_limit_ptr,
                                   const uint8_t *limit_ptr,
                                   const uint8_t *thresh_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v16i8 zero = { 0 };
  v8i16 vec0, vec1, vec2, vec3;

  /* load vector elements (transposed buffer, fixed pitch of 16) */
  LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  /* only the low 8 lanes participate; zero the upper half of flat */
  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

  if (__msa_test_bz_v(flat)) {
    /* re-interleave the 4 filtered columns and store to the original image */
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org);
    return 1;
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    /* convert 16 bit output data into 8 bit */
    p2_r = (v8u16)__msa_pckev_b((v16i8)p2_filt8_r, (v16i8)p2_filt8_r);
    p1_r = (v8u16)__msa_pckev_b((v16i8)p1_filt8_r, (v16i8)p1_filt8_r);
    p0_r = (v8u16)__msa_pckev_b((v16i8)p0_filt8_r, (v16i8)p0_filt8_r);
    q0_r = (v8u16)__msa_pckev_b((v16i8)q0_filt8_r, (v16i8)q0_filt8_r);
    q1_r = (v8u16)__msa_pckev_b((v16i8)q1_filt8_r, (v16i8)q1_filt8_r);
    q2_r = (v8u16)__msa_pckev_b((v16i8)q2_filt8_r, (v16i8)q2_filt8_r);

    /* store pixel values: filter8 result where flat, else filter4/original */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_r, flat);

    /* spill results and the flat mask for the filter16 pass */
    ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
    filter48 += (4 * 16);
    ST_UB2(q1_out, q2_out, filter48, 16);
    filter48 += (2 * 16);
    ST_UB(flat, filter48);

    return 0;
  }
}
824
vt_lpf_t16_8w(uint8_t * src,uint8_t * src_org,int32_t pitch,uint8_t * filter48)825 static int32_t vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch,
826 uint8_t *filter48) {
/* Second pass of the 8-wide vertical 16-tap loop filter.
 *
 * src       - points into a transposed working buffer with a fixed pitch
 *             of 16 bytes (samples p7..p0 precede it, q0..q7 follow).
 * src_org   - original (untransposed) frame position; used only on the
 *             early-exit path together with `pitch`.
 * filter48  - six 16-byte rows of filter8 output (p2,p1,p0,q0,q1,q2)
 *             followed by the `flat` mask, as stored by the preceding
 *             t4/t8 pass.
 *
 * Returns 1 if the wide filter was skipped entirely (filter8 results
 * transposed and stored directly to src_org); returns 0 after running
 * the wide filter in the transposed buffer (caller transposes back).
 */
827 v16i8 zero = { 0 };
828 v16u8 filter8, flat, flat2;
829 v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
830 v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
831 v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
832 v8u16 tmp0_r, tmp1_r;
833 v8i16 r_out;
834
835 flat = LD_UB(filter48 + 6 * 16);
836
837 LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
838 LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
839
/* flat2: wide-flatness mask from the outer samples p7..p4 / q4..q7,
 * restricted to lanes already selected by `flat`. */
840 VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
841
842 if (__msa_test_bz_v(flat2)) {
843 v8i16 vec0, vec1, vec2, vec3, vec4;
844
/* No lane needs the wide filter: reload the filter8 results and
 * transpose the six modified columns (p2..q2) straight into the
 * original image, 8 rows starting 3 pixels left of the edge. */
845 LD_UB4(filter48, 16, p2, p1, p0, q0);
846 LD_UB2(filter48 + 4 * 16, 16, q1, q2);
847
848 ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
849 ILVRL_H2_SH(vec1, vec0, vec3, vec4);
850 vec2 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);
851
852 src_org -= 3;
853 ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
854 ST2x4_UB(vec2, 0, (src_org + 4), pitch);
855 src_org += (4 * pitch);
856 ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
857 ST2x4_UB(vec2, 4, (src_org + 4), pitch);
858
859 return 1;
860 } else {
861 src -= 7 * 16;
862
/* Only the low ("right") 8 byte lanes carry pixels in this 8-wide
 * variant, so just the _r halves are widened to 16 bits. */
863 ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
864 p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
865 p2_r_in, p1_r_in, p0_r_in);
866 q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
867
/* p6 output: 15-tap rounded average.  After these lines tmp1_r holds
 * the running sum 7*p7 + 2*p6 + p5+p4+p3+p2+p1+p0 + q0; each later tap
 * adds the next q sample and drops the oldest p sample, then rounds
 * with srari (>> 4 with rounding). */
868 tmp0_r = p7_r_in << 3;
869 tmp0_r -= p7_r_in;
870 tmp0_r += p6_r_in;
871 tmp0_r += q0_r_in;
872 tmp1_r = p6_r_in + p5_r_in;
873 tmp1_r += p4_r_in;
874 tmp1_r += p3_r_in;
875 tmp1_r += p2_r_in;
876 tmp1_r += p1_r_in;
877 tmp1_r += p0_r_in;
878 tmp1_r += tmp0_r;
879
880 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
881 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
/* bmnz keeps the original pixel in lanes where flat2 is clear. */
882 p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
883 ST8x1_UB(p6, src);
884 src += 16;
885
886 /* p5 */
887 q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
888 tmp0_r = p5_r_in - p6_r_in;
889 tmp0_r += q1_r_in;
890 tmp0_r -= p7_r_in;
891 tmp1_r += tmp0_r;
892 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
893 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
894 p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
895 ST8x1_UB(p5, src);
896 src += 16;
897
898 /* p4 */
899 q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
900 tmp0_r = p4_r_in - p5_r_in;
901 tmp0_r += q2_r_in;
902 tmp0_r -= p7_r_in;
903 tmp1_r += tmp0_r;
904 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
905 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
906 p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
907 ST8x1_UB(p4, src);
908 src += 16;
909
910 /* p3 */
911 q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
912 tmp0_r = p3_r_in - p4_r_in;
913 tmp0_r += q3_r_in;
914 tmp0_r -= p7_r_in;
915 tmp1_r += tmp0_r;
916 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
917 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
918 p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
919 ST8x1_UB(p3, src);
920 src += 16;
921
/* For p2..q2 the fallback pixel (when flat2 is clear) is the filter8
 * result previously stored in filter48, not the raw sample. */
922 /* p2 */
923 q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
924 filter8 = LD_UB(filter48);
925 tmp0_r = p2_r_in - p3_r_in;
926 tmp0_r += q4_r_in;
927 tmp0_r -= p7_r_in;
928 tmp1_r += tmp0_r;
929 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
930 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
931 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
932 ST8x1_UB(filter8, src);
933 src += 16;
934
935 /* p1 */
936 q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
937 filter8 = LD_UB(filter48 + 16);
938 tmp0_r = p1_r_in - p2_r_in;
939 tmp0_r += q5_r_in;
940 tmp0_r -= p7_r_in;
941 tmp1_r += tmp0_r;
942 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
943 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
944 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
945 ST8x1_UB(filter8, src);
946 src += 16;
947
948 /* p0 */
949 q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
950 filter8 = LD_UB(filter48 + 32);
951 tmp0_r = p0_r_in - p1_r_in;
952 tmp0_r += q6_r_in;
953 tmp0_r -= p7_r_in;
954 tmp1_r += tmp0_r;
955 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
956 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
957 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
958 ST8x1_UB(filter8, src);
959 src += 16;
960
961 /* q0 */
962 q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
963 filter8 = LD_UB(filter48 + 48);
964 tmp0_r = q7_r_in - p0_r_in;
965 tmp0_r += q0_r_in;
966 tmp0_r -= p7_r_in;
967 tmp1_r += tmp0_r;
968 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
969 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
970 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
971 ST8x1_UB(filter8, src);
972 src += 16;
973
974 /* q1 */
975 filter8 = LD_UB(filter48 + 64);
976 tmp0_r = q7_r_in - q0_r_in;
977 tmp0_r += q1_r_in;
978 tmp0_r -= p6_r_in;
979 tmp1_r += tmp0_r;
980 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
981 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
982 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
983 ST8x1_UB(filter8, src);
984 src += 16;
985
986 /* q2 */
987 filter8 = LD_UB(filter48 + 80);
988 tmp0_r = q7_r_in - q1_r_in;
989 tmp0_r += q2_r_in;
990 tmp0_r -= p5_r_in;
991 tmp1_r += tmp0_r;
992 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
993 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
994 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
995 ST8x1_UB(filter8, src);
996 src += 16;
997
998 /* q3 */
999 tmp0_r = q7_r_in - q2_r_in;
1000 tmp0_r += q3_r_in;
1001 tmp0_r -= p4_r_in;
1002 tmp1_r += tmp0_r;
1003 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
1004 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
1005 q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
1006 ST8x1_UB(q3, src);
1007 src += 16;
1008
1009 /* q4 */
1010 tmp0_r = q7_r_in - q3_r_in;
1011 tmp0_r += q4_r_in;
1012 tmp0_r -= p3_r_in;
1013 tmp1_r += tmp0_r;
1014 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
1015 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
1016 q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
1017 ST8x1_UB(q4, src);
1018 src += 16;
1019
1020 /* q5 */
1021 tmp0_r = q7_r_in - q4_r_in;
1022 tmp0_r += q5_r_in;
1023 tmp0_r -= p2_r_in;
1024 tmp1_r += tmp0_r;
1025 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
1026 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
1027 q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
1028 ST8x1_UB(q5, src);
1029 src += 16;
1030
1031 /* q6 */
1032 tmp0_r = q7_r_in - q5_r_in;
1033 tmp0_r += q6_r_in;
1034 tmp0_r -= p1_r_in;
1035 tmp1_r += tmp0_r;
1036 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
1037 r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
1038 q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
1039 ST8x1_UB(q6, src);
1040
1041 return 0;
1042 }
1043 }
1044
vpx_lpf_vertical_16_msa(uint8_t * src,int32_t pitch,const uint8_t * b_limit_ptr,const uint8_t * limit_ptr,const uint8_t * thresh_ptr)1045 void vpx_lpf_vertical_16_msa(uint8_t *src, int32_t pitch,
1046 const uint8_t *b_limit_ptr,
1047 const uint8_t *limit_ptr,
1048 const uint8_t *thresh_ptr) {
1049 uint8_t early_exit = 0;
1050 DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
1051 uint8_t *filter48 = &transposed_input[16 * 16];
1052
1053 transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);
1054
1055 early_exit =
1056 vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src, pitch,
1057 b_limit_ptr, limit_ptr, thresh_ptr);
1058
1059 if (0 == early_exit) {
1060 early_exit =
1061 vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch, &filter48[0]);
1062
1063 if (0 == early_exit) {
1064 transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
1065 }
1066 }
1067 }
1068
vt_lpf_t4_and_t8_16w(uint8_t * src,uint8_t * filter48,uint8_t * src_org,int32_t pitch,const uint8_t * b_limit_ptr,const uint8_t * limit_ptr,const uint8_t * thresh_ptr)1069 static int32_t vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
1070 uint8_t *src_org, int32_t pitch,
1071 const uint8_t *b_limit_ptr,
1072 const uint8_t *limit_ptr,
1073 const uint8_t *thresh_ptr) {
/* First pass of the dual (16-row) vertical loop filter.
 *
 * src       - transposed 16x16 working buffer, fixed pitch of 16 bytes.
 * filter48  - output scratch: six 16-byte rows (p2..q2 after filter8)
 *             followed by the `flat` mask, consumed by vt_lpf_t16_16w.
 * src_org   - original frame position, used only on the early-exit path.
 *
 * Applies filter4 to all lanes and filter8 where `flat` is set.
 * Returns 1 when `flat` is all zero (filter4 results transposed and
 * stored directly back to src_org); otherwise stores filter8 results
 * into filter48 and returns 0.
 */
1074 v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1075 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
1076 v16u8 flat, mask, hev, thresh, b_limit, limit;
1077 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1078 v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
1079 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
1080 v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
1081 v16i8 zero = { 0 };
1082 v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
1083
1084 /* load vector elements */
1085 LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
1086
/* Broadcast the scalar thresholds to all 16 byte lanes. */
1087 thresh = (v16u8)__msa_fill_b(*thresh_ptr);
1088 b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
1089 limit = (v16u8)__msa_fill_b(*limit_ptr);
1090
1091 /* mask and hev */
1092 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
1093 mask, flat);
1094 /* flat4 */
1095 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1096 /* filter4 */
1097 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
1098
1099 if (__msa_test_bz_v(flat)) {
/* No lane needs filter8: transpose the four filter4 output columns
 * (p1,p0,q0,q1) back into the image, 16 rows starting 2 pixels left
 * of the edge. */
1100 ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1101 ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1102 ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1103 ILVRL_H2_SH(vec1, vec0, vec4, vec5);
1104
1105 src_org -= 2;
1106 ST4x8_UB(vec2, vec3, src_org, pitch);
1107 src_org += 8 * pitch;
1108 ST4x8_UB(vec4, vec5, src_org, pitch);
1109
1110 return 1;
1111 } else {
/* Widen to 16 bits and run filter8 on both halves: _r covers byte
 * lanes 0-7, _l covers lanes 8-15. */
1112 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
1113 q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
1114 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1115 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1116 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
1117 ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
1118 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1119 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1120
1121 /* convert 16 bit output data into 8 bit */
1122 PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
1123 p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
1124 p0_filt8_r, q0_filt8_r);
1125 PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
1126 q2_filt8_r);
1127
/* Select filter8 output where flat is set, filter4 output (or the
 * untouched p2/q2 sample) elsewhere. */
1128 /* store pixel values */
1129 p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
1130 p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
1131 p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
1132 q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
1133 q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
1134 q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
1135
/* Stash p2..q2 and flat for the wide-filter second pass. */
1136 ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
1137 filter48 += (4 * 16);
1138 ST_UB2(q1_out, q2_out, filter48, 16);
1139 filter48 += (2 * 16);
1140 ST_UB(flat, filter48);
1141
1142 return 0;
1143 }
1144 }
1145
vt_lpf_t16_16w(uint8_t * src,uint8_t * src_org,int32_t pitch,uint8_t * filter48)1146 static int32_t vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch,
1147 uint8_t *filter48) {
/* Second pass of the dual (16-row) vertical wide loop filter.  Same
 * algorithm as vt_lpf_t16_8w but processes all 16 byte lanes, keeping
 * separate right (_r, lanes 0-7) and left (_l, lanes 8-15) 16-bit
 * halves for the running-sum arithmetic.
 *
 * src       - transposed 16x16 working buffer, fixed pitch of 16 bytes.
 * src_org   - original frame position, used only on the early-exit path.
 * filter48  - filter8 results (p2..q2) plus `flat`, from the first pass.
 *
 * Returns 1 if the wide filter was skipped (filter8 results transposed
 * and stored directly to src_org), 0 after running the wide filter in
 * the transposed buffer.
 */
1148 v16u8 flat, flat2, filter8;
1149 v16i8 zero = { 0 };
1150 v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1151 v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
1152 v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
1153 v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
1154 v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
1155 v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
1156 v8i16 l_out, r_out;
1157
1158 flat = LD_UB(filter48 + 6 * 16);
1159
1160 LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
1161 LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
1162
/* flat2: wide-flatness mask from p7..p4 / q4..q7, restricted to lanes
 * already selected by `flat`. */
1163 VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
1164
1165 if (__msa_test_bz_v(flat2)) {
1166 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1167
/* No lane needs the wide filter: reload the filter8 results and
 * transpose the six modified columns (p2..q2) back into the image,
 * 16 rows starting 3 pixels left of the edge. */
1168 LD_UB4(filter48, 16, p2, p1, p0, q0);
1169 LD_UB2(filter48 + 4 * 16, 16, q1, q2);
1170
1171 ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1172 ILVRL_H2_SH(vec1, vec0, vec3, vec4);
1173 ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
1174 ILVRL_H2_SH(vec1, vec0, vec6, vec7);
1175 ILVRL_B2_SH(q2, q1, vec2, vec5);
1176
1177 src_org -= 3;
1178 ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
1179 ST2x4_UB(vec2, 0, (src_org + 4), pitch);
1180 src_org += (4 * pitch);
1181 ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
1182 ST2x4_UB(vec2, 4, (src_org + 4), pitch);
1183 src_org += (4 * pitch);
1184 ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch);
1185 ST2x4_UB(vec5, 0, (src_org + 4), pitch);
1186 src_org += (4 * pitch);
1187 ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch);
1188 ST2x4_UB(vec5, 4, (src_org + 4), pitch);
1189
1190 return 1;
1191 } else {
1192 src -= 7 * 16;
1193
/* p6 output: 15-tap rounded average.  tmp1_r/tmp1_l hold the running
 * sum 7*p7 + 2*p6 + p5+p4+p3+p2+p1+p0 + q0 for each half; every later
 * tap adds the next q sample and drops the oldest p sample before
 * rounding with srari (>> 4 with rounding). */
1194 ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
1195 p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
1196 p2_r_in, p1_r_in, p0_r_in);
1197 q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
1198
1199 tmp0_r = p7_r_in << 3;
1200 tmp0_r -= p7_r_in;
1201 tmp0_r += p6_r_in;
1202 tmp0_r += q0_r_in;
1203 tmp1_r = p6_r_in + p5_r_in;
1204 tmp1_r += p4_r_in;
1205 tmp1_r += p3_r_in;
1206 tmp1_r += p2_r_in;
1207 tmp1_r += p1_r_in;
1208 tmp1_r += p0_r_in;
1209 tmp1_r += tmp0_r;
1210 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
1211
1212 ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
1213 p5_l_in, p4_l_in);
1214 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
1215 p1_l_in, p0_l_in);
1216 q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);
1217
1218 tmp0_l = p7_l_in << 3;
1219 tmp0_l -= p7_l_in;
1220 tmp0_l += p6_l_in;
1221 tmp0_l += q0_l_in;
1222 tmp1_l = p6_l_in + p5_l_in;
1223 tmp1_l += p4_l_in;
1224 tmp1_l += p3_l_in;
1225 tmp1_l += p2_l_in;
1226 tmp1_l += p1_l_in;
1227 tmp1_l += p0_l_in;
1228 tmp1_l += tmp0_l;
1229 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
1230
/* Pack both halves to bytes; bmnz keeps the original pixel in lanes
 * where flat2 is clear. */
1231 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
1232 p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
1233 ST_UB(p6, src);
1234 src += 16;
1235
1236 /* p5 */
1237 q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
1238 tmp0_r = p5_r_in - p6_r_in;
1239 tmp0_r += q1_r_in;
1240 tmp0_r -= p7_r_in;
1241 tmp1_r += tmp0_r;
1242 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
1243 q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
1244 tmp0_l = p5_l_in - p6_l_in;
1245 tmp0_l += q1_l_in;
1246 tmp0_l -= p7_l_in;
1247 tmp1_l += tmp0_l;
1248 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
1249 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
1250 p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
1251 ST_UB(p5, src);
1252 src += 16;
1253
1254 /* p4 */
1255 q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
1256 tmp0_r = p4_r_in - p5_r_in;
1257 tmp0_r += q2_r_in;
1258 tmp0_r -= p7_r_in;
1259 tmp1_r += tmp0_r;
1260 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
1261 q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
1262 tmp0_l = p4_l_in - p5_l_in;
1263 tmp0_l += q2_l_in;
1264 tmp0_l -= p7_l_in;
1265 tmp1_l += tmp0_l;
1266 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
1267 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
1268 p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
1269 ST_UB(p4, src);
1270 src += 16;
1271
1272 /* p3 */
1273 q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
1274 tmp0_r = p3_r_in - p4_r_in;
1275 tmp0_r += q3_r_in;
1276 tmp0_r -= p7_r_in;
1277 tmp1_r += tmp0_r;
1278 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
1279 q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
1280 tmp0_l = p3_l_in - p4_l_in;
1281 tmp0_l += q3_l_in;
1282 tmp0_l -= p7_l_in;
1283 tmp1_l += tmp0_l;
1284 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
1285 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
1286 p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
1287 ST_UB(p3, src);
1288 src += 16;
1289
/* For p2..q2 the fallback pixel (when flat2 is clear) is the filter8
 * result previously stored in filter48, not the raw sample. */
1290 /* p2 */
1291 q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
1292 filter8 = LD_UB(filter48);
1293 tmp0_r = p2_r_in - p3_r_in;
1294 tmp0_r += q4_r_in;
1295 tmp0_r -= p7_r_in;
1296 tmp1_r += tmp0_r;
1297 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
1298 q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
1299 tmp0_l = p2_l_in - p3_l_in;
1300 tmp0_l += q4_l_in;
1301 tmp0_l -= p7_l_in;
1302 tmp1_l += tmp0_l;
1303 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
1304 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
1305 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
1306 ST_UB(filter8, src);
1307 src += 16;
1308
1309 /* p1 */
1310 q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
1311 filter8 = LD_UB(filter48 + 16);
1312 tmp0_r = p1_r_in - p2_r_in;
1313 tmp0_r += q5_r_in;
1314 tmp0_r -= p7_r_in;
1315 tmp1_r += tmp0_r;
1316 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
1317 q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
1318 tmp0_l = p1_l_in - p2_l_in;
1319 tmp0_l += q5_l_in;
1320 tmp0_l -= p7_l_in;
1321 tmp1_l += tmp0_l;
1322 l_out = __msa_srari_h((v8i16)(tmp1_l), 4);
1323 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
1324 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
1325 ST_UB(filter8, src);
1326 src += 16;
1327
1328 /* p0 */
1329 q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
1330 filter8 = LD_UB(filter48 + 32);
1331 tmp0_r = p0_r_in - p1_r_in;
1332 tmp0_r += q6_r_in;
1333 tmp0_r -= p7_r_in;
1334 tmp1_r += tmp0_r;
1335 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
1336 q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
1337 tmp0_l = p0_l_in - p1_l_in;
1338 tmp0_l += q6_l_in;
1339 tmp0_l -= p7_l_in;
1340 tmp1_l += tmp0_l;
1341 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
1342 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
1343 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
1344 ST_UB(filter8, src);
1345 src += 16;
1346
1347 /* q0 */
1348 q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
1349 filter8 = LD_UB(filter48 + 48);
1350 tmp0_r = q7_r_in - p0_r_in;
1351 tmp0_r += q0_r_in;
1352 tmp0_r -= p7_r_in;
1353 tmp1_r += tmp0_r;
1354 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
1355 q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
1356 tmp0_l = q7_l_in - p0_l_in;
1357 tmp0_l += q0_l_in;
1358 tmp0_l -= p7_l_in;
1359 tmp1_l += tmp0_l;
1360 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
1361 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
1362 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
1363 ST_UB(filter8, src);
1364 src += 16;
1365
1366 /* q1 */
1367 filter8 = LD_UB(filter48 + 64);
1368 tmp0_r = q7_r_in - q0_r_in;
1369 tmp0_r += q1_r_in;
1370 tmp0_r -= p6_r_in;
1371 tmp1_r += tmp0_r;
1372 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
1373 tmp0_l = q7_l_in - q0_l_in;
1374 tmp0_l += q1_l_in;
1375 tmp0_l -= p6_l_in;
1376 tmp1_l += tmp0_l;
1377 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
1378 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
1379 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
1380 ST_UB(filter8, src);
1381 src += 16;
1382
1383 /* q2 */
1384 filter8 = LD_UB(filter48 + 80);
1385 tmp0_r = q7_r_in - q1_r_in;
1386 tmp0_r += q2_r_in;
1387 tmp0_r -= p5_r_in;
1388 tmp1_r += tmp0_r;
1389 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
1390 tmp0_l = q7_l_in - q1_l_in;
1391 tmp0_l += q2_l_in;
1392 tmp0_l -= p5_l_in;
1393 tmp1_l += tmp0_l;
1394 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
1395 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
1396 filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
1397 ST_UB(filter8, src);
1398 src += 16;
1399
1400 /* q3 */
1401 tmp0_r = q7_r_in - q2_r_in;
1402 tmp0_r += q3_r_in;
1403 tmp0_r -= p4_r_in;
1404 tmp1_r += tmp0_r;
1405 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
1406 tmp0_l = q7_l_in - q2_l_in;
1407 tmp0_l += q3_l_in;
1408 tmp0_l -= p4_l_in;
1409 tmp1_l += tmp0_l;
1410 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
1411 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
1412 q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
1413 ST_UB(q3, src);
1414 src += 16;
1415
1416 /* q4 */
1417 tmp0_r = q7_r_in - q3_r_in;
1418 tmp0_r += q4_r_in;
1419 tmp0_r -= p3_r_in;
1420 tmp1_r += tmp0_r;
1421 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
1422 tmp0_l = q7_l_in - q3_l_in;
1423 tmp0_l += q4_l_in;
1424 tmp0_l -= p3_l_in;
1425 tmp1_l += tmp0_l;
1426 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
1427 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
1428 q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
1429 ST_UB(q4, src);
1430 src += 16;
1431
1432 /* q5 */
1433 tmp0_r = q7_r_in - q4_r_in;
1434 tmp0_r += q5_r_in;
1435 tmp0_r -= p2_r_in;
1436 tmp1_r += tmp0_r;
1437 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
1438 tmp0_l = q7_l_in - q4_l_in;
1439 tmp0_l += q5_l_in;
1440 tmp0_l -= p2_l_in;
1441 tmp1_l += tmp0_l;
1442 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
1443 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
1444 q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
1445 ST_UB(q5, src);
1446 src += 16;
1447
1448 /* q6 */
1449 tmp0_r = q7_r_in - q5_r_in;
1450 tmp0_r += q6_r_in;
1451 tmp0_r -= p1_r_in;
1452 tmp1_r += tmp0_r;
1453 r_out = __msa_srari_h((v8i16)tmp1_r, 4);
1454 tmp0_l = q7_l_in - q5_l_in;
1455 tmp0_l += q6_l_in;
1456 tmp0_l -= p1_l_in;
1457 tmp1_l += tmp0_l;
1458 l_out = __msa_srari_h((v8i16)tmp1_l, 4);
1459 r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
1460 q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
1461 ST_UB(q6, src);
1462
1463 return 0;
1464 }
1465 }
1466
vpx_lpf_vertical_16_dual_msa(uint8_t * src,int32_t pitch,const uint8_t * b_limit_ptr,const uint8_t * limit_ptr,const uint8_t * thresh_ptr)1467 void vpx_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch,
1468 const uint8_t *b_limit_ptr,
1469 const uint8_t *limit_ptr,
1470 const uint8_t *thresh_ptr) {
1471 uint8_t early_exit = 0;
1472 DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
1473 uint8_t *filter48 = &transposed_input[16 * 16];
1474
1475 transpose_16x16((src - 8), pitch, &transposed_input[0], 16);
1476
1477 early_exit =
1478 vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src,
1479 pitch, b_limit_ptr, limit_ptr, thresh_ptr);
1480
1481 if (0 == early_exit) {
1482 early_exit =
1483 vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, &filter48[0]);
1484
1485 if (0 == early_exit) {
1486 transpose_16x16(transposed_input, 16, (src - 8), pitch);
1487 }
1488 }
1489 }
1490